HashTable（二）字符串Hash函数（转载）

字符串hash函数非常多，常见的主要有Simple_hash, RS_hash, JS_hash, PJW_hash, ELF_hash, BKDR_hash, SDBM_hash, DJB_hash, AP_hash, CRC_hash等。

评估hash函数优劣的基准主要有以下两个指标：

(1) 散列分布性：即桶(list)的使用率backet_usage = (已使用桶数) / (总的桶数)，这个比例越高，说明分布性良好，是好的hash设计。

(2) 平均桶长：即avg_backet_len，所有已使用桶的平均长度。理想状态下这个值应该=1，越小说明冲突发生地越少，是好的hash设计。

评估方案设计是这样的：

(1) 以200M的视频文件作为输入源，以4KB的块为大小计算MD5值，并以此作为hash关键字;或者随机生成1000个字符串，每个字符串的长度均为10。将这1000个字符串写入一个文件test.txt。

(2) 分别应用上面提到的各种字符串hash函数，进行hash散列模拟；

(3) 统计结果，用散列分布性和平均桶长两个指标进行评估分析。

#include "stdafx.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <ctype.h>

#include <time.h>

#define STRINGSIZE 10

#define STRINGCOUNT 1000

int _tmain(int argc, _TCHAR* argv[]){

FILE *fp1; //定义文件流指针，用于打开读取的文件

char text[10]; //定义一个字符串数组，用于存储读取的字符

int i=0,j=0,lstr;

char *str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

lstr = strlen(str);//计算字符串长度

fp1 = fopen("d:\\test.txt","r+");//只读写方式打开文件a.txt

srand((unsigned int)time((time_t *)NULL));//使用系统时间来初始化随机数发生器

for(j=0;j<STRINGCOUNT;j++){

for(i = 0; i < STRINGSIZE-2; i++){ //按指定大小返回相应的字符串

text[i]=str[(rand()%lstr)];

}

text[i++]='\n';

text[i]='\0';

fputs(text,fp1);//将内容写到fp1所指向文件中

}

fclose(fp1);//关闭文件test.txt，有打开就要有关闭

return 0;

}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

unsigned int simple_hash(const std::string&str){

for(std::size_t i=0;i<str.length();i++)

hash = 31 * hash + str[i];

return (hash & 0x7FFFFFFF);

}

/*End of Simple Hash Function*/

unsigned int RSHash(const std::string& str){

unsigned int b = 378551;

unsigned int a = 63689;

unsigned int hash = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = hash * a + str[i];

a = a * b;

}

return hash;

}

/* End Of RS Hash Function */

unsigned int JSHash(const std::string& str){

unsigned int hash = 1315423911;

for(std::size_t i = 0; i < str.length(); i++)

{

hash ^= ((hash << 5) + str[i] + (hash >> 2));

}

return hash;

}

/* End Of JS Hash Function */

unsigned int PJWHash(const std::string& str){

unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);

unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4);

unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8);

unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);

unsigned int hash = 0;

unsigned int test = 0;

for(std::size_t i = 0; i < str.length(); i++){

hash = (hash << OneEighth) + str[i];

if((test = hash & HighBits) != 0){

hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));

}

return hash;

}

/* End Of P. J. Weinberger Hash Function */

unsigned int ELFHash(const std::string& str)

{

unsigned int hash = 0;

unsigned int x = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = (hash << 4) + str[i];

if((x = hash & 0xF0000000L) != 0)

{

hash ^= (x >> 24);

}

hash &= ~x;

}

return hash;

}

/* End Of ELF Hash Function */

unsigned int BKDRHash(const std::string& str)

{

unsigned int seed = 131; // 31 131 1313 13131 131313 etc..

unsigned int hash = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = (hash * seed) + str[i];

}

return hash;

}

/* End Of BKDR Hash Function */

//我们就选用这种hash函数嘞

unsigned int SDBMHash(const std::string& str)

{

unsigned int hash = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = str[i] + (hash << 6) + (hash << 16) - hash;

}

return hash;

}

/* End Of SDBM Hash Function */

unsigned int DJBHash(const std::string& str)

{

unsigned int hash = 5381;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = ((hash << 5) + hash) + str[i];

}

return hash;

}

/* End Of DJB Hash Function */

unsigned int DEKHash(const std::string& str)

{

unsigned int hash = static_cast<unsigned int>(str.length());

for(std::size_t i = 0; i < str.length(); i++)

{

hash = ((hash << 5) ^ (hash >> 27)) ^ str[i];

}

return hash;

}

/* End Of DEK Hash Function */

unsigned int BPHash(const std::string& str)

{

unsigned int hash = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash = hash << 7 ^ str[i];

}

return hash;

}

/* End Of BP Hash Function */

unsigned int FNVHash(const std::string& str)

{

const unsigned int fnv_prime = 0x811C9DC5;

unsigned int hash = 0;

for(std::size_t i = 0; i < str.length(); i++)

{

hash *= fnv_prime;

hash ^= str[i];

}

return hash;

}

/* End Of FNV Hash Function */

unsigned int APHash(const std::string& str)

{

unsigned int hash = 0xAAAAAAAA;

for(std::size_t i = 0; i < str.length(); i++)

{

hash ^= ((i & 1) == 0) ? ( (hash << 7) ^ str[i] * (hash >> 3)) :

(~((hash << 11) + (str[i] ^ (hash >> 5))));

}

return hash;

}

/* End Of AP Hash Function */

/* CRC Hash Function */

unsigned int CRC_hash(char *str){

unsigned int nleft = strlen(str);

unsigned long long sum = 0;

unsigned short int *w = (unsigned short int *)str;

unsigned short int answer = 0;

* Our algorithm is simple, using a 32 bit accumulator (sum), we add

* sequential 16 bit words to it, and at the end, fold back all the

* carry bits from the top 16 bits into the lower 16 bits.

while ( nleft > 1 ) {

sum += *w++;

nleft -= 2;

}

* mop up an odd byte, if necessary

if ( 1 == nleft ) {

*( unsigned char * )( &answer ) = *( unsigned char * )w ;

sum += answer;

}

* add back carry outs from top 16 bits to low 16 bits

* add hi 16 to low 16

sum = ( sum >> 16 ) + ( sum & 0xFFFF );

/* add carry */

sum += ( sum >> 16 );

/* truncate to 16 bits */

answer = ~sum;

return (answer & 0xFFFFFFFF);

}

实验的结果如下：

表格中字符串的解释，参见如下注释：

printf("bucket_len = %d\n", pHashTable->mBucketLen); ///哈希表的桶的个数

printf("hit_count = %d\n", hit_count); ///建立hash表的不重复的元素的个数

printf("buket conflict count = %d\n", conflict_count); ///冲突的桶的个数

printf("longest hash entry = %d\n", max_link); ///最长的链的长度

printf("average hash entry length = %.2f\n", avg_link); ///链表的平均长度

printf("backet usage = %.2f%\n", backet_usage); ///hash table的桶的使用率

以上实验结果使用的装填因子是1，装填因子更小些，更能评估不同hash函数散列结果的好坏。

参考资料：

1）http://www.cnblogs.com/luxiaoxun/archive/2012/08/06/2625659.html

2）http://blog.csdn.net/liuaigui/article/details/5050697#reply

2）http://blog.csdn.net/love__coder/article/details/8274886

3）http://blog.csdn.net/sangyongjia/article/details/37312851

附录：有bug，有待调试

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

#include "stdafx.h"

#include <stdio.h>

#include <stdlib.h>

///do not know where use those head file

///#include <sys/types.h>

///#include <sys/stat.h>

///#include <fcntl.h>

///#include <errno.h>

#include <string.h>

#include "hashTest.h"

#include <iostream>

using namespace std;

//#include "md5.h"

#define STRING_LEN 255

///one atom of the chain when build the hash table

struct AtomOfBucketChain {

unsigned char *pKey;

struct AtomOfBucketChain *pNext;

};

struct ChainOfHashTable {

unsigned int mHitCount;

unsigned int mEntryCount;

struct AtomOfBucketChain *pKeys;

};

struct HashTable {

unsigned int mBucketLen;

struct ChainOfHashTable *pTable;

};

unsigned int (*pHashFunc)(char *str);

///choose which hash function to be used

void chooseHashFunc( char *pHashFuncName)

{

if (0 == strcmp(pHashFuncName, "simple_hash"))

pHashFunc = simple_hash;

else if (0 == strcmp(pHashFuncName, "RS_hash"))

pHashFunc = RS_hash;

else if (0 == strcmp(pHashFuncName, "JS_hash"))

pHashFunc = JS_hash;

else if (0 == strcmp(pHashFuncName, "PJW_hash"))

pHashFunc = PJW_hash;

else if (0 == strcmp(pHashFuncName, "ELF_hash"))

pHashFunc = ELF_hash;

else if (0 == strcmp(pHashFuncName, "BKDR_hash"))

pHashFunc = BKDR_hash;

else if (0 == strcmp(pHashFuncName, "SDBM_hash"))

pHashFunc = SDBM_hash;

else if (0 == strcmp(pHashFuncName, "DJB_hash"))

pHashFunc = DJB_hash;

else if (0 == strcmp(pHashFuncName, "AP_hash"))

pHashFunc = AP_hash;

// else if (0 == strcmp(pHashFuncName, "CRC_hash"))

// pHashFunc = CRC_hash;

else

pHashFunc = NULL;

}

///build the hash table

void buildHashTable(char *pKey, struct HashTable *pHashTable)

{

unsigned int mHashValue = pHashFunc(pKey) % pHashTable->mBucketLen;

struct AtomOfBucketChain *p=NULL;

p = pHashTable->pTable[mHashValue].pKeys;

while(p)

{

if (0 == strcmp(pKey, (const char*)p->pKey))

{

break;

}

p = p->pNext;

}

if (p == NULL)

{

p = (struct AtomOfBucketChain *)malloc(sizeof(struct AtomOfBucketChain));

if (p == NULL)

{

printf("malloc in buildHashTable filled");

return ;///must have 'return',否则失败也不会停止。

}

p->pKey = strdup(pKey);

p->pNext = pHashTable->pTable[mHashValue].pKeys;

pHashTable->pTable[mHashValue].pKeys = p;

pHashTable->pTable[mHashValue].mEntryCount++;

}

pHashTable->pTable[mHashValue].mHitCount++;

}

///initial hash table

void hashTableInit(struct HashTable *pHashTable)

{

unsigned int i;

if ((NULL == pHashTable) || (NULL==pHashTable->pTable))

{

printf("hashTableInit: malloc pHashTable or pTable failed");

return;

}

for (i = 0; i < pHashTable->mBucketLen; i++)

{

pHashTable->pTable[i].mHitCount=0;

pHashTable->pTable[i].mEntryCount=0;

pHashTable->pTable[i].pKeys=NULL;

}

///free space hash table used

void freeHashTable(struct HashTable *pHashTable)

{

unsigned int i;

struct AtomOfBucketChain *pFront, *pBack;

if ((NULL == pHashTable) || (NULL==pHashTable->pTable))

{

printf("hash table has been free");

return;

}

for (i = 0; i < pHashTable->mBucketLen; i++)

{

pFront = pHashTable->pTable[i].pKeys;

while(pFront)

{

pBack = pFront->pNext;

if (pFront->pKey) free(pFront->pKey);

free(pFront);

pFront = pBack;

}

free(pHashTable->pTable);

}

///显示统计结果

void showTestsResult(struct HashTable *pHashTable)

{

int backet = 0, sum = 0;

unsigned i=0, max_link=0;

int conflict_count = 0, hit_count = 0;

double avg_link, backet_usage;

for(i = 0; i < pHashTable->mBucketLen; i++)

{

if (pHashTable->pTable[i].mHitCount > 0)

{

backet++;

sum += pHashTable->pTable[i].mEntryCount;

if (pHashTable->pTable[i].mEntryCount > max_link)

{

max_link = pHashTable->pTable[i].mEntryCount;

}

if (pHashTable->pTable[i].mEntryCount > 1)

{

conflict_count++;

}

hit_count += pHashTable->pTable[i].mHitCount;

}

backet_usage = backet/1.0/pHashTable->mBucketLen * 100;

avg_link = sum/1.0/backet;

printf("bucket_len = %d\n", pHashTable->mBucketLen); ///哈希表的桶的个数

/// printf("hash_call_count = %d/n", hash_call_count); ///建立hash表的字符串的个数

printf("hit_count = %d\n", hit_count); ///建立hash表的不重复的元素的个数

printf("buket conflict count = %d\n", conflict_count); ///冲突的桶的个数

printf("longest hash entry = %d\n", max_link); ///最长的链的长度

printf("average hash entry length = %.2f\n", avg_link); ///链表的平均长度

printf("backet usage = %.2f%\n", backet_usage); ///hash table的桶的使用率

}

void usage()

{

printf("Usage: hash_func_name [backet_len]\n");

printf("hash_func_name:\n");

printf("/tsimple_hash\n");

printf("/tRS_hash\n");

printf("/tJS_hash\n");

printf("/tPJW_hash\n");

printf("/tELF_hash\n");

printf("/tBKDR_hash\n");

printf("/tSDBM_hash\n");

printf("/tDJB_hash\n");

printf("/tAP_hash\n");

// printf("/tCRC_hash\n");

}

int main(int argc, char *argv[])

{

FILE *fp;

int mStringCount=0;

unsigned char pKey[10];

struct HashTable *pHashTable=NULL;

///参数输入

char hashfunctionname[10],bucketcount[10];

printf("input hashfunctionname\n");

gets(hashfunctionname);

printf("input bucketcount\n");

gets(bucketcount);

pHashTable=(struct HashTable*)malloc(sizeof(struct HashTable));

if(NULL==pHashTable)

{

printf("malloc hash table filled");

return -1;

}

if (argc<=1)

{

usage();

return -1;

}

if (2==argc)

{

usage();

}

// pHashTable->mBucketLen = atoi(argv[1]);

pHashTable->mBucketLen = atoi(bucketcount);

pHashTable->pTable=(struct ChainOfHashTable*)malloc(sizeof(struct ChainOfHashTable) * pHashTable->mBucketLen);

if (!(fp = fopen("d:\\test.txt", "r"))) ///假设文件已经生成，需要补充自动生成字符串的函数。将生成的字符串保存在一个文件中。

{

printf("open source file filled");

return -1;

}

hashTableInit(pHashTable);

//chooseHashFunc(argv[0]);

chooseHashFunc(hashfunctionname);

while(fgets(pKey,10,fp)!=NULL)//逐行读取fp1所指向文件中的内容到text中

{

mStringCount++;

buildHashTable(pKey,pHashTable);

}

fclose(fp);

showTestsResult(pHashTable);

printf("String Count: %d",mStringCount); ///建立hash表的字符串的个数

freeHashTable(pHashTable);

return 0;

}