字符串hash函数非常多,常见的主要有Simple_hash, RS_hash, JS_hash, PJW_hash, ELF_hash, BKDR_hash, SDBM_hash, DJB_hash, AP_hash, CRC_hash等。
评估hash函数优劣的基准主要有以下两个指标:
(1) 散列分布性:即桶(list)的使用率backet_usage = (已使用桶数) / (总的桶数),这个比例越高,说明分布性良好,是好的hash设计。
(2) 平均桶长:即avg_backet_len,所有已使用桶的平均长度。理想状态下这个值应该=1,越小说明冲突发生地越少,是好的hash设计。
评估方案设计是这样的:
(1) 以200M的视频文件作为输入源,以4KB的块为大小计算MD5值,并以此作为hash关键字;或者随机生成1000个字符串,每个字符串的长度均为10。将这1000个 字符串写入一个文件test.txt。
(2) 分别应用上面提到的各种字符串hash函数,进行hash散列模拟;
(3) 统计结果,用散列分布性和平均桶长两个指标进行评估分析。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
#include "stdafx.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <time.h> #define STRINGSIZE 10 #define STRINGCOUNT 1000 int _tmain(int argc, _TCHAR* argv[]){ FILE *fp1; //定义文件流指针,用于打开读取的文件 char text[10]; //定义一个字符串数组,用于存储读取的字符 int i=0,j=0,lstr; char *str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; lstr = strlen(str);//计算字符串长度 fp1 = fopen("d:\\test.txt","r+");//只读写方式打开文件a.txt srand((unsigned int)time((time_t *)NULL));//使用系统时间来初始化随机数发生器 for(j=0;j<STRINGCOUNT;j++){ for(i = 0; i < STRINGSIZE-2; i++){ //按指定大小返回相应的字符串 text[i]=str[(rand()%lstr)]; } text[i++]='\n'; text[i]='\0'; fputs(text,fp1);//将内容写到fp1所指向文件中 } fclose(fp1);//关闭文件test.txt,有打开就要有关闭 return 0; } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
unsigned int simple_hash(const std::string&str){ register unsigned int hash=0;//register早期机器用 register unsigned char *p; for(std::size_t i=0;i<str.length();i++) hash = 31 * hash + str[i]; return (hash & 0x7FFFFFFF); } /*End of Simple Hash Function*/ unsigned int RSHash(const std::string& str){ unsigned int b = 378551; unsigned int a = 63689; unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = hash * a + str[i]; a = a * b; } return hash; } /* End Of RS Hash Function */ unsigned int JSHash(const std::string& str){ unsigned int hash = 1315423911; for(std::size_t i = 0; i < str.length(); i++) { hash ^= ((hash << 5) + str[i] + (hash >> 2)); } return hash; } /* End Of JS Hash Function */ unsigned int PJWHash(const std::string& str){ unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); unsigned int hash = 0; unsigned int test = 0; for(std::size_t i = 0; i < str.length(); i++){ hash = (hash << OneEighth) + str[i]; if((test = hash & HighBits) != 0){ hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); } } return hash; } /* End Of P. J. Weinberger Hash Function */ unsigned int ELFHash(const std::string& str) { unsigned int hash = 0; unsigned int x = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = (hash << 4) + str[i]; if((x = hash & 0xF0000000L) != 0) { hash ^= (x >> 24); } hash &= ~x; } return hash; } /* End Of ELF Hash Function */ unsigned int BKDRHash(const std::string& str) { unsigned int seed = 131; // 31 131 1313 13131 131313 etc.. unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = (hash * seed) + str[i]; } return hash; } /* End Of BKDR Hash Function */ //我们就选用这种hash函数嘞 unsigned int SDBMHash(const std::string& str) { unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = str[i] + (hash << 6) + (hash << 16) - hash; } return hash; } /* End Of SDBM Hash Function */ unsigned int DJBHash(const std::string& str) { unsigned int hash = 5381; for(std::size_t i = 0; i < str.length(); i++) { hash = ((hash << 5) + hash) + str[i]; } return hash; } /* End Of DJB Hash Function */ unsigned int DEKHash(const std::string& str) { unsigned int hash = static_cast<unsigned int>(str.length()); for(std::size_t i = 0; i < str.length(); i++) { hash = ((hash << 5) ^ (hash >> 27)) ^ str[i]; } return hash; } /* End Of DEK Hash Function */ unsigned int BPHash(const std::string& str) { unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash = hash << 7 ^ str[i]; } return hash; } /* End Of BP Hash Function */ unsigned int FNVHash(const std::string& str) { const unsigned int fnv_prime = 0x811C9DC5; unsigned int hash = 0; for(std::size_t i = 0; i < str.length(); i++) { hash *= fnv_prime; hash ^= str[i]; } return hash; } /* End Of FNV Hash Function */ unsigned int APHash(const std::string& str) { unsigned int hash = 0xAAAAAAAA; for(std::size_t i = 0; i < str.length(); i++) { hash ^= ((i & 1) == 0) ? ( (hash << 7) ^ str[i] * (hash >> 3)) : (~((hash << 11) + (str[i] ^ (hash >> 5)))); } return hash; } /* End Of AP Hash Function */ /* CRC Hash Function */ unsigned int CRC_hash(char *str){ unsigned int nleft = strlen(str); unsigned long long sum = 0; unsigned short int *w = (unsigned short int *)str; unsigned short int answer = 0; /* * Our algorithm is simple, using a 32 bit accumulator (sum), we add * sequential 16 bit words to it, and at the end, fold back all the * carry bits from the top 16 bits into the lower 16 bits. */ while ( nleft > 1 ) { sum += *w++; nleft -= 2; } /* * mop up an odd byte, if necessary */ if ( 1 == nleft ) { *( unsigned char * )( &answer ) = *( unsigned char * )w ; sum += answer; } /* * add back carry outs from top 16 bits to low 16 bits * add hi 16 to low 16 */ sum = ( sum >> 16 ) + ( sum & 0xFFFF ); /* add carry */ sum += ( sum >> 16 ); /* truncate to 16 bits */ answer = ~sum; return (answer & 0xFFFFFFFF); } |
实验的结果如下:
表格中字符串的解释,参见如下注释:
1 2 3 4 5 6 |
printf("bucket_len = %d\n", pHashTable->mBucketLen); ///哈希表的桶的个数 printf("hit_count = %d\n", hit_count); ///建立hash表的不重复的元素的个数 printf("buket conflict count = %d\n", conflict_count); ///冲突的桶的个数 printf("longest hash entry = %d\n", max_link); ///最长的链的长度 printf("average hash entry length = %.2f\n", avg_link); ///链表的平均长度 printf("backet usage = %.2f%\n", backet_usage); ///hash table的桶的使用率 |
以上实验结果使用的装填因子是1,装填因子更小些,更能评估不同hash函数散列结果的好坏。
参考资料:
1)http://www.cnblogs.com/luxiaoxun/archive/2012/08/06/2625659.html
2)http://blog.csdn.net/liuaigui/article/details/5050697#reply
2)http://blog.csdn.net/love__coder/article/details/8274886
3)http://blog.csdn.net/sangyongjia/article/details/37312851
附录:有bug,有待调试
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
#include "stdafx.h" #include <stdio.h> #include <stdlib.h> ///do not know where use those head file ///#include <sys/types.h> ///#include <sys/stat.h> ///#include <fcntl.h> ///#include <errno.h> #include <string.h> #include "hashTest.h" #include <iostream> using namespace std; //#include "md5.h" #define STRING_LEN 255 ///one atom of the chain when build the hash table struct AtomOfBucketChain { unsigned char *pKey; struct AtomOfBucketChain *pNext; }; struct ChainOfHashTable { unsigned int mHitCount; unsigned int mEntryCount; struct AtomOfBucketChain *pKeys; }; struct HashTable { unsigned int mBucketLen; struct ChainOfHashTable *pTable; }; unsigned int (*pHashFunc)(char *str); ///choose which hash function to be used void chooseHashFunc( char *pHashFuncName) { if (0 == strcmp(pHashFuncName, "simple_hash")) pHashFunc = simple_hash; else if (0 == strcmp(pHashFuncName, "RS_hash")) pHashFunc = RS_hash; else if (0 == strcmp(pHashFuncName, "JS_hash")) pHashFunc = JS_hash; else if (0 == strcmp(pHashFuncName, "PJW_hash")) pHashFunc = PJW_hash; else if (0 == strcmp(pHashFuncName, "ELF_hash")) pHashFunc = ELF_hash; else if (0 == strcmp(pHashFuncName, "BKDR_hash")) pHashFunc = BKDR_hash; else if (0 == strcmp(pHashFuncName, "SDBM_hash")) pHashFunc = SDBM_hash; else if (0 == strcmp(pHashFuncName, "DJB_hash")) pHashFunc = DJB_hash; else if (0 == strcmp(pHashFuncName, "AP_hash")) pHashFunc = AP_hash; // else if (0 == strcmp(pHashFuncName, "CRC_hash")) // pHashFunc = CRC_hash; else pHashFunc = NULL; } ///build the hash table void buildHashTable(char *pKey, struct HashTable *pHashTable) { unsigned int mHashValue = pHashFunc(pKey) % pHashTable->mBucketLen; struct AtomOfBucketChain *p=NULL; p = pHashTable->pTable[mHashValue].pKeys; while(p) { if (0 == strcmp(pKey, (const char*)p->pKey)) { break; } p = p->pNext; } if (p == NULL) { p = (struct AtomOfBucketChain *)malloc(sizeof(struct AtomOfBucketChain)); if (p == NULL) { printf("malloc in buildHashTable filled"); return ;///must have 'return',否则失败也不会停止。 } p->pKey = strdup(pKey); p->pNext = pHashTable->pTable[mHashValue].pKeys; pHashTable->pTable[mHashValue].pKeys = p; pHashTable->pTable[mHashValue].mEntryCount++; } pHashTable->pTable[mHashValue].mHitCount++; } ///initial hash table void hashTableInit(struct HashTable *pHashTable) { unsigned int i; if ((NULL == pHashTable) || (NULL==pHashTable->pTable)) { printf("hashTableInit: malloc pHashTable or pTable failed"); return; } for (i = 0; i < pHashTable->mBucketLen; i++) { pHashTable->pTable[i].mHitCount=0; pHashTable->pTable[i].mEntryCount=0; pHashTable->pTable[i].pKeys=NULL; } } ///free space hash table used void freeHashTable(struct HashTable *pHashTable) { unsigned int i; struct AtomOfBucketChain *pFront, *pBack; if ((NULL == pHashTable) || (NULL==pHashTable->pTable)) { printf("hash table has been free"); return; } for (i = 0; i < pHashTable->mBucketLen; i++) { pFront = pHashTable->pTable[i].pKeys; while(pFront) { pBack = pFront->pNext; if (pFront->pKey) free(pFront->pKey); free(pFront); pFront = pBack; } } free(pHashTable->pTable); } ///显示统计结果 void showTestsResult(struct HashTable *pHashTable) { int backet = 0, sum = 0; unsigned i=0, max_link=0; int conflict_count = 0, hit_count = 0; double avg_link, backet_usage; for(i = 0; i < pHashTable->mBucketLen; i++) { if (pHashTable->pTable[i].mHitCount > 0) { backet++; sum += pHashTable->pTable[i].mEntryCount; if (pHashTable->pTable[i].mEntryCount > max_link) { max_link = pHashTable->pTable[i].mEntryCount; } if (pHashTable->pTable[i].mEntryCount > 1) { conflict_count++; } hit_count += pHashTable->pTable[i].mHitCount; } } backet_usage = backet/1.0/pHashTable->mBucketLen * 100; avg_link = sum/1.0/backet; printf("bucket_len = %d\n", pHashTable->mBucketLen); ///哈希表的桶的个数 /// printf("hash_call_count = %d/n", hash_call_count); ///建立hash表的字符串的个数 printf("hit_count = %d\n", hit_count); ///建立hash表的不重复的元素的个数 printf("buket conflict count = %d\n", conflict_count); ///冲突的桶的个数 printf("longest hash entry = %d\n", max_link); ///最长的链的长度 printf("average hash entry length = %.2f\n", avg_link); ///链表的平均长度 printf("backet usage = %.2f%\n", backet_usage); ///hash table的桶的使用率 } // void usage() { printf("Usage: hash_func_name [backet_len]\n"); printf("hash_func_name:\n"); printf("/tsimple_hash\n"); printf("/tRS_hash\n"); printf("/tJS_hash\n"); printf("/tPJW_hash\n"); printf("/tELF_hash\n"); printf("/tBKDR_hash\n"); printf("/tSDBM_hash\n"); printf("/tDJB_hash\n"); printf("/tAP_hash\n"); // printf("/tCRC_hash\n"); } int main(int argc, char *argv[]) { FILE *fp; int mStringCount=0; unsigned char pKey[10]; struct HashTable *pHashTable=NULL; ///参数输入 char hashfunctionname[10],bucketcount[10]; printf("input hashfunctionname\n"); gets(hashfunctionname); printf("input bucketcount\n"); gets(bucketcount); pHashTable=(struct HashTable*)malloc(sizeof(struct HashTable)); if(NULL==pHashTable) { printf("malloc hash table filled"); return -1; } /* if (argc<=1) { usage(); return -1; } if (2==argc) { usage(); } */ // pHashTable->mBucketLen = atoi(argv[1]); pHashTable->mBucketLen = atoi(bucketcount); pHashTable->pTable=(struct ChainOfHashTable*)malloc(sizeof(struct ChainOfHashTable) * pHashTable->mBucketLen); if (!(fp = fopen("d:\\test.txt", "r"))) ///假设文件已经生成,需要补充自动生成字符串的函数。将生成的字符串保存在一个文件中。 { printf("open source file filled"); return -1; } hashTableInit(pHashTable); //chooseHashFunc(argv[0]); chooseHashFunc(hashfunctionname); while(fgets(pKey,10,fp)!=NULL)//逐行读取fp1所指向文件中的内容到text中 { mStringCount++; buildHashTable(pKey,pHashTable); } fclose(fp); showTestsResult(pHashTable); printf("String Count: %d",mStringCount); ///建立hash表的字符串的个数 freeHashTable(pHashTable); return 0; } |