一个很纠结的问题—— 字符串加密 byte数组 Hash 散列算法问题，其实提到这个或许很多人和我一样会感到头痛...这里劳烦大神给讲解讲解。。。

mrsupersky 2012-02-05 05:08:51

RT
Hash 算法
暂且抛开MD5等比较大的算法不提
打个比方
在网上搜到的一些方法如下：





2.1 PHP中出现的字符串Hash函数



/*
 * PJW-style string hash over the first nKeyLength bytes of arKey.
 *
 * Every byte is shifted into the accumulator four bits at a time. Whenever
 * any of bits 28..31 become set, they are XOR-ed back into bits 4..7 and
 * then cleared.
 *
 * arKey      - bytes to hash (does not have to be NUL-terminated)
 * nKeyLength - number of bytes to read from arKey
 *
 * Returns the accumulated hash; 0 when nKeyLength is 0.
 */
static unsigned long hashpjw(char *arKey, unsigned int nKeyLength)
{
    unsigned long h = 0;
    unsigned long g;
    char *arEnd = arKey + nKeyLength;

    while (arKey < arEnd) {
        /*
         * Read the byte as unsigned char: plain char may be signed, and a
         * negative value would be sign-extended into the high bits of h,
         * making the result depend on the platform's char signedness.
         */
        h = (h << 4) + (unsigned char)*arKey++;

        g = h & 0xF0000000UL;
        if (g != 0) {
            h ^= g >> 24;   /* fold the top nibble into bits 4..7 */
            h ^= g;         /* clear the top nibble */
        }
    }

    return h;
}



2.2 OpenSSL中出现的字符串Hash函数



/*
 * Hash a NUL-terminated string by XOR-ing it together one
 * unsigned-short-sized word at a time.
 *
 * The string is read as (strlen(str) + 1) / 2 consecutive words in host
 * byte order. Word i is shifted left by (i & 0x0f) bits before being
 * XOR-ed into the result.
 *
 * str - string to hash; may be NULL
 *
 * Returns the hash, or 0 when str is NULL or empty.
 *
 * NOTE(review): the pasted original had a truncated loop header
 * ("for (i=0; i") and a stray comment terminator after the closing brace;
 * both are repaired here. The stray terminator suggests this variant may
 * have been commented out where it was copied from -- confirm.
 */
unsigned long lh_strhash(char *str)
{
    unsigned long ret = 0;
    size_t words;
    size_t i;

    if (str == NULL)
        return 0;

    words = (strlen(str) + 1) / 2;

    for (i = 0; i < words; i++) {
        unsigned short w;

        /*
         * Copy the word out instead of dereferencing an unsigned short
         * pointer: the string carries no alignment guarantee.
         */
        memcpy(&w, str + i * sizeof w, sizeof w);
        ret ^= (unsigned long)w << (i & 0x0f);
    }

    return ret;
}









/* The following hash seems to work very well on normal text strings
 * no collisions on /usr/dict/words and it distributes on %2^n quite
 * well, not as good as MD5, but still good.
 *
 * c - NUL-terminated string to hash; may be NULL
 *
 * Returns the hash, or 0 when c is NULL or empty.
 */
unsigned long lh_strhash(const char *c)
{
    unsigned long ret = 0;
    unsigned long n;
    unsigned long v;
    int r;

    if ((c == NULL) || (*c == '\0'))
        return ret;

    /*
     * Disabled alternative kept from the original text: use the first four
     * bytes of the MD5 digest of the string.
     *
     *   unsigned char b[16];
     *   MD5(c,strlen(c),b);
     *   return(b[0]|(b[1]<<8)|(b[2]<<16)|(b[3]<<24));
     */

    n = 0x100;
    for (; *c != '\0'; c++) {
        /* Combine a per-position counter with the byte, read as unsigned
         * so the result does not depend on plain char signedness. */
        v = n | (unsigned char)*c;
        n += 0x100;

        /* Rotation amount in the range 0..15, derived from v. */
        r = (int)(((v >> 2) ^ v) & 0x0f);

        /*
         * Rotate-left step by r bits; the mask below keeps the low 32 bits.
         * The right shift is done as two steps so the shift count stays
         * below the width of unsigned long even when r == 0 and
         * unsigned long is 32 bits wide.
         */
        ret = (ret << r) | ((ret >> 1) >> (31 - r));
        ret &= 0xFFFFFFFFUL;

        ret ^= v * v;
    }

    /* Mix the upper half into the lower half. */
    return (ret >> 16) ^ ret;
}



在下面的测量过程中我们分别将上面的两个函数标记为OpenSSL_Hash1和OpenSSL_Hash2，至于上面的实现中使用MD5算法的实现函数我们不作测试。



2.3 MySql中出现的字符串Hash函数



#ifndef NEW_HASH_FUNCTION









/*
 * Calculate the hash value for a key.
 *
 * key    - bytes to hash
 * length - number of bytes to read from key
 *
 * Starts from 1 and, for every byte, XORs in
 * ((hash & 63) + addend) * byte + (hash << 8), where addend starts at 4
 * and grows by 3 per byte. Returns 1 when length is 0.
 */
static uint calc_hashnr(const byte *key,uint length)
{
  uint hash_val= 1;
  uint addend= 4;
  uint remaining;

  for (remaining= length; remaining != 0; remaining--)
  {
    /* Widen the byte through uchar so it is never negative. */
    const uint ch= (uint) (uchar) *key++;

    hash_val^= (((hash_val & 63) + addend) * ch) + (hash_val << 8);
    addend+= 3;
  }
  return hash_val;
}









/*
 * Calculate the hash value for a key, case-independently.
 *
 * Same mixing as calc_hashnr() in this branch, except every byte is passed
 * through toupper() first, so keys that differ only in letter case hash to
 * the same value.
 *
 * key    - bytes to hash
 * length - number of bytes to read from key
 *
 * Returns the hash; 1 when length is 0.
 */
static uint calc_hashnr_caseup(const byte *key,uint length)
{
  uint nr=1, nr2=4;

  while (length--)
  {
    /*
     * Cast to uchar before calling toupper(): its argument must be
     * representable as unsigned char (or be EOF). `byte` is declared
     * elsewhere; if it is a signed char type, a byte >= 0x80 would
     * otherwise be passed as a negative value.
     */
    nr^= (((nr & 63)+nr2)*((uint) (uchar) toupper((uchar) *key++)))+ (nr << 8);
    nr2+=3;
  }
  return((uint) nr);
}









#else









/*



* Fowler/Noll/Vo hash



*



* The basis of the hash algorithm was taken from an idea sent by email to the



* IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and



* Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com)



* later improved on their algorithm.



*



* The magic is in the interesting relationship between the special prime



* 16777619 (2^24 + 403) and 2^32 and 2^8.



*



* This hash produces the fewest collisions of any function that we've seen so



* far, and works well on both numbers and strings.



*/









/*
 * Multiplicative/XOR hash of a key using the prime 16777619.
 *
 * key - bytes to hash
 * len - number of bytes to read from key
 *
 * For each byte the accumulator is multiplied by 16777619 and then XOR-ed
 * with the byte value. The accumulator starts at 0, so the result is 0
 * when len is 0.
 */
uint calc_hashnr(const byte *key, uint len)
{
  const byte *stop= key + len;
  uint acc= 0;

  while (key < stop)
  {
    acc= (acc * 16777619) ^ (uint) *(uchar*) key;
    key++;
  }
  return acc;
}









/*
 * Case-independent variant of the multiplicative/XOR hash using the
 * prime 16777619.
 *
 * key - bytes to hash
 * len - number of bytes to read from key
 *
 * Each byte is upper-cased with toupper() before being XOR-ed into the
 * accumulator. The accumulator starts at 0, so the result is 0 when len
 * is 0.
 */
uint calc_hashnr_caseup(const byte *key, uint len)
{
  const byte *end=key+len;
  uint hash;

  for (hash = 0; key < end; key++)
  {
    hash *= 16777619;
    /*
     * Cast to uchar before calling toupper(): its argument must be
     * representable as unsigned char (or be EOF). `byte` is declared
     * elsewhere; if it is a signed char type, a byte >= 0x80 would
     * otherwise be passed as a negative value.
     */
    hash ^= (uint) (uchar) toupper((uchar) *key);
  }
  return (hash);
}









#endif



Mysql中对字符串Hash函数还区分了大小写，我们的测试中使用不区分大小写的字符串Hash函数，另外我们将上面的两个函数分别记为MYSQL_Hash1和MYSQL_Hash2。



2.4 另一个经典字符串Hash函数



/*
 * Multiply-by-31 string hash.
 *
 * str - NUL-terminated string to hash
 *
 * Walks the string as unsigned bytes and computes h = 31 * h + byte for
 * each one, starting from 0. Arithmetic wraps in unsigned int. Returns 0
 * for the empty string.
 */
unsigned int hash(char *str)
{
    const unsigned char *cursor = (const unsigned char *)str;
    unsigned int acc = 0;

    while (*cursor != '\0') {
        acc = acc * 31 + *cursor;
        cursor++;
    }

    return acc;
}

这些都是网上的，或许很多都是身经百战的老算法了

但是我一直都弄不明白一个很严重的问题，

问题一：

例如我在一段时间内需要存储 1亿以上个字符串

难道在定义Hash存储区域的时候必须要先定义一个长度为1亿以上的数组，然后再根据Hash函数计算出 hash值在这1亿多个元素的数组中的某一位置，然后插入？

如果不需要定义1亿长度的数组的话，那我们生成出来的int值和字符串又该放在哪里呢？无序存放，就是生成一个放一个，但是那样的话以后查找的话用什么索引这个位置呢？

以上1亿只是一种假设。

问题二：

网上虽然关于Hash的文章确实有不少，但大多都是千篇一律并且主要讨论的是 “碰撞” 问题
但我不明白的是例如像上面的那些算法，这些算法的操作过程是凭感觉或者猜测或者运气创造出来的吗？

“凭感觉或者猜测或者运气创造出来”注解：这里用一个某位网友自己尝试写一个 hash函数的过程来解释这句话：
比如某位网友考虑过程：
1.Hash算法最简单的就是将字符串字符值进行相加



/**
 * First hash variant: for every character of the trimmed input,
 * {@code hash = ch + (hash << 8) + (hash << 16)}.
 *
 * @param str input string; leading and trailing whitespace is removed
 *            with {@code trim()} before hashing
 * @return the accumulated value with the sign bit cleared, so the result
 *         is never negative; 0 for an empty (or all-whitespace) string
 */
private int FirstHash(String str){
    int acc = 0;
    for (char ch : str.trim().toCharArray()) {
        acc = ch + (acc << 8) + (acc << 16);
    }
    // Clear the sign bit.
    return acc & 0x7FFFFFFF;
}

2.考虑到例如字符串 Hash("abcdefg")=Hash("gfedcba")
所以行不通
于是改成



private int FirstHash(String str){  

        char[] chars = str.trim().toCharArray();  

        int hash = 0;  

        int count = 0;  

        int length = chars.length;  

        while (count < length) {  

         hash = (int) chars[count] + (hash << 8) + (hash << 16) – hash;  

         count++;   

        return hash & 0x7FFFFFFF;  

}

引用原文
【既然扯关系，那不如把关系扯得错综复杂一点，然后就得到设计3：把“hash<<8”改为“hash<<7”,即标号4的低位模块的数据没有全部移到标号3的模块，而是插一腿到原来自己呆的模块。此时，额，已经很难追寻踪迹了。。。】
于是又得到



private int FirstHash(String str){  

        char[] chars = str.trim().toCharArray();  

        int hash = 0;  

        int count = 0;  

        int length = chars.length;  

        while (count < length) {  

          hash = (int) chars[count] + (hash << 7) + (hash << 16) – hash;  

          count++;  

        return hash & 0x7FFFFFFF; 

}

引用原文【怎么又加上“hash<<24”呢？很简单，就是让hash更复杂，但这个复杂是有道理的，目的就是让标号1和2的模块也与存入的整个字符串扯上关系】
于是又得到



/**
 * Takes {@code s.hashCode()} and spreads its higher bits downwards with
 * two rounds of unsigned right shifts and XORs.
 *
 * @param s string to hash; must not be null, since {@code hashCode()} is
 *          called on it
 * @return the mixed hash value (may be negative)
 */
private int SystemHash(String s) {
    final int raw = s.hashCode();
    // First round: fold in the bits shifted down by 20 and by 12.
    final int spread = raw ^ (raw >>> 20) ^ (raw >>> 12);
    // Second round: fold in the bits shifted down by 7 and by 4.
    return spread ^ (spread >>> 7) ^ (spread >>> 4);
}

全文见：字符串哈希的研究

难道 Hash（散列）就是这么产生的？依靠猜测，推断？

...全文