This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
add a hardened one-at-a-time hash variant
[perl5.git] / hv_func.h
CommitLineData
4d3a042d
YO
1/* hash a key
2 *--------------------------------------------------------------------------------------
3 * The "hash seed" feature was added in Perl 5.8.1 to perturb the results
4 * to avoid "algorithmic complexity attacks".
5 *
6 * If USE_HASH_SEED is defined, hash randomisation is done by default
7 * If USE_HASH_SEED_EXPLICIT is defined, hash randomisation is done
8 * only if the environment variable PERL_HASH_SEED is set.
9 * (see also perl.c:perl_parse() and S_init_tls_and_interp() and util.c:get_hash_seed())
10 */
11
12#ifndef PERL_SEEN_HV_FUNC_H /* compile once */
13#define PERL_SEEN_HV_FUNC_H
14
15#if !( 0 \
16 || defined(PERL_HASH_FUNC_SDBM) \
17 || defined(PERL_HASH_FUNC_DJB2) \
18 || defined(PERL_HASH_FUNC_SUPERFAST) \
19 || defined(PERL_HASH_FUNC_MURMUR3) \
20 || defined(PERL_HASH_FUNC_ONE_AT_A_TIME) \
b1300a73 21 || defined(PERL_HASH_FUNC_ONE_AT_A_TIME_HARD) \
4d3a042d
YO
22 || defined(PERL_HASH_FUNC_ONE_AT_A_TIME_OLD) \
23 )
24#ifdef HAS_QUAD
25#define PERL_HASH_FUNC_SIPHASH
26#else
b1300a73 27#define PERL_HASH_FUNC_ONE_AT_A_TIME_HARD
4d3a042d
YO
28#endif
29#endif
30
31#if defined(PERL_HASH_FUNC_SIPHASH)
32# define PERL_HASH_FUNC "SIPHASH_2_4"
33# define PERL_HASH_SEED_BYTES 16
34# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_siphash_2_4(PERL_HASH_SEED,(U8*)(str),(len))
35#elif defined(PERL_HASH_FUNC_SUPERFAST)
36# define PERL_HASH_FUNC "SUPERFAST"
37# define PERL_HASH_SEED_BYTES 4
38# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_superfast(PERL_HASH_SEED,(U8*)(str),(len))
39#elif defined(PERL_HASH_FUNC_MURMUR3)
40# define PERL_HASH_FUNC "MURMUR3"
41# define PERL_HASH_SEED_BYTES 4
42# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_murmur3(PERL_HASH_SEED,(U8*)(str),(len))
43#elif defined(PERL_HASH_FUNC_DJB2)
44# define PERL_HASH_FUNC "DJB2"
45# define PERL_HASH_SEED_BYTES 4
46# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_djb2(PERL_HASH_SEED,(U8*)(str),(len))
47#elif defined(PERL_HASH_FUNC_SDBM)
48# define PERL_HASH_FUNC "SDBM"
49# define PERL_HASH_SEED_BYTES 4
50# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_sdbm(PERL_HASH_SEED,(U8*)(str),(len))
b1300a73
YO
51#elif defined(PERL_HASH_FUNC_ONE_AT_A_TIME_HARD)
52# define PERL_HASH_FUNC "ONE_AT_A_TIME_HARD"
53# define PERL_HASH_SEED_BYTES 8
54# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_one_at_a_time_hard(PERL_HASH_SEED,(U8*)(str),(len))
4d3a042d
YO
55#elif defined(PERL_HASH_FUNC_ONE_AT_A_TIME)
56# define PERL_HASH_FUNC "ONE_AT_A_TIME"
57# define PERL_HASH_SEED_BYTES 4
58# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_one_at_a_time(PERL_HASH_SEED,(U8*)(str),(len))
59#elif defined(PERL_HASH_FUNC_ONE_AT_A_TIME_OLD)
60# define PERL_HASH_FUNC "ONE_AT_A_TIME_OLD"
61# define PERL_HASH_SEED_BYTES 4
62# define PERL_HASH(hash,str,len) (hash)= S_perl_hash_old_one_at_a_time(PERL_HASH_SEED,(U8*)(str),(len))
63#endif
64
65#ifndef PERL_HASH
66#error "No hash function defined!"
67#endif
68#ifndef PERL_HASH_SEED_BYTES
69#error "PERL_HASH_SEED_BYTES not defined"
70#endif
71#ifndef PERL_HASH_FUNC
72#error "PERL_HASH_FUNC not defined"
73#endif
74
75#ifndef PERL_HASH_SEED
76# if defined(USE_HASH_SEED) || defined(USE_HASH_SEED_EXPLICIT)
77# define PERL_HASH_SEED PL_hash_seed
78# elif PERL_HASH_SEED_BYTES == 4
79# define PERL_HASH_SEED "PeRl"
80# elif PERL_HASH_SEED_BYTES == 16
81# define PERL_HASH_SEED "PeRlHaShhAcKpErl"
82# else
83# error "No PERL_HASH_SEED definition for " PERL_HASH_FUNC
84# endif
85#endif
86
87/*-----------------------------------------------------------------------------
88 * Endianess, misalignment capabilities and util macros
89 *
90 * The following 3 macros are defined in this section. The other macros defined
91 * are only needed to help derive these 3.
92 *
93 * U8TO32_LE(x) Read a little endian unsigned 32-bit int
94 * UNALIGNED_SAFE Defined if READ_UINT32 works on non-word boundaries
95 * ROTL32(x,r) Rotate x left by r bits
96 */
97
98#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
99 || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
100#define U8TO16_LE(d) (*((const U16 *) (d)))
101#endif
102
103#if !defined (U8TO16_LE)
104#define U8TO16_LE(d) ((((const U8 *)(d))[1] << 8)\
105 +((const U8 *)(d))[0])
106#endif
107
108
109/* Now find best way we can to READ_UINT32 */
110#if (BYTEORDER == 0x1234 || BYTEORDER == 0x12345678) && U32SIZE == 4
111 /* CPU endian matches murmurhash algorithm, so read 32-bit word directly */
112 #define U8TO32_LE(ptr) (*((U32*)(ptr)))
113#elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
114 /* TODO: Add additional cases below where a compiler provided bswap32 is available */
115 #if defined(__GNUC__) && (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=3))
116 #define U8TO32_LE(ptr) (__builtin_bswap32(*((U32*)(ptr))))
117 #else
118 /* Without a known fast bswap32 we're just as well off doing this */
119 #define U8TO32_LE(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24)
120 #define UNALIGNED_SAFE
121 #endif
122#else
123 /* Unknown endianess so last resort is to read individual bytes */
124 #define U8TO32_LE(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24)
125 /* Since we're not doing word-reads we can skip the messing about with realignment */
126 #define UNALIGNED_SAFE
127#endif
128
129/* Find best way to ROTL32 */
130#if defined(_MSC_VER)
131 #include <stdlib.h> /* Microsoft put _rotl declaration in here */
132 #define ROTL32(x,r) _rotl(x,r)
133#else
134 /* gcc recognises this code and generates a rotate instruction for CPUs with one */
135 #define ROTL32(x,r) (((U32)x << r) | ((U32)x >> (32 - r)))
136#endif
137
138
139/* This is SipHash by Jean-Philippe Aumasson and Daniel J. Bernstein.
140 * The authors claim it is relatively secure compared to the alternatives
141 * and that performance wise it is a suitable hash for languages like Perl.
142 * See:
143 *
144 * https://www.131002.net/siphash/
145 *
146 * This implementation seems to perform slightly slower than one-at-a-time for
147 * short keys, but degrades slower for longer keys. Murmur Hash outperforms it
148 * regardless of keys size.
149 *
150 * It is 64 bit only.
151 */
152
153#ifdef HAS_QUAD
154
155#ifndef U64TYPE
156/* This probably isn't going to work, but failing with a compiler error due to
157 lack of uint64_t is no worse than failing right now with an #error. */
158#define U64TYPE uint64_t
159#endif
160
161
162#define ROTL64(x,b) (U64TYPE)( ((x) << (b)) | ( (x) >> (64 - (b))) )
163
164#define U8TO64_LE(p) \
165 (((U64TYPE)((p)[0]) ) | \
166 ((U64TYPE)((p)[1]) << 8) | \
167 ((U64TYPE)((p)[2]) << 16) | \
168 ((U64TYPE)((p)[3]) << 24) | \
169 ((U64TYPE)((p)[4]) << 32) | \
170 ((U64TYPE)((p)[5]) << 40) | \
171 ((U64TYPE)((p)[6]) << 48) | \
172 ((U64TYPE)((p)[7]) << 56))
173
174#define SIPROUND \
175 do { \
176 v0 += v1; v1=ROTL64(v1,13); v1 ^= v0; v0=ROTL64(v0,32); \
177 v2 += v3; v3=ROTL64(v3,16); v3 ^= v2; \
178 v0 += v3; v3=ROTL64(v3,21); v3 ^= v0; \
179 v2 += v1; v1=ROTL64(v1,17); v1 ^= v2; v2=ROTL64(v2,32); \
180 } while(0)
181
182/* SipHash-2-4 */
183
184PERL_STATIC_INLINE U32
185S_perl_hash_siphash_2_4(const unsigned char * const seed, const unsigned char *in, const STRLEN inlen) {
186 /* "somepseudorandomlygeneratedbytes" */
187 U64TYPE v0 = 0x736f6d6570736575ULL;
188 U64TYPE v1 = 0x646f72616e646f6dULL;
189 U64TYPE v2 = 0x6c7967656e657261ULL;
190 U64TYPE v3 = 0x7465646279746573ULL;
191
192 U64TYPE b;
193 U64TYPE k0 = ((U64TYPE*)seed)[0];
194 U64TYPE k1 = ((U64TYPE*)seed)[1];
195 U64TYPE m;
196 const int left = inlen & 7;
197 const U8 *end = in + inlen - left;
198
199 b = ( ( U64TYPE )(inlen) ) << 56;
200 v3 ^= k1;
201 v2 ^= k0;
202 v1 ^= k1;
203 v0 ^= k0;
204
205 for ( ; in != end; in += 8 )
206 {
207 m = U8TO64_LE( in );
208 v3 ^= m;
209 SIPROUND;
210 SIPROUND;
211 v0 ^= m;
212 }
213
214 switch( left )
215 {
216 case 7: b |= ( ( U64TYPE )in[ 6] ) << 48;
217 case 6: b |= ( ( U64TYPE )in[ 5] ) << 40;
218 case 5: b |= ( ( U64TYPE )in[ 4] ) << 32;
219 case 4: b |= ( ( U64TYPE )in[ 3] ) << 24;
220 case 3: b |= ( ( U64TYPE )in[ 2] ) << 16;
221 case 2: b |= ( ( U64TYPE )in[ 1] ) << 8;
222 case 1: b |= ( ( U64TYPE )in[ 0] ); break;
223 case 0: break;
224 }
225
226 v3 ^= b;
227 SIPROUND;
228 SIPROUND;
229 v0 ^= b;
230
231 v2 ^= 0xff;
232 SIPROUND;
233 SIPROUND;
234 SIPROUND;
235 SIPROUND;
236 b = v0 ^ v1 ^ v2 ^ v3;
237 return (U32)(b & U32_MAX);
238}
239#endif /* defined(HAS_QUAD) */
240
241/* FYI: This is the "Super-Fast" algorithm mentioned by Bob Jenkins in
242 * (http://burtleburtle.net/bob/hash/doobs.html)
243 * It is by Paul Hsieh (c) 2004 and is analysed here
244 * http://www.azillionmonkeys.com/qed/hash.html
245 * license terms are here:
246 * http://www.azillionmonkeys.com/qed/weblicense.html
247 */
248
249
250PERL_STATIC_INLINE U32
251S_perl_hash_superfast(const unsigned char * const seed, const unsigned char *str, STRLEN len) {
252 U32 hash = *((U32*)seed) + len;
253 U32 tmp;
254 int rem= len & 3;
255 len >>= 2;
256
257 for (;len > 0; len--) {
258 hash += U8TO16_LE (str);
259 tmp = (U8TO16_LE (str+2) << 11) ^ hash;
260 hash = (hash << 16) ^ tmp;
261 str += 2 * sizeof (U16);
262 hash += hash >> 11;
263 }
264
265 /* Handle end cases */
266 switch (rem) { \
267 case 3: hash += U8TO16_LE (str);
268 hash ^= hash << 16;
269 hash ^= str[sizeof (U16)] << 18;
270 hash += hash >> 11;
271 break;
272 case 2: hash += U8TO16_LE (str);
273 hash ^= hash << 11;
274 hash += hash >> 17;
275 break;
276 case 1: hash += *str;
277 hash ^= hash << 10;
278 hash += hash >> 1;
279 }
280 /* Force "avalanching" of final 127 bits */
281 hash ^= hash << 3;
282 hash += hash >> 5;
283 hash ^= hash << 4;
284 hash += hash >> 17;
285 hash ^= hash << 25;
286 return (hash + (hash >> 6));
287}
288
289
290/*-----------------------------------------------------------------------------
291 * MurmurHash3 was written by Austin Appleby, and is placed in the public
292 * domain.
293 *
294 * This implementation was originally written by Shane Day, and is also public domain,
295 * and was modified to function as a macro similar to other perl hash functions by
296 * Yves Orton.
297 *
298 * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A)
299 * with support for progressive processing.
300 *
301 * If you want to understand the MurmurHash algorithm you would be much better
302 * off reading the original source. Just point your browser at:
303 * http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
304 *
305 * How does it work?
306 *
307 * We can only process entire 32 bit chunks of input, except for the very end
308 * that may be shorter.
309 *
310 * To handle endianess I simply use a macro that reads a U32 and define
311 * that macro to be a direct read on little endian machines, a read and swap
312 * on big endian machines, or a byte-by-byte read if the endianess is unknown.
313 */
314
315
316/*-----------------------------------------------------------------------------
317 * Core murmurhash algorithm macros */
318
319#define MURMUR_C1 (0xcc9e2d51)
320#define MURMUR_C2 (0x1b873593)
321#define MURMUR_C3 (0xe6546b64)
322#define MURMUR_C4 (0x85ebca6b)
323#define MURMUR_C5 (0xc2b2ae35)
324
325/* This is the main processing body of the algorithm. It operates
326 * on each full 32-bits of input. */
327#define MURMUR_DOBLOCK(h1, k1) STMT_START { \
328 k1 *= MURMUR_C1; \
329 k1 = ROTL32(k1,15); \
330 k1 *= MURMUR_C2; \
331 \
332 h1 ^= k1; \
333 h1 = ROTL32(h1,13); \
334 h1 = h1 * 5 + MURMUR_C3; \
335} STMT_END
336
337
338/* Append unaligned bytes to carry, forcing hash churn if we have 4 bytes */
339/* cnt=bytes to process, h1=name of h1 var, c=carry, n=bytes in c, ptr/len=payload */
340#define MURMUR_DOBYTES(cnt, h1, c, n, ptr, len) STMT_START { \
341 int MURMUR_DOBYTES_i = cnt; \
342 while(MURMUR_DOBYTES_i--) { \
343 c = c>>8 | *ptr++<<24; \
344 n++; len--; \
345 if(n==4) { \
346 MURMUR_DOBLOCK(h1, c); \
347 n = 0; \
348 } \
349 } \
350} STMT_END
351
352
353/* now we create the hash function */
354PERL_STATIC_INLINE U32
355S_perl_hash_murmur3(const unsigned char * const seed, const unsigned char *ptr, STRLEN len) {
356 U32 h1 = *((U32*)seed);
357 U32 k1;
358 U32 carry = 0;
359
360 const unsigned char *end;
361 int bytes_in_carry = 0; /* bytes in carry */
362 I32 total_length= len;
363
364#if defined(UNALIGNED_SAFE)
365 /* Handle carry: commented out as its only used in incremental mode - it never fires for us
366 int i = (4-n) & 3;
367 if(i && i <= len) {
368 MURMUR_DOBYTES(i, h1, carry, bytes_in_carry, ptr, len);
369 }
370 */
371
372 /* This CPU handles unaligned word access */
373 /* Process 32-bit chunks */
374 end = ptr + len/4*4;
375 for( ; ptr < end ; ptr+=4) {
376 k1 = U8TO32_LE(ptr);
377 MURMUR_DOBLOCK(h1, k1);
378 }
379#else
380 /* This CPU does not handle unaligned word access */
381
382 /* Consume enough so that the next data byte is word aligned */
383 int i = -(long)ptr & 3;
384 if(i && i <= len) {
385 MURMUR_DOBYTES(i, h1, carry, bytes_in_carry, ptr, len);
386 }
387
388 /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */
389 end = ptr + len/4*4;
390 switch(bytes_in_carry) { /* how many bytes in carry */
391 case 0: /* c=[----] w=[3210] b=[3210]=w c'=[----] */
392 for( ; ptr < end ; ptr+=4) {
393 k1 = U8TO32_LE(ptr);
394 MURMUR_DOBLOCK(h1, k1);
395 }
396 break;
397 case 1: /* c=[0---] w=[4321] b=[3210]=c>>24|w<<8 c'=[4---] */
398 for( ; ptr < end ; ptr+=4) {
399 k1 = carry>>24;
400 carry = U8TO32_LE(ptr);
401 k1 |= carry<<8;
402 MURMUR_DOBLOCK(h1, k1);
403 }
404 break;
405 case 2: /* c=[10--] w=[5432] b=[3210]=c>>16|w<<16 c'=[54--] */
406 for( ; ptr < end ; ptr+=4) {
407 k1 = carry>>16;
408 carry = U8TO32_LE(ptr);
409 k1 |= carry<<16;
410 MURMUR_DOBLOCK(h1, k1);
411 }
412 break;
413 case 3: /* c=[210-] w=[6543] b=[3210]=c>>8|w<<24 c'=[654-] */
414 for( ; ptr < end ; ptr+=4) {
415 k1 = carry>>8;
416 carry = U8TO32_LE(ptr);
417 k1 |= carry<<24;
418 MURMUR_DOBLOCK(h1, k1);
419 }
420 }
421#endif
422 /* Advance over whole 32-bit chunks, possibly leaving 1..3 bytes */
423 len -= len/4*4;
424
425 /* Append any remaining bytes into carry */
426 MURMUR_DOBYTES(len, h1, carry, bytes_in_carry, ptr, len);
427
428 if (bytes_in_carry) {
429 k1 = carry >> ( 4 - bytes_in_carry ) * 8;
430 k1 *= MURMUR_C1;
431 k1 = ROTL32(k1,15);
432 k1 *= MURMUR_C2;
433 h1 ^= k1;
434 }
435 h1 ^= total_length;
436
437 /* fmix */
438 h1 ^= h1 >> 16;
439 h1 *= MURMUR_C4;
440 h1 ^= h1 >> 13;
441 h1 *= MURMUR_C5;
442 h1 ^= h1 >> 16;
443 return h1;
444}
445
446
447PERL_STATIC_INLINE U32
448S_perl_hash_djb2(const unsigned char * const seed, const unsigned char *str, const STRLEN len) {
449 const unsigned char * const end = (const unsigned char *)str + len;
450 U32 hash = *((U32*)seed + len);
451 while (str < end) {
452 hash = ((hash << 5) + hash) + *str++;
453 }
454 return hash;
455}
456
457PERL_STATIC_INLINE U32
458S_perl_hash_sdbm(const unsigned char * const seed, const unsigned char *str, const STRLEN len) {
459 const unsigned char * const end = (const unsigned char *)str + len;
460 U32 hash = *((U32*)seed + len);
461 while (str < end) {
462 hash = (hash << 6) + (hash << 16) - hash + *str++;
463 }
464 return hash;
465}
466
467
b1300a73 468/* This is the "One-at-a-Time" algorithm by Bob Jenkins
4d3a042d 469 * from requirements by Colin Plumb.
b1300a73
YO
470 * (http://burtleburtle.net/bob/hash/doobs.html)
471 * With seed/len tweak.
472 * */
4d3a042d
YO
473PERL_STATIC_INLINE U32
474S_perl_hash_one_at_a_time(const unsigned char * const seed, const unsigned char *str, const STRLEN len) {
475 const unsigned char * const end = (const unsigned char *)str + len;
476 U32 hash = *((U32*)seed) + len;
477 while (str < end) {
478 hash += *str++;
479 hash += (hash << 10);
480 hash ^= (hash >> 6);
481 }
482 hash += (hash << 3);
483 hash ^= (hash >> 11);
484 return (hash + (hash << 15));
485}
486
b1300a73
YO
487/* Derived from "One-at-a-Time" algorithm by Bob Jenkins */
488PERL_STATIC_INLINE U32
489S_perl_hash_one_at_a_time_hard(const unsigned char * const seed, const unsigned char *str, const STRLEN len) {
490 const unsigned char * const end = (const unsigned char *)str + len;
491 U32 hash = *((U32*)seed) + len;
492
493 while (str < end) {
494 hash += (hash << 10);
495 hash ^= (hash >> 6);
496 hash += *str++;
497 }
498
499 hash += (hash << 10);
500 hash ^= (hash >> 6);
501 hash += seed[4];
502
503 hash += (hash << 10);
504 hash ^= (hash >> 6);
505 hash += seed[5];
506
507 hash += (hash << 10);
508 hash ^= (hash >> 6);
509 hash += seed[6];
510
511 hash += (hash << 10);
512 hash ^= (hash >> 6);
513 hash += seed[7];
514
515 hash += (hash << 10);
516 hash ^= (hash >> 6);
517
518 hash += (hash << 3);
519 hash ^= (hash >> 11);
520 return (hash + (hash << 15));
521}
522
4d3a042d
YO
523PERL_STATIC_INLINE U32
524S_perl_hash_old_one_at_a_time(const unsigned char * const seed, const unsigned char *str, const STRLEN len) {
525 const unsigned char * const end = (const unsigned char *)str + len;
526 U32 hash = *((U32*)seed);
527 while (str < end) {
528 hash += *str++;
529 hash += (hash << 10);
530 hash ^= (hash >> 6);
531 }
532 hash += (hash << 3);
533 hash ^= (hash >> 11);
534 return (hash + (hash << 15));
535}
536
537/* legacy - only mod_perl should be doing this. */
538#ifdef PERL_HASH_INTERNAL_ACCESS
539#define PERL_HASH_INTERNAL(hash,str,len) PERL_HASH(hash,str,len)
540#endif
541
542#endif /*compile once*/
543
544/*
545 * Local variables:
546 * c-indentation-style: bsd
547 * c-basic-offset: 4
548 * indent-tabs-mode: nil
549 * End:
550 *
551 * ex: set ts=8 sts=4 sw=4 et:
552 */