maphash.go

Documentation: hash/maphash

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences.
     6  // These hash functions are intended to be used to implement hash tables or
     7  // other data structures that need to map arbitrary strings or byte
     8  // sequences to a uniform distribution on unsigned 64-bit integers.
     9  // Each different instance of a hash table or data structure should use its own Seed.
    10  //
    11  // The hash functions are not cryptographically secure.
    12  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    13  package maphash
    14  
    15  import (
    16  	"internal/unsafeheader"
    17  	"unsafe"
    18  )
    19  
    20  // A Seed is a random value that selects the specific hash function
    21  // computed by a Hash. If two Hashes use the same Seeds, they
    22  // will compute the same hash values for any given input.
    23  // If two Hashes use different Seeds, they are very likely to compute
    24  // distinct hash values for any given input.
    25  //
    26  // A Seed must be initialized by calling MakeSeed.
    27  // The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
    28  //
    29  // Each Seed value is local to a single process and cannot be serialized
    30  // or otherwise recreated in a different process.
    31  type Seed struct {
    32  	s uint64
    33  }
    34  
    35  // Bytes returns the hash of b with the given seed.
    36  //
    37  // Bytes is equivalent to, but more convenient and efficient than:
    38  //
    39  //	var h Hash
    40  //	h.SetSeed(seed)
    41  //	h.Write(b)
    42  //	return h.Sum64()
    43  func Bytes(seed Seed, b []byte) uint64 {
    44  	state := seed.s
    45  	if state == 0 {
    46  		panic("maphash: use of uninitialized Seed")
    47  	}
    48  	if len(b) == 0 {
    49  		return rthash(nil, 0, state) // avoid &b[0] index panic below
    50  	}
    51  	if len(b) > bufSize {
    52  		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
    53  		for len(b) > bufSize {
    54  			state = rthash(&b[0], bufSize, state)
    55  			b = b[bufSize:]
    56  		}
    57  	}
    58  	return rthash(&b[0], len(b), state)
    59  }
    60  
    61  // String returns the hash of s with the given seed.
    62  //
    63  // String is equivalent to, but more convenient and efficient than:
    64  //
    65  //	var h Hash
    66  //	h.SetSeed(seed)
    67  //	h.WriteString(s)
    68  //	return h.Sum64()
    69  func String(seed Seed, s string) uint64 {
    70  	state := seed.s
    71  	if state == 0 {
    72  		panic("maphash: use of uninitialized Seed")
    73  	}
    74  	for len(s) > bufSize {
    75  		p := (*byte)((*unsafeheader.String)(unsafe.Pointer(&s)).Data)
    76  		state = rthash(p, bufSize, state)
    77  		s = s[bufSize:]
    78  	}
    79  	p := (*byte)((*unsafeheader.String)(unsafe.Pointer(&s)).Data)
    80  	return rthash(p, len(s), state)
    81  }
    82  
    83  // A Hash computes a seeded hash of a byte sequence.
    84  //
    85  // The zero Hash is a valid Hash ready to use.
    86  // A zero Hash chooses a random seed for itself during
    87  // the first call to a Reset, Write, Seed, or Sum64 method.
    88  // For control over the seed, use SetSeed.
    89  //
    90  // The computed hash values depend only on the initial seed and
    91  // the sequence of bytes provided to the Hash object, not on the way
    92  // in which the bytes are provided. For example, the three sequences
    93  //
    94  //	h.Write([]byte{'f','o','o'})
    95  //	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    96  //	h.WriteString("foo")
    97  //
    98  // all have the same effect.
    99  //
   100  // Hashes are intended to be collision-resistant, even for situations
   101  // where an adversary controls the byte sequences being hashed.
   102  //
   103  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
   104  // If multiple goroutines must compute the same seeded hash,
   105  // each can declare its own Hash and call SetSeed with a common Seed.
   106  type Hash struct {
   107  	_     [0]func()     // not comparable
   108  	seed  Seed          // initial seed used for this hash
   109  	state Seed          // current hash of all flushed bytes
   110  	buf   [bufSize]byte // unflushed byte buffer
   111  	n     int           // number of unflushed bytes
   112  }
   113  
   114  // bufSize is the size of the Hash write buffer.
   115  // The buffer ensures that writes depend only on the sequence of bytes,
   116  // not the sequence of WriteByte/Write/WriteString calls,
   117  // by always calling rthash with a full buffer (except for the tail).
   118  const bufSize = 128
   119  
   120  // initSeed seeds the hash if necessary.
   121  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
   122  // Note that this does not include Write/WriteByte/WriteString in the case
   123  // where they only add to h.buf. (If they write too much, they call h.flush,
   124  // which does call h.initSeed.)
   125  func (h *Hash) initSeed() {
   126  	if h.seed.s == 0 {
   127  		seed := MakeSeed()
   128  		h.seed = seed
   129  		h.state = seed
   130  	}
   131  }
   132  
   133  // WriteByte adds b to the sequence of bytes hashed by h.
   134  // It never fails; the error result is for implementing io.ByteWriter.
   135  func (h *Hash) WriteByte(b byte) error {
   136  	if h.n == len(h.buf) {
   137  		h.flush()
   138  	}
   139  	h.buf[h.n] = b
   140  	h.n++
   141  	return nil
   142  }
   143  
   144  // Write adds b to the sequence of bytes hashed by h.
   145  // It always writes all of b and never fails; the count and error result are for implementing io.Writer.
   146  func (h *Hash) Write(b []byte) (int, error) {
   147  	size := len(b)
   148  	// Deal with bytes left over in h.buf.
   149  	// h.n <= bufSize is always true.
   150  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   151  	if h.n > 0 && h.n <= bufSize {
   152  		k := copy(h.buf[h.n:], b)
   153  		h.n += k
   154  		if h.n < bufSize {
   155  			// Copied the entirety of b to h.buf.
   156  			return size, nil
   157  		}
   158  		b = b[k:]
   159  		h.flush()
   160  		// No need to set h.n = 0 here; it happens just before exit.
   161  	}
   162  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   163  	if len(b) > bufSize {
   164  		h.initSeed()
   165  		for len(b) > bufSize {
   166  			h.state.s = rthash(&b[0], bufSize, h.state.s)
   167  			b = b[bufSize:]
   168  		}
   169  	}
   170  	// Copy the tail.
   171  	copy(h.buf[:], b)
   172  	h.n = len(b)
   173  	return size, nil
   174  }
   175  
   176  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   177  // It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
   178  func (h *Hash) WriteString(s string) (int, error) {
   179  	// WriteString mirrors Write. See Write for comments.
   180  	size := len(s)
   181  	if h.n > 0 && h.n <= bufSize {
   182  		k := copy(h.buf[h.n:], s)
   183  		h.n += k
   184  		if h.n < bufSize {
   185  			return size, nil
   186  		}
   187  		s = s[k:]
   188  		h.flush()
   189  	}
   190  	if len(s) > bufSize {
   191  		h.initSeed()
   192  		for len(s) > bufSize {
   193  			ptr := (*byte)((*unsafeheader.String)(unsafe.Pointer(&s)).Data)
   194  			h.state.s = rthash(ptr, bufSize, h.state.s)
   195  			s = s[bufSize:]
   196  		}
   197  	}
   198  	copy(h.buf[:], s)
   199  	h.n = len(s)
   200  	return size, nil
   201  }
   202  
   203  // Seed returns h's seed value.
   204  func (h *Hash) Seed() Seed {
   205  	h.initSeed()
   206  	return h.seed
   207  }
   208  
   209  // SetSeed sets h to use seed, which must have been returned by MakeSeed
   210  // or by another Hash's Seed method.
   211  // Two Hash objects with the same seed behave identically.
   212  // Two Hash objects with different seeds will very likely behave differently.
   213  // Any bytes added to h before this call will be discarded.
   214  func (h *Hash) SetSeed(seed Seed) {
   215  	if seed.s == 0 {
   216  		panic("maphash: use of uninitialized Seed")
   217  	}
   218  	h.seed = seed
   219  	h.state = seed
   220  	h.n = 0
   221  }
   222  
   223  // Reset discards all bytes added to h.
   224  // (The seed remains the same.)
   225  func (h *Hash) Reset() {
   226  	h.initSeed()
   227  	h.state = h.seed
   228  	h.n = 0
   229  }
   230  
   231  // precondition: buffer is full.
   232  func (h *Hash) flush() {
   233  	if h.n != len(h.buf) {
   234  		panic("maphash: flush of partially full buffer")
   235  	}
   236  	h.initSeed()
   237  	h.state.s = rthash(&h.buf[0], h.n, h.state.s)
   238  	h.n = 0
   239  }
   240  
   241  // Sum64 returns h's current 64-bit value, which depends on
   242  // h's seed and the sequence of bytes added to h since the
   243  // last call to Reset or SetSeed.
   244  //
   245  // All bits of the Sum64 result are close to uniformly and
   246  // independently distributed, so it can be safely reduced
   247  // by using bit masking, shifting, or modular arithmetic.
   248  func (h *Hash) Sum64() uint64 {
   249  	h.initSeed()
   250  	return rthash(&h.buf[0], h.n, h.state.s)
   251  }
   252  
   253  // MakeSeed returns a new random seed.
   254  func MakeSeed() Seed {
   255  	var s uint64
   256  	for {
   257  		s = runtime_fastrand64()
   258  		// We use seed 0 to indicate an uninitialized seed/hash,
   259  		// so keep trying until we get a non-zero seed.
   260  		if s != 0 {
   261  			break
   262  		}
   263  	}
   264  	return Seed{s: s}
   265  }
   266  
   267  //go:linkname runtime_fastrand64 runtime.fastrand64
   268  func runtime_fastrand64() uint64
   269  
   270  func rthash(ptr *byte, len int, seed uint64) uint64 {
   271  	if len == 0 {
   272  		return seed
   273  	}
   274  	// The runtime hasher only works on uintptr. For 64-bit
   275  	// architectures, we use the hasher directly. Otherwise,
   276  	// we use two parallel hashers on the lower and upper 32 bits.
   277  	if unsafe.Sizeof(uintptr(0)) == 8 {
   278  		return uint64(runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len)))
   279  	}
   280  	lo := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len))
   281  	hi := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed>>32), uintptr(len))
   282  	return uint64(hi)<<32 | uint64(lo)
   283  }
   284  
   285  //go:linkname runtime_memhash runtime.memhash
   286  //go:noescape
   287  func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
   288  
   289  // Sum appends the hash's current 64-bit value to b.
   290  // It exists for implementing hash.Hash.
   291  // For direct calls, it is more efficient to use Sum64.
   292  func (h *Hash) Sum(b []byte) []byte {
   293  	x := h.Sum64()
   294  	return append(b,
   295  		byte(x>>0),
   296  		byte(x>>8),
   297  		byte(x>>16),
   298  		byte(x>>24),
   299  		byte(x>>32),
   300  		byte(x>>40),
   301  		byte(x>>48),
   302  		byte(x>>56))
   303  }
   304  
   305  // Size returns h's hash value size, 8 bytes.
   306  func (h *Hash) Size() int { return 8 }
   307  
   308  // BlockSize returns h's block size.
   309  func (h *Hash) BlockSize() int { return len(h.buf) }
   310
View as plain text