|
/*                     __                                               *\
**     ________ ___   / /  ___     Scala API                            **
**    / __/ __// _ | / /  / _ |    (c) 2003-2011, LAMP/EPFL             **
**  __\ \/ /__/ __ |/ /__/ __ |    http://scala-lang.org/               **
** /____/\___/_/ |_/____/_/ | |                                         **
**                          |/                                          **
\*                                                                      */
| 8 | + |
| 9 | +package scala.util |
| 10 | + |
| 11 | +/** An implementation of Austin Appleby's MurmurHash 3.0 algorithm |
| 12 | + * (32 bit version); reference: http://code.google.com/p/smhasher |
| 13 | + * |
| 14 | + * This is the hash used by collections and case classes (including |
| 15 | + * tuples). |
| 16 | + * |
| 17 | + * @author Rex Kerr |
| 18 | + * @version 2.9 |
| 19 | + * @since 2.9 |
| 20 | + */ |
| 21 | + |
| 22 | +import java.lang.Integer.{ rotateLeft => rotl } |
| 23 | +import scala.collection.Iterator |
| 24 | + |
/** A stateful accumulator for well-distributed, non-cryptographic hashes.
 *
 *  Because it extends `T => Unit`, an instance can be handed directly to a
 *  collection's `foreach` method; hash values that are already known can be
 *  fed in with `append`. Its own `hashCode` is the hash of everything that
 *  has been incorporated so far.
 *
 *  @tparam T    the type of the items whose `##` values are incorporated
 *  @param  seed the seed used to start (and to `reset`) the hash
 */
class MurmurHash[@specialized(Int,Long,Float,Double) T](seed: Int) extends (T => Unit) {
  import MurmurHash._

  // Running (not yet finalized) hash and the current position in each of
  // the two magic-integer streams.
  private var h = startHash(seed)
  private var c = hiddenMagicA
  private var k = hiddenMagicB

  // `hashvalue` caches the finalized hash; it is only meaningful while
  // `hashed` is true, and every mutation clears `hashed`.
  private var hashed = false
  private var hashvalue = h

  /** Begin a new hash using the same seed. */
  def reset(): Unit = {
    h = startHash(seed)
    c = hiddenMagicA
    k = hiddenMagicB
    hashed = false
  }

  /** Incorporate the hash value (`##`) of one item.
   *
   *  @param t the item to incorporate
   */
  def apply(t: T): Unit = append(t.##)

  /** Incorporate a known hash value.
   *
   *  @param i the hash value to mix into the running hash
   */
  def append(i: Int): Unit = {
    h = extendHash(h, i, c, k)
    // Advance both magic streams so the next value is mixed differently.
    c = nextMagicA(c)
    k = nextMagicB(k)
    hashed = false
  }

  /** Retrieve the hash value.
   *
   *  The finalized value is computed on first request and cached until the
   *  next call to `apply`, `append` or `reset`.
   *
   *  @return the finalized hash of everything incorporated so far
   */
  def hash: Int = {
    if (!hashed) {
      hashvalue = finalizeHash(h)
      hashed = true
    }
    hashvalue
  }

  /** Same as `hash`. */
  override def hashCode: Int = hash
}
| 73 | + |
/** Building blocks for a well-distributed, non-cryptographic 32 bit hash
 *  over a sequence of integers.
 *
 *  Along with the integers to hash, the algorithm consumes two streams of
 *  magic integers, which increases the distribution of repetitive input
 *  sequences. A caller driving the hash by hand therefore calls three
 *  methods at each step: `startHash`, `startMagicA` and `startMagicB` to
 *  begin, then `extendHash`, `nextMagicA` and `nextMagicB` for every value
 *  that is incorporated. Only `finalizeHash` needs to be called at the end.
 */
object MurmurHash {
  // Magic values used for MurmurHash's 32 bit hash.
  // Don't change these without consulting a hashing expert!
  final private val visibleMagic = 0x971e137b
  final private val hiddenMagicA = 0x95543787
  final private val hiddenMagicB = 0x2ad7eb25
  final private val visibleMixer = 0x52dce729
  final private val hiddenMixerA = 0x7b7d159c
  final private val hiddenMixerB = 0x6bce6396
  final private val finalMixer1 = 0x85ebca6b
  final private val finalMixer2 = 0xc2b2ae35

  // Arbitrary values used for hashing certain classes
  final private val seedString = 0xf7ca7fd2
  final private val seedArray = 0x3c074a61

  /** The first 23 magic integers from the first stream are stored here */
  val storedMagicA: Array[Int] =
    Iterator.iterate(hiddenMagicA)(nextMagicA).take(23).toArray

  /** The first 23 magic integers from the second stream are stored here */
  val storedMagicB: Array[Int] =
    Iterator.iterate(hiddenMagicB)(nextMagicB).take(23).toArray

  /** Begin a new hash with a seed value.
   *
   *  @param seed the seed to start from
   *  @return     the initial (unfinalized) hash value
   */
  def startHash(seed: Int): Int = seed ^ visibleMagic

  /** The initial magic integer in the first stream. */
  def startMagicA: Int = hiddenMagicA

  /** The initial magic integer in the second stream. */
  def startMagicB: Int = hiddenMagicB

  /** Incorporates a new value into an existing hash.
   *
   *  @param   hash    the prior hash value
   *  @param   value   the new value to incorporate
   *  @param   magicA  a magic integer from the stream
   *  @param   magicB  a magic integer from a different stream
   *  @return          the updated hash value
   */
  def extendHash(hash: Int, value: Int, magicA: Int, magicB: Int): Int = {
    // `*` binds tighter than `^`: the scrambled value is fully mixed with
    // both magic integers before it is folded into the prior hash.
    (hash ^ rotl(value*magicA, 11)*magicB)*3 + visibleMixer
  }

  /** Given a magic integer from the first stream, compute the next */
  def nextMagicA(magicA: Int): Int = magicA*5 + hiddenMixerA

  /** Given a magic integer from the second stream, compute the next */
  def nextMagicB(magicB: Int): Int = magicB*5 + hiddenMixerB

  /** Once all hashes have been incorporated, this performs a final mixing.
   *
   *  @param hash the accumulated (unfinalized) hash value
   *  @return     the finalized hash
   */
  def finalizeHash(hash: Int): Int = {
    // Alternating xor-shift and multiply rounds.
    val round1 = (hash ^ (hash >>> 16)) * finalMixer1
    val round2 = (round1 ^ (round1 >>> 13)) * finalMixer2
    round2 ^ (round2 >>> 16)
  }

  /** Compute a high-quality hash of an array.
   *
   *  The array length is folded into the seed and the `##` of each element
   *  is incorporated in index order.
   *
   *  @param a the array to hash
   *  @return  the finalized hash
   */
  def arrayHash[T](a: Array[T]): Int = {
    var h = startHash(a.length * seedArray)
    var c = hiddenMagicA
    var k = hiddenMagicB
    var j = 0
    while (j < a.length) {
      h = extendHash(h, a(j).##, c, k)
      c = nextMagicA(c)
      k = nextMagicB(k)
      j += 1
    }
    finalizeHash(h)
  }

  /** Compute a high-quality hash of a string.
   *
   *  The string length is folded into the seed. Characters are consumed two
   *  at a time, packed into a single `Int`; a trailing unpaired character is
   *  incorporated on its own.
   *
   *  @param s the string to hash
   *  @return  the finalized hash
   */
  def stringHash(s: String): Int = {
    var h = startHash(s.length * seedString)
    var c = hiddenMagicA
    var k = hiddenMagicB
    var j = 0
    while (j+1 < s.length) {
      // First char of the pair in the high 16 bits, second in the low 16.
      val i = (s.charAt(j)<<16) + s.charAt(j+1)
      h = extendHash(h, i, c, k)
      c = nextMagicA(c)
      k = nextMagicB(k)
      j += 2
    }
    // Odd length: one character is left over after the pairs.
    if (j < s.length) h = extendHash(h, s.charAt(j), c, k)
    finalizeHash(h)
  }

  /** Compute a hash that is symmetric in its arguments--that is,
   *  where the order of appearance of elements does not matter.
   *  This is useful for hashing sets, for example.
   *
   *  @param xs   the elements to hash
   *  @param seed the seed, which is multiplied by the number of elements
   *  @return     the finalized hash
   */
  def symmetricHash[T](xs: collection.TraversableOnce[T], seed: Int): Int = {
    // Three order-independent summaries of the element hashes, plus a count.
    var sum = 0
    var xor = 0
    var product = 1
    var count = 0
    xs.foreach { x =>
      val elemHash = x.##
      sum += elemHash
      xor ^= elemHash
      // Skip zero so that a single zero hash does not wipe out the product.
      if (elemHash != 0) product *= elemHash
      count += 1
    }
    var h = startHash(seed * count)
    h = extendHash(h, sum, storedMagicA(0), storedMagicB(0))
    h = extendHash(h, xor, storedMagicA(1), storedMagicB(1))
    h = extendHash(h, product, storedMagicA(2), storedMagicB(2))
    finalizeHash(h)
  }
}
0 commit comments