MurmurHash3

Algorithm created by Austin Appleby; Java port authored by Yonik Seeley

package util.hash;

/**

 *  The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.

 *  This java port was authored by Yonik Seeley and also placed into the public domain.

 *  The author hereby disclaims copyright to this source code.

 *  <p>

 *  This produces exactly the same hash values as the final C++

 *  version of MurmurHash3 and is thus suitable for producing the same hash values across

 *  platforms.

 *  <p>

 *  The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.

 *  murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash.

 *  <p>

 *  Note - The x86 and x64 versions do _not_ produce the same results, as the

 *  algorithms are optimized for their respective platforms.

 *  <p>

 *  See http://github.com/yonik/java_util for future updates to this file.

 */

public final class MurmurHash3 {

  /** 128 bits of state */

  public static final class LongPair {

    public long val1;

    public long val2;

  }

  public static final int fmix32(int h) {

    h ^= h >>> 16;

    h *= 0x85ebca6b;

    h ^= h >>> 13;

    h *= 0xc2b2ae35;

    h ^= h >>> 16;

    return h;

  }

  public static final long fmix64(long k) {

    k ^= k >>> 33;

    k *= 0xff51afd7ed558ccdL;

    k ^= k >>> 33;

    k *= 0xc4ceb9fe1a85ec53L;

    k ^= k >>> 33;

    return k;

  }

  /** Gets a long from a byte buffer in little endian byte order. */

  public static final long getLongLittleEndian(byte[] buf, int offset) {

    return     ((long)buf[offset+7]    << 56)   // no mask needed

            | ((buf[offset+6] & 0xffL) << 48)

            | ((buf[offset+5] & 0xffL) << 40)

            | ((buf[offset+4] & 0xffL) << 32)

            | ((buf[offset+3] & 0xffL) << 24)

            | ((buf[offset+2] & 0xffL) << 16)

            | ((buf[offset+1] & 0xffL) << 8)

            | ((buf[offset  ] & 0xffL));        // no shift needed

  }

  /** Returns the MurmurHash3_x86_32 hash. */

  public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {

    final int c1 = 0xcc9e2d51;

    final int c2 = 0x1b873593;

    int h1 = seed;

    int roundedEnd = offset + (len & 0xfffffffc);  // round down to 4 byte block

    for (int i=offset; i<roundedEnd; i+=4) {

      // little endian load order

      int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);

      k1 *= c1;

      k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);

      k1 *= c2;

      h1 ^= k1;

      h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);

      h1 = h1*5+0xe6546b64;

    }

    // tail

    int k1 = 0;

    switch(len & 0x03) {

      case 3:

        k1 = (data[roundedEnd + 2] & 0xff) << 16;

        // fallthrough

      case 2:

        k1 |= (data[roundedEnd + 1] & 0xff) << 8;

        // fallthrough

      case 1:

        k1 |= (data[roundedEnd] & 0xff);

        k1 *= c1;

        k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);

        k1 *= c2;

        h1 ^= k1;

    }

    // finalization

    h1 ^= len;

    // fmix(h1);

    h1 ^= h1 >>> 16;

    h1 *= 0x85ebca6b;

    h1 ^= h1 >>> 13;

    h1 *= 0xc2b2ae35;

    h1 ^= h1 >>> 16;

    return h1;

  }

  /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding

   * the string to a temporary buffer.  This is more than 2x faster than hashing the result

   * of String.getBytes().

   */

  public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {

    final int c1 = 0xcc9e2d51;

    final int c2 = 0x1b873593;

    int h1 = seed;

    int pos = offset;

    int end = offset + len;

    int k1 = 0;

    int k2 = 0;

    int shift = 0;

    int bits = 0;

    int nBytes = 0;   // length in UTF8 bytes

    while (pos < end) {

      int code = data.charAt(pos++);

      if (code < 0x80) {

        k2 = code;

        bits = 8;

        /***

        // optimized ascii implementation (currently slower!!! code size?)

        if (shift == 24) {

          k1 = k1 | (code << 24);

          k1 *= c1;

          k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);

          k1 *= c2;

          h1 ^= k1;

          h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);

          h1 = h1*5+0xe6546b64;

          shift = 0;

          nBytes += 4;

          k1 = 0;

        } else {

          k1 |= code << shift;

          shift += 8;

        }

        continue;

       ***/

      }

      else if (code < 0x800) {

        k2 = (0xC0 | (code >> 6))

                | ((0x80 | (code & 0x3F)) << 8);

        bits = 16;

      }

      else if (code < 0xD800 || code > 0xDFFF || pos>=end) {

        // we check for pos>=end to encode an unpaired surrogate as 3 bytes.

        k2 = (0xE0 | (code >> 12))

                | ((0x80 | ((code >> 6) & 0x3F)) << 8)

                | ((0x80 | (code & 0x3F)) << 16);

        bits = 24;

      } else {

        // surrogate pair

        // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;

        int utf32 = (int) data.charAt(pos++);

        utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);

        k2 = (0xff & (0xF0 | (utf32 >> 18)))

             | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8

             | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16

             |  (0x80 | (utf32 & 0x3F)) << 24;

        bits = 32;

      }

      k1 |= k2 << shift;

      // int used_bits = 32 - shift;  // how many bits of k2 were used in k1.

      // int unused_bits = bits - used_bits; //  (bits-(32-shift)) == bits+shift-32  == bits-newshift

      shift += bits;

      if (shift >= 32) {

        // mix after we have a complete word

        k1 *= c1;

        k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);

        k1 *= c2;

        h1 ^= k1;

        h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);

        h1 = h1*5+0xe6546b64;

        shift -= 32;

        // unfortunately, java won't let you shift 32 bits off, so we need to check for 0

        if (shift != 0) {

          k1 = k2 >>> (bits-shift);   // bits used == bits - newshift

        } else {

          k1 = 0;

        }

        nBytes += 4;

      }

    } // inner

    // handle tail

    if (shift > 0) {

      nBytes += shift >> 3;

      k1 *= c1;

      k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);

      k1 *= c2;

      h1 ^= k1;

    }

    // finalization

    h1 ^= nBytes;

    // fmix(h1);

    h1 ^= h1 >>> 16;

    h1 *= 0x85ebca6b;

    h1 ^= h1 >>> 13;

    h1 *= 0xc2b2ae35;

    h1 ^= h1 >>> 16;

    return h1;

  }

  /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */

  public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {

    // The original algorithm does have a 32 bit unsigned seed.

    // We have to mask to match the behavior of the unsigned types and prevent sign extension.

    long h1 = seed & 0x00000000FFFFFFFFL;

    long h2 = seed & 0x00000000FFFFFFFFL;

    final long c1 = 0x87c37b91114253d5L;

    final long c2 = 0x4cf5ad432745937fL;

    int roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block

    for (int i=offset; i<roundedEnd; i+=16) {

        long k1 = getLongLittleEndian(key, i);

        long k2 = getLongLittleEndian(key, i+8);

        k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;

        h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

        k2 *= c2; k2  = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2;

        h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;

    }

    long k1 = 0;

    long k2 = 0;

    switch (len & 15) {

      case 15: k2  = (key[roundedEnd+14] & 0xffL) << 48;

      case 14: k2 |= (key[roundedEnd+13] & 0xffL) << 40;

      case 13: k2 |= (key[roundedEnd+12] & 0xffL) << 32;

      case 12: k2 |= (key[roundedEnd+11] & 0xffL) << 24;

      case 11: k2 |= (key[roundedEnd+10] & 0xffL) << 16;

      case 10: k2 |= (key[roundedEnd+ 9] & 0xffL) << 8;

      case  9: k2 |= (key[roundedEnd+ 8] & 0xffL);

        k2 *= c2; k2  = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2;

      case  8: k1  = ((long)key[roundedEnd+7]) << 56;

      case  7: k1 |= (key[roundedEnd+6] & 0xffL) << 48;

      case  6: k1 |= (key[roundedEnd+5] & 0xffL) << 40;

      case  5: k1 |= (key[roundedEnd+4] & 0xffL) << 32;

      case  4: k1 |= (key[roundedEnd+3] & 0xffL) << 24;

      case  3: k1 |= (key[roundedEnd+2] & 0xffL) << 16;

      case  2: k1 |= (key[roundedEnd+1] & 0xffL) << 8;

      case  1: k1 |= (key[roundedEnd  ] & 0xffL);

        k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;

    }

    //----------

    // finalization

    h1 ^= len; h2 ^= len;

    h1 += h2;

    h2 += h1;

    h1 = fmix64(h1);

    h2 = fmix64(h2);

    h1 += h2;

    h2 += h1;

    out.val1 = h1;

    out.val2 = h2;

  }

}

MurMurHash3的更多相关文章

Metadata Lock原理8
http://www.kancloud.cn/taobaomysql/monthly/67141 MySQL· 5.7优化·Metadata Lock子系统的优化背景引入MDL锁的目的,最初是为了 ...
剖析Elasticsearch集群系列第一篇 Elasticsearch的存储模型和读写操作
剖析Elasticsearch集群系列涵盖了当今最流行的分布式搜索引擎Elasticsearch的底层架构和原型实例. 本文是这个系列的第一篇,在本文中,我们将讨论的Elasticsearch的底层存 ...
ElasticSearch入门（2） —— 基础概念
在Elasticsearch中,文档归属于一种类型(type),而这些类型存在于索引(index)中,我们可以画一些简单的对比图来类比传统关系型数据库: Relational DB -> Dat ...
MinHash 原理
最小哈希原理介绍 MinHash是基于Jaccard Index相似度(海量数据不可行)的算法,一种降维的方法A,B 两个集合:A = {s1, s3, s6, s8, s9} B = {s3, s ...
Shodan的http.favicon.hash语法详解与使用技巧
在Shodan搜索中有一个关于网站icon图标的搜索语法,http.favicon.hash,我们可以使用这个语法来搜索出使用了同一icon图标的网站,不知道怎么用的朋友请参考我上一篇文章. 通过上一 ...
Java Bloom filter几种实现比较
英文原始出处: Bloom filter for Scala, the fastest for JVM 本文介绍的是用Scala实现的Bloom filter. 源代码在github上.依照性能测试结 ...
redis 系列6 数据结构之字典(下)
一.概述接着上篇继续,这篇把数据结构之字典学习完, 这篇知识点包括:哈希算法,解决键冲突, rehash , 渐进式rehash,字典API. 1.1 哈希算法当一个新的键值对需要添加到字典里面 ...
【转】解决Maxwell发送Kafka消息数据倾斜问题
最近用Maxwell解析MySQL的Binlog,发送到Kafka进行处理,测试的时候发现一个问题,就是Kafka的Offset严重倾斜,三个partition,其中一个的offset已经快200万了 ...
大数据量下的集合过滤—Bloom Filter
算法背景如果想判断一个元素是不是在一个集合里,一般想到的是将集合中所有元素保存起来,然后通过比较确定.链表.树.散列表(又叫哈希表,Hash table)等等数据结构都是这种思路,存储位置要么是磁盘 ...

随机推荐

上传预览 easyui部分控件获取focuse 表单验证
js: $(document).ready(function () { //$('#creater').combobox({ // url: '/VMS.UI/BindData/ScheamData? ...
lo dash api
https://lodash.com/docs 用 Lo-Dash 替换 underscore http://segmentfault.com/a/1190000000359484
PariticalFilter在MFC上的运行，源代码公开
由于项目需要,进行过一段时间的 PariticalFilter 研究.主要的工作就是将网络上的Console代码和Mfc融合在一起,并且添加了Mfc端的控制功能. 程序还有不完善的地方,现 ...
如何：使用 Visual Basic 编写基于 Unity3D 的计算器
随着 .NET 全平台战略的推进,微软正在让以 C# 为先锋的 .NET 拥有跨平台特性.这个过程中一直有人想知道其它 .NET 语言对跨平台的支持有什么改进,熟悉 C# 但是喜欢用 VB 的我也不例 ...
SQLSERVER 中实现类似Mysql的 INSERT ON DUPLICATE KEY UPDATE
通过SQLServer创建索引时,有一个IGNORE_DUP_KEY的选项,可以类似实现. IGNORE_DUP_KEY = { ON | OFF } 指定对唯一聚集索引或唯一非聚集索引执行多行插入操 ...
How To Ask Questions The Smart Way 转
先查后问多思考莫做伸手党. 原文链接译文链接
神秘的ApplicationPoolIdentity再也不用妈妈担心程序池安全了
在IIS 7和IIS 7.5中,我们可以为应用程序池设置一个特殊的Identity(用户标识):ApplicationPoolIdentity. 那么这个标识到底是什么意思?它是具体什么身份呢?这一讲 ...
TCP/IP 协议族的简介
TCP/IP重要的特性就是分层.TCP/IP 按照层次分为四层:应用层.传输层.网络层.数据链路层.分层的好处就是当某些地方需要改变的时候,只需要将改变的层替换掉即可,而不用去把整体做替换.各层之间的 ...
DOM范围
前面的话为了让开发人员更方便地控制页面,DOM定义了“范围”(range)接口.通过范围可以选择文档中的一个区域,而不必考虑节点的界限(选择在后台完成,对用户是不可见的).在常规的DOM操作不能更有 ...
游戏UI框架设计(二) : 最简版本设计
游戏UI框架设计(二) --最简版本设计为降低难度决定先讲解一个最简版本,阐述UI框架的核心设计理念.这里先定义三个核心功能: 1:UI窗体的自动加载功能. 2:缓存UI窗体. 3:窗体生命周期(状 ...

MurMurHash3

MurMurHash3的更多相关文章

随机推荐

热门专题