JDK1.8 HashMap 源码分析

一、概述

HashMap以键值对的形式存储数据，是基于Map接口的实现，允许null键和null值，不保证顺序（例如不保证与插入顺序一致），内部存储的是Node(hash, key, value, next)对象（Node实现了Map.Entry接口）。

二、示例

/**
 * Demo entry point: fills a HashMap with five city/number pairs and prints
 * every mapping as "key: value" in the map's own iteration order.
 */
public static void main(String[] args){

    Map<String, Integer> cityNumbers = new HashMap<>();

    // Insertion order matters for the debugging walkthrough that follows; keep it unchanged.

    cityNumbers.put("上海", 1);

    cityNumbers.put("北京", 2);

    cityNumbers.put("广州", 3);

    cityNumbers.put("天津", 4);

    cityNumbers.put("重庆", 5);

    cityNumbers.forEach((city, number) -> System.out.println(city + ": " + number));

}

IntelliJ IDEA 调试，通过Variables我们能看到这样的储存方式：

三、HashMap存储的数据结构

3.1 数据结构

通过示例调试可以总结出HashMap示例存储的数据结构：

3.2 数据结构核心代码

3.2.1 table

transient Node<K,V>[] table;

3.2.2 Node

Node是HashMap的一个静态内部类，是单向链表的节点实现（通过next指向同一个桶中的下一个节点）。

/**
 * Basic bin node: one key/value entry that also links to the next node,
 * so the nodes of one bucket form a singly linked list.
 */
static class Node<K,V> implements Map.Entry<K,V> {

    final int hash;    // hash stored with the entry; used to locate the bucket index in the table

    final K key;

    V value;

    Node<K,V> next;    // next node in the linked list

    Node(int hash, K key, V value, Node<K,V> next) {

        this.hash = hash;

        this.key = key;

        this.value = value;

        this.next = next;

    }

    public final K getKey() { return key; }

    public final V getValue() { return value; }

    public final String toString() { return key + "=" + value; }

    // Entry hash code: XOR of the key's and the value's hash codes (null-safe via Objects.hashCode).
    public final int hashCode() {

        return Objects.hashCode(key) ^ Objects.hashCode(value);

    }

    // Replaces the value and returns the previous one.
    public final V setValue(V newValue) {

        V oldValue = value;

        value = newValue;

        return oldValue;

    }

    // Equal when o is this node, or another Map.Entry whose key and value are both equal to ours.
    public final boolean equals(Object o) {

        if (o == this)

            return true;

        if (o instanceof Map.Entry) {

            Map.Entry<?,?> e = (Map.Entry<?,?>)o;

            if (Objects.equals(key, e.getKey()) &&  Objects.equals(value, e.getValue()))

                return true;

        }

        return false;

    }

}

3.2.3 TreeNode 红黑树

static final class TreeNode<K,V> extends LinkedHashMap.Entry<K,V> {

    TreeNode<K,V> parent;  // red-black tree links

    TreeNode<K,V> left;

    TreeNode<K,V> right;

    TreeNode<K,V> prev;    // needed to unlink next upon deletion

    boolean red;

    TreeNode(int hash, K key, V val, Node<K,V> next) {

        super(hash, key, val, next);

    }

    //返回当前节点的根节点

    final TreeNode<K,V> root() {

        for (TreeNode<K,V> r = this, p;;) {

            if ((p = r.parent) == null)

                return r;

            r = p;

        }

    }

    以下省略... ...

}

四、HashMap主要属性

// Default initial capacity is 16 (1 << 4); must be a power of two.

static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;  

// Maximum capacity is 2^30.

static final int MAXIMUM_CAPACITY = 1 << 30;  

// Default load factor 0.75. The table is resized once size exceeds capacity * load factor (see putVal: ++size > threshold).

static final float DEFAULT_LOAD_FACTOR = 0.75f;  

// putVal calls treeifyBin when a node is appended to a bin whose list already holds this many nodes
// (the check is binCount >= TREEIFY_THRESHOLD - 1), i.e. when the list grows beyond 8 nodes.

static final int TREEIFY_THRESHOLD = 8;  

// Bin size at which a tree bin is turned back into a linked list.
// NOTE(review): the code using this constant (TreeNode.split) is not shown in this article — confirm the exact comparison there.

static final int UNTREEIFY_THRESHOLD = 6;  

// Minimum table length (number of buckets, not number of key-value pairs) required before a bin is treeified.
// treeifyBin resizes the table instead while tab.length is below this value, which avoids unnecessary
// conversions while the table is still small and several entries happen to land in the same bin.

static final int MIN_TREEIFY_CAPACITY = 64;  // should be at least 4 * TREEIFY_THRESHOLD

// Next size at which to resize: putVal resizes when size > threshold. threshold = capacity * load factor.
// Before the table is allocated, resize() reads the initial capacity from this field (0 means use the defaults).

int threshold;

// Load factor of this map; resize() multiplies it with the capacity to compute threshold.

final float loadFactor;

// Number of structural modifications (incremented when a node is added in putVal or removed in removeNode).

transient int modCount;

五、HashMap的部分源码分析

在看到3.1的图时，可能会有疑问，广州为什么放到上海的链表中，带着问题我们往下看。

5.1 put实现

/**
 * Associates value with key. Delegates to putVal with onlyIfAbsent=false
 * (an existing value is overwritten) and evict=true.
 *
 * @return the value putVal returns: the previous value for key, or null if there was none
 */
public V put(K key, V value) {

    return putVal(hash(key), key, value, false, true);

}

/**
 * Inserts or updates a mapping.
 *
 * @param hash         hash for key (put passes hash(key))
 * @param key          the key
 * @param value        the value to store
 * @param onlyIfAbsent if true, an existing non-null value is left unchanged
 * @param evict        passed through to afterNodeInsertion
 * @return the previous value of an existing mapping, or null if a new node was added
 */
final V putVal(int hash, K key, V value, boolean onlyIfAbsent, boolean evict) {

    Node<K,V>[] tab; Node<K,V> p; int n, i;

    // Table not allocated yet (null or zero length): create it via resize()

    if ((tab = table) == null || (n = tab.length) == 0)

        n = (tab = resize()).length;

    // Compute the bucket index; if the bucket is empty, place a new node there directly

    if ((p = tab[i = (n - 1) & hash]) == null)

        tab[i] = newNode(hash, key, value, null);

    else {

        Node<K,V> e; K k;

        // The first node of the bucket has the same key: remember it, its value is overwritten below

        if (p.hash == hash && ((k = p.key) == key || (key != null && key.equals(k))))

            e = p;

        // The bucket holds a red-black tree: delegate to the tree insertion

        else if (p instanceof TreeNode)

            e = ((TreeNode<K,V>)p).putTreeVal(this, tab, hash, key, value);

        // The bucket holds a linked list: walk it

        else {

            for (int binCount = 0; ; ++binCount) {

                if ((e = p.next) == null) {

                    // Reached the tail without finding the key: append a new node

                    p.next = newNode(hash, key, value, null);

                    // The list now has more than TREEIFY_THRESHOLD nodes: try to convert the bin to a tree

                    if (binCount >= TREEIFY_THRESHOLD - 1) // -1 for 1st

                        treeifyBin(tab, hash);

                    break;

                }

                // Key already present in the list: stop here, its value is overwritten below

                if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k))))

                    break;

                p = e;

            }

        }

        // Update the value of the existing mapping and return the old one

        if (e != null) { // existing mapping for key

            V oldValue = e.value;

            if (!onlyIfAbsent || oldValue == null)

                e.value = value;

            afterNodeAccess(e);

            return oldValue;

        }

    }

    // Reached only when a new node was added: record the structural modification

    ++modCount;

    // Grow the table once size exceeds threshold

    if (++size > threshold)

        resize();

    afterNodeInsertion(evict);

    return null;

}

/**
 * Converts the linked list in the bucket for the given hash into tree nodes,
 * unless the table is still smaller than MIN_TREEIFY_CAPACITY, in which case
 * the table is resized instead.
 *
 * @param tab  the table
 * @param hash hash identifying the bucket to convert
 */
final void treeifyBin(Node<K,V>[] tab, int hash) {

    int n, index; Node<K,V> e;

    // Table is null or shorter than MIN_TREEIFY_CAPACITY: resize instead of treeifying

    if (tab == null || (n = tab.length) < MIN_TREEIFY_CAPACITY)

        resize();

    // The bucket for this hash is non-empty: convert its nodes

    else if ((e = tab[index = (n - 1) & hash]) != null) {

        TreeNode<K,V> hd = null, tl = null;

        do {

            // Obtain a TreeNode for each list node and chain them through prev/next in the original order
            // (hd = head, tl = tail). replacementTreeNode is not shown in this article.

            TreeNode<K,V> p = replacementTreeNode(e, null);

            if (tl == null)

                hd = p;

            else {

                p.prev = tl;

                tl.next = p;

            }

            tl = p;

        } while ((e = e.next) != null);

        if ((tab[index] = hd) != null)

            // The bucket now points at the TreeNode chain; treeify (not shown here) is then invoked on its head

            hd.treeify(tab);

    }

}

这里重点说两点：

索引的计算：

在计算索引时，这个值必须落在[0, length)这个左闭右开的区间中。基于这个条件，比如默认的table长度为16，代入公式 (n - 1) & hash，结果必然在[0, 16)即0~15的范围内。这里还有个小技巧：在容量一定是2^n的情况下，对于非负的h有 h & (length - 1) == h % length（h为负数时%的结果为负，而位与的结果始终非负）。这里之所以使用位运算，我想也是因为位运算的效率要高过%运算。
转化红黑树：

在putVal方法中，当遍历到链表尾部并追加新节点后，若binCount >= TREEIFY_THRESHOLD - 1（即追加后链表长度超过TREEIFY_THRESHOLD），就会调用treeifyBin()。但这只是初步判断，treeifyBin()中还会进行二次校验：当tab.length < MIN_TREEIFY_CAPACITY时只进行resize而不转化；只有table长度 >= MIN_TREEIFY_CAPACITY时，该桶才会真正转化为红黑树。

5.2 hash实现

/**
 * Computes the hash used for bucket selection: 0 for a null key, otherwise
 * key.hashCode() XORed with its own upper 16 bits shifted down, so the high
 * bits also influence the low bits that the index mask (n - 1) & hash keeps.
 *
 * @param key the key, may be null
 * @return the spread hash value
 */
static final int hash(Object key) {

    int h;

    return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16);

}

这个函数大概的作用就是：高16bit不变，低16bit和高16bit做了一个异或。根据注释及个人理解，这样做的原因是：Java中对象的哈希值都是32位整数，而下标计算 (n - 1) & hash 在table长度较小时只会用到低位；把高16位异或到低16位，能让高位也参与到下标计算中，即使在table长度比较小的情况下，也能尽可能地减少碰撞。

举例：

通过以上计算，也正好证明，为什么广州会成为上海的next节点。

5.3 resize实现

/**
 * Allocates the table or doubles its capacity, then moves the existing nodes over.
 *
 * When no table exists yet, the capacity comes from threshold (if positive) or from
 * the defaults. Otherwise the capacity is doubled unless it already reached
 * MAXIMUM_CAPACITY. Since the new capacity is twice the old one, every node of old
 * bucket j is placed either at index j or at index j + oldCap of the new table.
 *
 * @return the new table, or the old table when it is already at maximum capacity
 */
final Node<K,V>[] resize() {

    Node<K,V>[] oldTab = table;

    int oldCap = (oldTab == null) ? 0 : oldTab.length;  // length of the current table (0 if none exists yet)

    int oldThr = threshold;  // current resize threshold

    int newCap, newThr = 0;

    if (oldCap > 0) {

        // Already at maximum capacity: stop growing, only raise the threshold

        if (oldCap >= MAXIMUM_CAPACITY) {

            threshold = Integer.MAX_VALUE;

            return oldTab;

        }

        // Otherwise double the capacity (and also the threshold when both conditions below hold)

        else if ((newCap = oldCap << 1) < MAXIMUM_CAPACITY && oldCap >= DEFAULT_INITIAL_CAPACITY)

            newThr = oldThr << 1; // double threshold

    }

    else if (oldThr > 0) // initial capacity was placed in threshold

        newCap = oldThr;

    else {               // zero initial threshold signifies using defaults

        newCap = DEFAULT_INITIAL_CAPACITY;

        newThr = (int)(DEFAULT_LOAD_FACTOR * DEFAULT_INITIAL_CAPACITY);

    }

    // Threshold not determined above: compute it from the new capacity and the load factor

    if (newThr == 0) {

        float ft = (float)newCap * loadFactor;

        newThr = (newCap < MAXIMUM_CAPACITY && ft < (float)MAXIMUM_CAPACITY ?

                (int)ft : Integer.MAX_VALUE);

    }

    threshold = newThr;

    @SuppressWarnings({"rawtypes","unchecked"})

    Node<K,V>[] newTab = (Node<K,V>[])new Node[newCap];

    table = newTab;

    // Walk every bucket of the old table and move its nodes into the new table

    if (oldTab != null) {

        for (int j = 0; j < oldCap; ++j) {

            Node<K,V> e;

            if ((e = oldTab[j]) != null) {

                oldTab[j] = null;  // clear the old slot so the old table no longer references the node

               // Bucket with a single node

               if (e.next == null)

                    newTab[e.hash & (newCap - 1)] = e;  // index recomputed against the new capacity

                // Tree bin: handled by TreeNode.split

                else if (e instanceof TreeNode)

                    ((TreeNode<K,V>)e).split(this, newTab, j, oldCap);

                else { // preserve order

                    // Linked list with more than one node

                    Node<K,V> loHead = null, loTail = null;

                    Node<K,V> hiHead = null, hiTail = null;

                    Node<K,V> next;

                    do {

                        next = e.next;

                        // The new table is twice as large: split the list into a "lo" list and a "hi" list

                        if ((e.hash & oldCap) == 0) {  // lo list; note the mask is oldCap, not oldCap-1

                            if (loTail == null)

                                loHead = e;

                            else

                                loTail.next = e;

                            loTail = e;

                        }

                        else {   // hi list

                            if (hiTail == null)

                                hiHead = e;

                            else

                                hiTail.next = e;

                            hiTail = e;

                        }

                    } while ((e = next) != null);

                    // The lo list keeps its original index in the new table

                    if (loTail != null) {

                        loTail.next = null;

                        newTab[j] = loHead;

                    }

                    // The hi list goes to index = original index + oldCap

                    if (hiTail != null) {

                        hiTail.next = null;

                        newTab[j + oldCap] = hiHead;

                    }

                }

            }

        }

    }

    return newTab;

}

从resize() 的实现中可以看出，在扩容时，针对table，如果桶的位置是单节点链表，那么index =（hash & (newTab.length - 1)），直接放入新表。红黑树另外处理。若是多节点链表，会拆分为高位和低位两条链表，即：hash & oldCap == 0 的节点进入低位链表，hash & oldCap == oldCap（即不为0）的节点进入高位链表。低位链表保持原索引放入新table中，高位链表的index = 原索引 + oldCap = hash & (newTab.length-1)。

为什么要分高低位链表？试想若是对每个节点都使用index =（hash & (newTab.length - 1)）重新计算并逐个插入新表，那么在index冲突的情况下，每次追加都要先定位到新桶中链表的尾部，带来额外的时间（寻址等）或空间（辅助参数、结构等）上的开销。由于新容量是旧容量的2倍，同一个旧桶中的节点在新表中只可能落在“原索引”或“原索引+oldCap”两个位置，因此先把链表拆成高、低位两条，再一次性挂到新table上，既保持了节点原有的相对顺序，效率也更好，是很好的优化技巧。

5.4 get实现

/**
 * Looks up the value mapped to key via getNode.
 *
 * @return the node's value, or null when no node is found (a found node may also hold a null value)
 */
public V get(Object key) {

    Node<K,V> e;

    return (e = getNode(hash(key), key)) == null ? null : e.value;

}

/**
 * Finds the node for the given hash and key.
 *
 * @param hash hash for key
 * @param key  the key
 * @return the matching node, or null if the table is empty or no node matches
 */
final Node<K,V> getNode(int hash, Object key) {

    Node<K,V>[] tab; Node<K,V> first, e; int n; K k;

    if ((tab = table) != null && (n = tab.length) > 0 && (first = tab[(n - 1) & hash]) != null) {

        // Direct hit on the first node of the bucket

        if (first.hash == hash && // always check first node

                    ((k = first.key) == key || (key != null && key.equals(k))))

            return first;

        // First node did not match: search the rest of the bucket, if any

        if ((e = first.next) != null) {

            // Tree bin: search the tree

            if (first instanceof TreeNode)

                return ((TreeNode<K,V>)first).getTreeNode(hash, key);

            // Linked list: walk the remaining nodes

            do {

                if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k))))

                    return e;

            } while ((e = e.next) != null);

        }

    }

    return null;

}

5.5 remove实现

/**
 * Removes the mapping for key, if present. Delegates to removeNode with
 * matchValue=false (the value is not compared) and movable=true.
 *
 * @return the value of the removed node, or null if no node was removed
 */
public V remove(Object key) {

    Node<K,V> e;

    return (e = removeNode(hash(key), key, null, false, true)) == null ? null : e.value;

}

/**
 * Finds the node for the given key and unlinks it from its bucket.
 *
 * @param hash       hash for key
 * @param key        the key
 * @param value      value to compare against when matchValue is true
 * @param matchValue if true, the node is removed only when its value equals value
 * @param movable    passed through to TreeNode.removeTreeNode for tree bins
 * @return the removed node, or null if nothing was removed
 */
final Node<K,V> removeNode(int hash, Object key, Object value, boolean matchValue, boolean movable) {

    Node<K,V>[] tab; Node<K,V> p; int n, index;

    if ((tab = table) != null && (n = tab.length) > 0 && (p = tab[index = (n - 1) & hash]) != null) {

        Node<K,V> node = null, e; K k; V v;

        // Direct hit on the first node of the bucket

        if (p.hash == hash && ((k = p.key) == key || (key != null && key.equals(k))))

            node = p;

        else if ((e = p.next) != null) {

            // Tree bin: search the tree

            if (p instanceof TreeNode)

                node = ((TreeNode<K,V>)p).getTreeNode(hash, key);

            else {

                // Linked list: walk it, keeping p as the predecessor of e

                do {

                    if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) {

                        node = e;

                        break;

                    }

                    p = e;

                } while ((e = e.next) != null);

            }

        }

        // A node was found (and its value matches, when matchValue is set): remove it

        if (node != null && (!matchValue || (v = node.value) == value || (value != null && value.equals(v)))) {

            if (node instanceof TreeNode)

                ((TreeNode<K,V>)node).removeTreeNode(this, tab, movable);

            else if (node == p)

                tab[index] = node.next;  // node is the bucket head: point the bucket at its successor

            else

                p.next = node.next;  // node is inside the list: p is its predecessor, bypass node

            ++modCount;

            --size;

            afterNodeRemoval(node);

            return node;

        }

    }

    return null;

}

5.6 containsKey实现

/**
 * Reports whether a node exists for key, using the same lookup (getNode) as get.
 *
 * @return true if getNode finds a node for key
 */
public boolean containsKey(Object key) {

    return getNode(hash(key), key) != null;

}

5.7 containsValue实现

/**
 * Reports whether at least one node holds the given value. Scans every bucket
 * and follows each node's next link, so the cost grows with table length plus
 * the number of nodes.
 *
 * @param value the value to look for, may be null
 * @return true if some node's value is the same reference as value or equals it
 */
public boolean containsValue(Object value) {

    Node<K,V>[] tab; V v;

    if ((tab = table) != null && size > 0) {

        // Iterate over all buckets of the table

        for (int i = 0; i < tab.length; ++i) {

            // Walk every node of the bucket through its next links

            for (Node<K,V> e = tab[i]; e != null; e = e.next) {

                if ((v = e.value) == value || (value != null && value.equals(v)))

                    return true;

            }

        }

    }

    return false;

}

六、总结

6.1 为什么需要负载因子？

加载因子存在的原因，还是因为要减缓哈希冲突，例如：默认初始桶为16，若等到满16个元素才扩容，某些桶里可能就会有多个元素了。所以加载因子默认为0.75，也就是说大小为16的HashMap，扩容临界值threshold=0.75*16=12，到了第13个元素，就会扩容成32。

6.2 加载因子减小？

在构造函数里，设定小一点的加载因子,比如0.5，甚至0.25。

若是一个长期存在的Map，并且key不固定，那可以适当加大初始大小，同时减小加载因子，降低冲突的概率，也能减少寻址的时间。用空间来换时间，这时也是值得的。

6.3 初始化时是否定义容量？

通过以上源码分析，每次扩容都需要重新创建桶数组，并对链表（或红黑树）中的数据进行拆分、迁移，所以扩容成本还是挺高的。若初始化时能设置准确或预估出需要的容量，即使大一点，用空间来换时间，有时也是值得的。

6.4 String型的Key设计优化？

如果无法保证key无冲突并且能直接用==比较出相等，那就尽量把key设计得短一些，因为String的equals需要逐个字符比较，字符串越长耗时越多。顺序型的Key，如：K1、K2、K3...K50，这种key的hashCode是连续递增的数字，冲突的可能性非常小。

// Prints the hash codes of the sequential keys "K0", "K1", ..., "K99" to show
// that consecutive keys produce consecutive hash codes.
for(int i=0;i<100;i++){

    // The original snippet used key without declaring it; build it from the loop index.

    String key = "K" + i;

    System.out.println(key+".hashCode="+key.hashCode());

}

结果：

K0.hashCode = 2373

K1.hashCode = 2374

K2.hashCode = 2375

K3.hashCode = 2376

K4.hashCode = 2377

... ...