
字符串(string)、散列(hash)、列表(list)、集合(set)、排序集合(sorted set)、位图(bitmaps)、地理空间索引(Geospatial indexes)、流(streams)


key-value是redis中最基础的结构,key-value是采用哈希表(hash table)这种基础的数据结构来实现的,其中key是字符串类型,而value则会有上面说的各种数据类型。



  1. 哈希冲突
  2. 扩容搬迁


  1. // 来源:
  2. //
  3. //
  /* One key/value node stored in a hash bucket. Entries that fall into the
   * same bucket are chained through `next`. */
  4. typedef struct dictEntry {
  5. void *key; /* opaque key pointer */
  6. union {
  7. void *val;
  8. uint64_t u64;
  9. int64_t s64;
  10. double d;
  11. } v; /* the value: either a pointer or an inline 64-bit integer / double */
  12. struct dictEntry *next; /* next entry in the same bucket chain */
  13. } dictEntry;
  14. // Dictionary type descriptor: the callback table referenced by dict.type
  15. typedef struct dictType {
  16. uint64_t (*hashFunction)(const void *key); /* maps a key to a 64-bit hash */
  17. void *(*keyDup)(dict *d, const void *key); /* NOTE(review): presumably copies a key on insert - not exercised in this excerpt */
  18. void *(*valDup)(dict *d, const void *obj); /* NOTE(review): presumably copies a value on insert - not exercised in this excerpt */
  19. int (*keyCompare)(dict *d, const void *key1, const void *key2); /* NOTE(review): presumably backs dictCompareKeys() - confirm */
  20. void (*keyDestructor)(dict *d, void *key);
  21. void (*valDestructor)(dict *d, void *obj);
  22. int (*expandAllowed)(size_t moreMem, double usedRatio);
  23. /* Allow a dictEntry to carry extra caller-defined metadata. The
  24. * extra memory is initialized to 0 when a dictEntry is allocated. */
  25. size_t (*dictEntryMetadataBytes)(dict *d);
  26. } dictType;
  27. /* This is our hash table structure. Every dictionary has two of this as we
  28. * implement incremental rehashing, for the old to the new table. */
  29. typedef struct dictht {
  30. dictEntry **table; /* bucket array; each slot heads a chain of entries */
  31. unsigned long size; /* number of buckets in table */
  32. unsigned long sizemask; /* ANDed with a key's hash to select the bucket index */
  33. unsigned long used; /* number of entries currently stored in this table */
  34. } dictht;
  35. // The dictionary (hash map) itself
  36. typedef struct dict {
  37. dictType *type; /* callback table for this dictionary */
  38. void *privdata;
  39. dictht ht[2]; /* ht[0] is the table in use; ht[1] is the destination while rehashing */
  40. // -1 means no rehash is running
  41. long rehashidx; /* rehashing not in progress if rehashidx == -1 */
  42. unsigned long iterators; /* number of iterators currently running */
  43. } dict;
  /* Performs up to n steps of incremental rehashing. One step moves every
   * entry of one non-empty ht[0] bucket into ht[1]. At most n*10 empty
   * buckets are skipped per call, after which the function returns early.
   * Returns 1 while entries remain in ht[0], and 0 when no rehash is in
   * progress or when the migration has just completed (ht[1] then replaces
   * ht[0] and rehashidx is reset to -1). */
  44. int dictRehash(dict *d, int n) {
  45. // Budget of empty buckets that may be skipped in this call
  46. int empty_visits = n*10; /* Max number of empty buckets to visit. */
  47. if (!dictIsRehashing(d)) return 0;
  48. while(n-- && d->ht[0].used != 0) {
  49. dictEntry *de, *nextde;
  50. /* Note that rehashidx can't overflow as we are sure there are more
  51. * elements because ht[0].used != 0 */
  52. assert(d->ht[0].size > (unsigned long)d->rehashidx);
  53. while(d->ht[0].table[d->rehashidx] == NULL) {
  54. d->rehashidx++;
  55. if (--empty_visits == 0) return 1;
  56. }
  57. de = d->ht[0].table[d->rehashidx];
  58. /* Move all the keys in this bucket from the old to the new hash HT */
  59. // Migrate the whole chain of the current bucket (slot)
  60. while(de) {
  61. uint64_t h;
  62. nextde = de->next; /* saved first: de->next is overwritten below */
  63. /* Get the index in the new hash table */
  64. h = dictHashKey(d, de->key) & d->ht[1].sizemask;
  65. de->next = d->ht[1].table[h]; /* push onto the head of the new bucket's chain */
  66. d->ht[1].table[h] = de;
  67. d->ht[0].used--;
  68. d->ht[1].used++;
  69. de = nextde;
  70. }
  71. d->ht[0].table[d->rehashidx] = NULL;
  72. d->rehashidx++;
  73. }
  74. /* Check if we already rehashed the whole table... */
  75. if (d->ht[0].used == 0) {
  76. zfree(d->ht[0].table);
  77. d->ht[0] = d->ht[1]; /* the new table becomes the table in use */
  78. _dictReset(&d->ht[1]);
  79. d->rehashidx = -1;
  80. return 0;
  81. }
  82. /* More to rehash... */
  83. return 1;
  84. }
  /* Performs a single rehash step (dictRehash with n == 1), but only when
   * no iterators are currently running on the dictionary. */
  85. static void _dictRehashStep(dict *d) {
  86. if (d->iterators == 0) dictRehash(d,1);
  87. }
  /* True while an incremental rehash is in progress, i.e. rehashidx != -1. */
  88. #define dictIsRehashing(d) ((d)->rehashidx != -1)
  89. /* Add or Overwrite:
  90. * Add an element, discarding the old value if the key already exists.
  91. * Return 1 if the key was added from scratch, 0 if there was already an
  92. * element with such key and dictReplace() just performed a value update
  93. * operation. */
  94. int dictReplace(dict *d, void *key, void *val)
  95. {
  96. dictEntry *entry, *existing, auxentry;
  97. /* Try to add the element. If the key
  98. * does not exists dictAdd will succeed. */
  99. entry = dictAddRaw(d,key,&existing);
  100. if (entry) {
  101. dictSetVal(d, entry, val);
  102. return 1;
  103. }
  104. /* Set the new value and free the old one. Note that it is important
  105. * to do that in this order, as the value may just be exactly the same
  106. * as the previous one. In this context, think to reference counting,
  107. * you want to increment (set), and then decrement (free), and not the
  108. * reverse. */
  109. auxentry = *existing; /* struct copy keeps the old value reachable after the overwrite */
  110. dictSetVal(d, existing, val);
  111. dictFreeVal(d, &auxentry); /* releases the old value through the copy */
  112. return 0;
  113. }
  114. /* Add an element to the target hash table. Returns DICT_OK on success, or DICT_ERR when dictAddRaw() returns NULL (e.g. the key already exists). */
  115. int dictAdd(dict *d, void *key, void *val)
  116. {
  117. dictEntry *entry = dictAddRaw(d,key,NULL);
  118. if (!entry) return DICT_ERR;
  119. dictSetVal(d, entry, val);
  120. return DICT_OK;
  121. }
  /* Low-level insert: allocates a new entry for key, links it at the head of
   * its bucket and sets the key, leaving the value for the caller to fill in.
   * Returns the new entry, or NULL when _dictKeyIndex() reports -1 (key
   * already present or expansion failed); in the "already present" case
   * *existing, when non-NULL, is set to the entry found. */
  122. dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
  123. {
  124. long index;
  125. dictEntry *entry;
  126. dictht *ht;
  127. // If a rehash is in progress, perform one incremental rehash step
  128. if (dictIsRehashing(d)) _dictRehashStep(d);
  129. /* Get the index of the new element, or -1 if
  130. * the element already exists. */
  131. // Look up the bucket index for the key
  132. if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
  133. return NULL;
  134. /* Allocate the memory and store the new entry.
  135. * Insert the element in top, with the assumption that in a database
  136. * system it is more likely that recently added entries are accessed
  137. * more frequently. */
  138. ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; /* while rehashing, new entries go to the new table */
  139. entry = zmalloc(sizeof(*entry));
  140. entry->next = ht->table[index];
  141. ht->table[index] = entry;
  142. ht->used++;
  143. /* Set the hash entry fields. */
  144. dictSetKey(d, entry, key);
  145. return entry;
  146. }
  /* Returns the bucket index where key (with precomputed hash) can be
   * inserted, or -1 if the key is already present or the table could not be
   * expanded. When the key is found and existing is non-NULL, *existing is
   * set to the matching entry. While rehashing, both tables are searched and
   * the returned index is the one computed for ht[1]. */
  147. static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
  148. {
  149. unsigned long idx, table;
  150. dictEntry *he;
  151. if (existing) *existing = NULL;
  152. /* Expand the hash table if needed */
  153. if (_dictExpandIfNeeded(d) == DICT_ERR)
  154. return -1;
  155. // Search both hash tables (the second one only while rehashing)
  156. for (table = 0; table <= 1; table++) {
  157. idx = hash & d->ht[table].sizemask;
  158. /* Search if this slot does not already contain the given key */
  159. he = d->ht[table].table[idx];
  160. while(he) {
  161. if (key==he->key || dictCompareKeys(d, key, he->key)) {
  162. if (existing) *existing = he;
  163. return -1;
  164. }
  165. he = he->next;
  166. }
  167. if (!dictIsRehashing(d)) break; /* ht[1] is only consulted during a rehash */
  168. }
  169. return idx;
  170. }
  171. /* This is the initial size of every hash table */
  172. #define DICT_HT_INITIAL_SIZE 4
  173. static int dict_can_resize = 1; /* global switch consulted by _dictExpandIfNeeded() */
  174. static unsigned int dict_force_resize_ratio = 5; /* used/size ratio above which expansion happens even if dict_can_resize is 0 */
  175. /* Expand the hash table if needed. Returns DICT_OK when nothing has to be done, otherwise the result of dictExpand(). */
  176. static int _dictExpandIfNeeded(dict *d)
  177. {
  178. /* Incremental rehashing already in progress. Return. */
  179. if (dictIsRehashing(d)) return DICT_OK;
  180. // The initial size is 4 (DICT_HT_INITIAL_SIZE)
  181. /* If the hash table is empty expand it to the initial size. */
  182. if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);
  183. /* If we reached the 1:1 ratio, and we are allowed to resize the hash
  184. * table (global setting) or we should avoid it but the ratio between
  185. * elements/buckets is over the "safe" threshold, we resize doubling
  186. * the number of buckets. */
  187. // Expansion condition: used >= size, and either dict_can_resize is 1 or the forced-resize ratio (5) is exceeded
  188. if (d->ht[0].used >= d->ht[0].size &&
  189. (dict_can_resize ||
  190. d->ht[0].used/d->ht[0].size > dict_force_resize_ratio)) /* integer division */
  191. {
  192. return dictExpand(d, d->ht[0].used*2);
  193. }
  194. return DICT_OK;
  195. }
  /* Looks up key and returns its entry, or NULL if the dictionary is empty
   * or the key is not present. Performs one incremental rehash step first
   * when a rehash is in progress, and in that case searches both tables. */
  196. dictEntry *dictFind(dict *d, const void *key)
  197. {
  198. dictEntry *he;
  199. uint64_t h, idx, table;
  200. if (d->ht[0].used + d->ht[1].used == 0) return NULL; /* dict is empty */
  201. if (dictIsRehashing(d)) _dictRehashStep(d);
  202. h = dictHashKey(d, key);
  203. for (table = 0; table <= 1; table++) {
  204. idx = h & d->ht[table].sizemask;
  205. he = d->ht[table].table[idx];
  206. while(he) {
  207. if (key==he->key || dictCompareKeys(d, key, he->key)) /* pointer identity is checked before the comparison callback */
  208. return he;
  209. he = he->next;
  210. }
  211. if (!dictIsRehashing(d)) return NULL; /* no second table to search */
  212. }
  213. return NULL;
  214. }

Redis dict的整体实现如下:





最后可能会考虑到一个问题是,如果Redis长时间没有读写操作,那么rehash岂不是永远不会完成,其实Redis也会有定时任务来执行rehash操作,在server.c中可以找到serverCron函数,这个函数按照特定的时钟周期被触发,默认的server.hz为10,从Redis 5.0开始可以根据客户端的负载自动调整时钟周期,在serverCron函数中会调用databasesCron函数,当不执行rdb和aof持久化的时候则会执行rehash,具体代码段参考:

  1. if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
  2. /* We use global counters so if we stop the computation at a given
  3. * DB we'll be able to start from the successive in the next
  4. * cron loop iteration. */
  5. static unsigned int resize_db = 0;
  6. static unsigned int rehash_db = 0;
  7. int dbs_per_call = CRON_DBS_PER_CALL;
  8. int j;
  9. /* Don't test more DBs than we have. */
  10. if (dbs_per_call > server.dbnum) dbs_per_call = server.dbnum;
  11. /* Resize */
  12. for (j = 0; j < dbs_per_call; j++) {
  13. tryResizeHashTables(resize_db % server.dbnum);
  14. resize_db++;
  15. }
  16. /* Rehash */
  17. if (server.activerehashing) {
  18. for (j = 0; j < dbs_per_call; j++) {
  19. int work_done = incrementallyRehash(rehash_db);
  20. if (work_done) {
  21. /* If the function did some work, stop here, we'll do
  22. * more at the next cron loop. */
  23. break;
  24. } else {
  25. /* If this db didn't need rehash, we'll try the next one. */
  26. rehash_db++;
  27. rehash_db %= server.dbnum;
  28. }
  29. }
  30. }
  31. }


  /* Spends about one millisecond rehashing database dbid: the keys
   * dictionary is tried first, then the expires dictionary. Returns 1 if
   * either dictionary was rehashing (work was done), 0 otherwise. */
  1. int incrementallyRehash(int dbid) {
  2. /* Keys dictionary */
  3. if (dictIsRehashing(server.db[dbid].dict)) {
  4. dictRehashMilliseconds(server.db[dbid].dict,1);
  5. return 1; /* already used our millisecond for this loop... */
  6. }
  7. /* Expires */
  8. if (dictIsRehashing(server.db[dbid].expires)) {
  9. dictRehashMilliseconds(server.db[dbid].expires,1);
  10. return 1; /* already used our millisecond for this loop... */
  11. }
  12. return 0;
  13. }


  1. /* Rehash for an amount of time between ms milliseconds and ms+1 milliseconds. Returns the number of rehash steps requested, counted in batches of 100. */
  2. int dictRehashMilliseconds(dict *d, int ms) {
  3. long long start = timeInMilliseconds();
  4. int rehashes = 0;
  5. while(dictRehash(d,100)) { /* loop ends when dictRehash() reports nothing left to do */
  6. rehashes += 100;
  7. if (timeInMilliseconds()-start > ms) break; /* time budget exhausted */
  8. }
  9. return rehashes;
  10. }




2.1 List类型


其中zlbytes是压缩列表的字节数,包括本身的4个字节,zltail是压缩列表最后1个entry的偏移量,本身占用4个字节,设置这个的好处是可以直接从尾部弹出元素而无需遍历压缩列表,然后zllen表示压缩列表中的entry数量,本身占用2个字节,也就是最多放2^16 - 2个条目,当超过这个条目时设置为2^16 - 1,这时候需要遍历才能知道多少个entry,正常是不会超过的,entry的组成下面说,最后是zlend表示ziplist的结尾,长度1个字节,固定为0xff



  1. /* Encode the length of the previous entry and write it to "p". This only
  2. * uses the larger encoding (required in __ziplistCascadeUpdate). */
  3. int zipStorePrevEntryLengthLarge(unsigned char *p, unsigned int len) {
  4. if (p != NULL) {
  5. p[0] = ZIP_BIG_PREVLEN; /* marker byte announcing the multi-byte form */
  6. memcpy(p+1,&len,sizeof(len));
  7. memrev32ifbe(p+1); /* NOTE(review): presumably byte-swaps on big-endian hosts - confirm */
  8. }
  9. return 1+sizeof(len); /* marker byte plus the raw length bytes; also returned when p is NULL */
  10. }
  11. /* Encode the length of the previous entry and write it to "p". Return the
  12. * number of bytes needed to encode this length if "p" is NULL. */
  13. unsigned int zipStorePrevEntryLength(unsigned char *p, unsigned int len) {
  14. if (p == NULL) {
  15. return (len < ZIP_BIG_PREVLEN) ? 1 : sizeof(len)+1; /* size query only, nothing is written */
  16. } else {
  17. if (len < ZIP_BIG_PREVLEN) {
  18. p[0] = len; /* short form: the length itself fits in one byte */
  19. return 1;
  20. } else {
  21. return zipStorePrevEntryLengthLarge(p,len);
  22. }
  23. }
  24. }



  /* Writes the encoding header of an entry to p, or only computes its size
   * when p is NULL. Returns the header length in bytes: for strings 1
   * (rawlen <= 0x3f), 2 (rawlen <= 0x3fff) or 5 (larger); for integer
   * encodings always 1. */
  1. unsigned int zipStoreEntryEncoding(unsigned char *p, unsigned char encoding, unsigned int rawlen) {
  2. unsigned char len = 1, buf[5];
  3. if (ZIP_IS_STR(encoding)) {
  4. /* Although encoding is given it may not be set for strings,
  5. * so we determine it here using the raw length. */
  6. if (rawlen <= 0x3f) {
  7. if (!p) return len;
  8. buf[0] = ZIP_STR_06B | rawlen; /* 6-bit length stored in the header byte itself */
  9. } else if (rawlen <= 0x3fff) {
  10. len += 1;
  11. if (!p) return len;
  12. buf[0] = ZIP_STR_14B | ((rawlen >> 8) & 0x3f); /* high 6 bits of the 14-bit length */
  13. buf[1] = rawlen & 0xff;
  14. } else {
  15. len += 4;
  16. if (!p) return len;
  17. buf[0] = ZIP_STR_32B;
  18. buf[1] = (rawlen >> 24) & 0xff; /* 32-bit length, most significant byte first */
  19. buf[2] = (rawlen >> 16) & 0xff;
  20. buf[3] = (rawlen >> 8) & 0xff;
  21. buf[4] = rawlen & 0xff;
  22. }
  23. } else {
  24. /* Implies integer encoding, so length is always 1. */
  25. if (!p) return len;
  26. buf[0] = encoding;
  27. }
  28. /* Store this length at p. */
  29. memcpy(p,buf,len);
  30. return len;
  31. }


  1. /* Different encoding/length possibilities */
  2. #define ZIP_STR_MASK 0xc0
  3. #define ZIP_INT_MASK 0x30
  4. #define ZIP_STR_06B (0 << 6) /* top two bits 00 */
  5. #define ZIP_STR_14B (1 << 6) /* top two bits 01 */
  6. #define ZIP_STR_32B (2 << 6) /* top two bits 10 */
  7. // Encodings used when the entry value is an integer (top two bits 11)
  8. #define ZIP_INT_16B (0xc0 | 0<<4)
  9. #define ZIP_INT_32B (0xc0 | 1<<4)
  10. #define ZIP_INT_64B (0xc0 | 2<<4)
  11. #define ZIP_INT_24B (0xc0 | 3<<4)
  12. #define ZIP_INT_8B 0xfe



  /* Byte-size ceiling tested by sizeMeetsSafetyLimit() before a node is
   * allowed to grow under a count-based (positive) fill setting. */
  1. #define SIZE_SAFETY_LIMIT 8192


  1. // Lists are implemented by quicklist, which is defined in quicklist.h
  2. typedef struct quicklist {
  3. quicklistNode *head;
  4. quicklistNode *tail;
  5. unsigned long count; /* total count of all entries in all ziplists */
  6. unsigned long len; /* number of quicklistNodes */
  7. int fill : 16; /* fill factor for individual nodes */
  8. unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
  9. } quicklist;
  /* One node of the quicklist's doubly linked list. Each node owns one
   * ziplist (zl) holding `count` entries in `sz` bytes. */
  10. typedef struct quicklistNode {
  11. struct quicklistNode *prev;
  12. struct quicklistNode *next;
  13. unsigned char *zl;
  14. unsigned int sz; /* ziplist size in bytes */
  15. unsigned int count : 16; /* count of items in ziplist */
  16. unsigned int encoding : 2; /* RAW==1 or LZF==2 */
  17. unsigned int container : 2; /* NONE==1 or ZIPLIST==2 */
  18. unsigned int recompress : 1; /* was this node previous compressed? */
  19. unsigned int attempted_compress : 1; /* node can't compress; too small */
  20. unsigned int extra : 10; /* more bits to steal for future usage */
  21. } quicklistNode;





  /* Largest compress depth that fits the 16-bit unsigned `compress` bitfield
   * of struct quicklist. The previous value (1 << 16) did not fit: storing it
   * wrapped to 0, which silently turned compression off. */
  1. #define COMPRESS_MAX ((1 << 16) - 1)
  /* Sets the compress depth of the quicklist, clamping the requested value
   * into the range [0, COMPRESS_MAX] before it is stored. */
  2. void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
  3. if (compress > COMPRESS_MAX) {
  4. compress = COMPRESS_MAX;
  5. } else if (compress < 0) {
  6. compress = 0;
  7. }
  8. quicklist->compress = compress;
  9. }
  /* Largest fill factor that fits the signed 16-bit `fill` bitfield of struct
   * quicklist. The previous value (1 << 15) is one past the maximum, so the
   * stored result was implementation-defined (typically a negative number). */
  10. #define FILL_MAX ((1 << 15) - 1)
  /* Sets the fill factor of the quicklist, clamping the requested value into
   * the range [-5, FILL_MAX] before it is stored. */
  11. void quicklistSetFill(quicklist *quicklist, int fill) {
  12. if (fill > FILL_MAX) {
  13. fill = FILL_MAX;
  14. } else if (fill < -5) {
  15. fill = -5;
  16. }
  17. quicklist->fill = fill;
  18. }
  /* Convenience setter: applies both the fill factor and the compress depth
   * through their individual (clamping) setters. */
  19. void quicklistSetOptions(quicklist *quicklist, int fill, int depth) {
  20. quicklistSetFill(quicklist, fill);
  21. quicklistSetCompressDepth(quicklist, depth);
  22. }
  /* LPUSH entry point: pushes the arguments at the head of the list. */
  23. void lpushCommand(client *c) {
  24. pushGenericCommand(c,LIST_HEAD);
  25. }
  26. // RPUSH command entry point: pushes the arguments at the tail of the list
  27. void rpushCommand(client *c) {
  28. pushGenericCommand(c,LIST_TAIL);
  29. }
  /* Shared implementation of LPUSH/RPUSH. `where` is LIST_HEAD or LIST_TAIL.
   * Every element is size-checked before anything is modified; the list
   * object is created lazily on the first push when the key does not exist.
   * Replies with the resulting list length. */
  30. void pushGenericCommand(client *c, int where) {
  31. int j, pushed = 0;
  32. for (j = 2; j < c->argc; j++) { /* elements start at argv[2] */
  33. if (sdslen(c->argv[j]->ptr) > LIST_MAX_ITEM_SIZE) {
  34. addReplyError(c, "Element too large");
  35. return;
  36. }
  37. }
  38. robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
  39. if (lobj && lobj->type != OBJ_LIST) {
  40. addReply(c,shared.wrongtypeerr);
  41. return;
  42. }
  43. for (j = 2; j < c->argc; j++) {
  44. if (!lobj) {
  45. // Create the list object via createQuicklistObject() from object.c
  46. lobj = createQuicklistObject();
  47. // Apply the configured maximum ziplist size and the compress depth
  48. quicklistSetOptions(lobj->ptr, server.list_max_ziplist_size,
  49. server.list_compress_depth);
  50. dbAdd(c->db,c->argv[1],lobj);
  51. }
  52. listTypePush(lobj,c->argv[j],where);
  53. pushed++;
  54. }
  55. addReplyLongLong(c, (lobj ? listTypeLength(lobj) : 0));
  56. if (pushed) {
  57. char *event = (where == LIST_HEAD) ? "lpush" : "rpush";
  58. signalModifiedKey(c->db,c->argv[1]);
  59. notifyKeyspaceEvent(NOTIFY_LIST,event,c->argv[1],c->db->id);
  60. }
  61. server.dirty += pushed;
  62. }
  63. // object.c createQuicklistObject: wraps a new empty quicklist in an OBJ_LIST object
  64. robj *createQuicklistObject(void) {
  65. quicklist *l = quicklistCreate(); // quicklist.c quicklistCreate
  66. robj *o = createObject(OBJ_LIST,l);
  67. o->encoding = OBJ_ENCODING_QUICKLIST;
  68. return o;
  69. }
  70. // quicklist.c quicklistCreate: allocates an empty quicklist with default options
  71. quicklist *quicklistCreate(void) {
  72. struct quicklist *quicklist;
  73. quicklist = zmalloc(sizeof(*quicklist));
  74. quicklist->head = quicklist->tail = NULL;
  75. quicklist->len = 0;
  76. quicklist->count = 0;
  77. quicklist->compress = 0; /* 0 = compression off */
  78. quicklist->fill = -2; /* default fill; callers may override it via quicklistSetOptions() */
  79. return quicklist;
  80. }
  /* Pushes value onto the head or tail (per `where`) of the list object
   * `subject`. Only the quicklist encoding is handled; any other encoding
   * triggers serverPanic(). */
  81. void listTypePush(robj *subject, robj *value, int where) {
  82. if (subject->encoding == OBJ_ENCODING_QUICKLIST) {
  83. int pos = (where == LIST_HEAD) ? QUICKLIST_HEAD : QUICKLIST_TAIL;
  84. value = getDecodedObject(value); /* NOTE(review): presumably yields a string-encoded object plus a reference - confirm */
  85. size_t len = sdslen(value->ptr);
  86. quicklistPush(subject->ptr, value->ptr, len, pos);
  87. decrRefCount(value); /* NOTE(review): presumably balances the reference taken by getDecodedObject() - confirm */
  88. } else {
  89. serverPanic("Unknown list encoding");
  90. }
  91. }

插入可以从左端和右端,左端就是从链表头部插入,右端则是从链表尾部插入,最终都会带着条件调用pushGenericCommand函数,过程同样是先lookupKey从全局哈希表中看看key是否存在,如果不存在则首先创建整个的quicklist结构,然后通过dbAdd写入db的全局哈希表中,最终还是调用了前面的dictAdd函数,其中quick list创建比较简单,只是设置了一些默认值,然后会调用quicklistSetOptions函数设置ziplist的最大大小和非压缩深度,这两个参数都是通过Redis的配置文件传入,默认定义为:

  1. /* List defaults */



  1. /* Wrapper to allow argument-based switching between HEAD/TAIL pop. Any other value of `where` is silently ignored. */
  2. void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
  3. int where) {
  4. if (where == QUICKLIST_HEAD) {
  5. quicklistPushHead(quicklist, value, sz);
  6. } else if (where == QUICKLIST_TAIL) {
  7. quicklistPushTail(quicklist, value, sz);
  8. }
  9. }
  /* Inserts value (sz bytes) at the head of the quicklist. If the current
   * head node may accept the entry it is pushed into that node's ziplist;
   * otherwise a new node with a fresh ziplist is linked in before the head.
   * Returns non-zero when a new head node was created, 0 otherwise. */
  10. int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) {
  11. quicklistNode *orig_head = quicklist->head;
  12. assert(sz < UINT32_MAX); /* TODO: add support for quicklist nodes that are sds encoded (not zipped) */
  13. if (likely(
  14. _quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) {
  15. quicklist->head->zl =
  16. ziplistPush(quicklist->head->zl, value, sz, ZIPLIST_HEAD);
  17. quicklistNodeUpdateSz(quicklist->head);
  18. } else {
  19. quicklistNode *node = quicklistCreateNode();
  20. node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD);
  21. quicklistNodeUpdateSz(node);
  22. _quicklistInsertNodeBefore(quicklist, quicklist->head, node);
  23. }
  24. quicklist->count++;
  25. quicklist->head->count++; /* head is the node that received the entry in both branches */
  26. return (orig_head != quicklist->head);
  27. }
  /* Inserts value (sz bytes) at the tail of the quicklist. If the current
   * tail node may accept the entry it is appended to that node's ziplist;
   * otherwise a new node with a fresh ziplist is linked in after the tail.
   * Returns non-zero when a new tail node was created, 0 otherwise. */
  28. int quicklistPushTail(quicklist *quicklist, void *value, size_t sz) {
  29. quicklistNode *orig_tail = quicklist->tail;
  30. assert(sz < UINT32_MAX); /* TODO: add support for quicklist nodes that are sds encoded (not zipped) */
  31. if (likely(
  32. _quicklistNodeAllowInsert(quicklist->tail, quicklist->fill, sz))) {
  33. quicklist->tail->zl =
  34. ziplistPush(quicklist->tail->zl, value, sz, ZIPLIST_TAIL);
  35. quicklistNodeUpdateSz(quicklist->tail);
  36. } else {
  37. quicklistNode *node = quicklistCreateNode();
  38. node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_TAIL);
  39. quicklistNodeUpdateSz(node);
  40. _quicklistInsertNodeAfter(quicklist, quicklist->tail, node);
  41. }
  42. quicklist->count++;
  43. quicklist->tail->count++; /* tail is the node that received the entry in both branches */
  44. return (orig_tail != quicklist->tail);
  45. }
  /* Allocates a quicklist node with no ziplist attached and no neighbours.
   * The node starts as RAW-encoded with a ZIPLIST container. The container
   * assignment (listing line 54) was missing from this excerpt, which left
   * that bitfield uninitialized after zmalloc(). */
  46. REDIS_STATIC quicklistNode *quicklistCreateNode(void) {
  47. quicklistNode *node;
  48. node = zmalloc(sizeof(*node));
  49. node->zl = NULL;
  50. node->count = 0;
  51. node->sz = 0;
  52. node->next = node->prev = NULL;
  53. node->encoding = QUICKLIST_NODE_ENCODING_RAW;
  54. node->container = QUICKLIST_NODE_CONTAINER_ZIPLIST;
  55. node->recompress = 0;
  56. return node;
  57. }

同样插入分为插入到头部或者插入到尾部,先看下整体过程,如果是插入到头部,那么首先判断头部的ziplist是不是满了,如果没满则允许插入,对ziplist进行修改放到头部的位置,其实就是对ziplist的空间进行扩容,将所有的entry往后搬迁然后将value填充到前面,详细的操作就在ziplist.c中,否则如果满了就更简单了,直接new一个新的quicklist node,然后用value填充形成ziplist,最后执行_quicklistInsertNodeBefore将当前新建的node插入到当前双向链表的头部,具体代码就是双向链表的操作代码,比较简单,另外在尾部插入也是类似的操作,只是和头部写入相比少了数据搬迁的过程,效率相对来说更高一些,所以rpush操作比lpush操作性能略高一些。


  /* True when sz does not exceed SIZE_SAFETY_LIMIT. */
  1. #define sizeMeetsSafetyLimit(sz) ((sz) <= SIZE_SAFETY_LIMIT)
  /* Decides whether an entry of sz bytes may be added to node under the given
   * fill setting. Returns 0 for a NULL node. Otherwise the node's projected
   * size (current size + sz + estimated entry header) is checked: it is
   * accepted if it satisfies the size limit selected by a negative fill;
   * failing that it is rejected if it exceeds SIZE_SAFETY_LIMIT; failing
   * that it is accepted only while the node holds fewer than `fill` entries. */
  2. REDIS_STATIC int _quicklistNodeAllowInsert(const quicklistNode *node,
  3. const int fill, const size_t sz) {
  4. if (unlikely(!node))
  5. return 0;
  6. int ziplist_overhead;
  7. /* size of previous offset */
  8. if (sz < 254)
  9. ziplist_overhead = 1;
  10. else
  11. ziplist_overhead = 5;
  12. /* size of forward offset */
  13. if (sz < 64)
  14. ziplist_overhead += 1;
  15. else if (likely(sz < 16384))
  16. ziplist_overhead += 2;
  17. else
  18. ziplist_overhead += 5;
  19. /* new_sz overestimates if 'sz' encodes to an integer type */
  20. unsigned int new_sz = node->sz + sz + ziplist_overhead;
  21. if (likely(_quicklistNodeSizeMeetsOptimizationRequirement(new_sz, fill)))
  22. return 1;
  23. /* when we return 1 above we know that the limit is a size limit (which is
  24. * safe, see comments next to optimization_level and SIZE_SAFETY_LIMIT) */
  25. else if (!sizeMeetsSafetyLimit(new_sz))
  26. return 0;
  27. else if ((int)node->count < fill) /* count-based limit, meaningful for positive fill */
  28. return 1;
  29. else
  30. return 0;
  31. }
  32. // Per-node byte-size limits selected by a negative fill: index (-fill)-1, so fill -1 => 4096 ... fill -5 => 65536
  33. static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536};
  /* Returns 1 when fill is negative, selects an entry of optimization_level
   * and sz does not exceed that limit. Returns 0 for fill >= 0, for a fill
   * outside the table, or when sz is larger than the selected limit. */
  34. REDIS_STATIC int
  35. _quicklistNodeSizeMeetsOptimizationRequirement(const size_t sz,
  36. const int fill) {
  37. if (fill >= 0)
  38. return 0;
  39. size_t offset = (-fill) - 1;
  40. if (offset < (sizeof(optimization_level) / sizeof(*optimization_level))) {
  41. if (sz <= optimization_level[offset]) {
  42. return 1;
  43. } else {
  44. return 0;
  45. }
  46. } else {
  47. return 0;
  48. }
  49. }


另外还会注意到很有趣的一点就是,从quicklistPushHead再到_quicklistNodeAllowInsert一共使用了3次likely,还有1次unlikely。likely/unlikely并不是系统调用,而是基于GCC内建函数__builtin_expect定义的宏(通常写作 #define likely(x) __builtin_expect(!!(x), 1) 和 #define unlikely(x) __builtin_expect(!!(x), 0)),作用是在编译期把"这个分支大概率成立/不成立"的先验知识告诉编译器,让编译器把大概率执行的代码安排在顺序执行的路径上,从而更有利于指令预取和CPU的分支预测。如果一个条件我们认为经常成立,那么可以用likely标记,unlikely则反过来表示这个分支很少会走到。CPU自身的动态分支预测需要先对运行结果进行学习然后再预测,而likely/unlikely相当于在编译时就提供了先验知识,使用恰当可以提升程序的运行效率,但是反过来如果用的不好或者用反了,要比默认情况下性能低。因为默认情况下fill的值为-2,只要插入后节点大小不超过8KB就会进入第一个分支,所以加上likely是会提升性能的,但是如果配置为1个大于0的值,第一个分支永远不会成立,性能会有所降低,这个配置项对应redis.conf中的list-max-ziplist-size -2,推荐配置为-2或者-1,不要自己指定长度。



  1. list的push和pop操作复杂度都是O(1),计算长度的llen复杂度也是O(1),这类操作是比较高效的。
  2. 像lrange、lset、lindex操作的复杂度都是O(n),要尽量避免使用,但是由于每个ziplist包含一组元素,按照下标查找时可以一次跳过整个ziplist,所以相比普通的双向链表还是比较高效的,lrange这种操作一般用在查看头部或尾部少量元素时使用。
  3. list-max-ziplist-size正常建议配置为-2或-1,不要自己指定长度。


2.2 Hash类型




  1. // t_hash.c: looks up the key before hset/hmset; returns the hash object, creating it if absent, or NULL (after replying with a wrong-type error) if the key holds another type
  2. robj *hashTypeLookupWriteOrCreate(client *c, robj *key) {
  3. robj *o = lookupKeyWrite(c->db,key);
  4. if (o == NULL) {
  5. // Key does not exist: create the hash object first
  6. o = createHashObject();
  7. dbAdd(c->db,key,o);
  8. } else {
  9. if (o->type != OBJ_HASH) {
  10. addReply(c,shared.wrongtypeerr);
  11. return NULL;
  12. }
  13. }
  14. return o;
  15. }
  16. // object.c: creates a hash object; a new hash always starts with the ziplist encoding
  17. robj *createHashObject(void) {
  18. // Create the backing ziplist
  19. unsigned char *zl = ziplistNew();
  20. robj *o = createObject(OBJ_HASH, zl);
  21. o->encoding = OBJ_ENCODING_ZIPLIST;
  22. return o;
  23. }


  1. /* Check if the ziplist needs to be converted to a hash table */
  2. if (hashTypeLength(o) > server.hash_max_ziplist_entries)
  3. hashTypeConvert(o, OBJ_ENCODING_HT);




2.3 Set和ZSet







  1. /* Lookup the key and create the sorted set if does not exist. */
  2. zobj = lookupKeyWrite(c->db,key);
  3. if (zobj == NULL) {
  4. if (xx) goto reply_to_client; /* No key + XX option: nothing to do. */
  5. if (server.zset_max_ziplist_entries == 0 ||
  6. server.zset_max_ziplist_value < sdslen(c->argv[scoreidx+1]->ptr))
  7. {
  8. zobj = createZsetObject();
  9. } else {
  10. zobj = createZsetZiplistObject();
  11. }
  12. dbAdd(c->db,key,zobj);
  13. } else {
  14. if (zobj->type != OBJ_ZSET) {
  15. addReply(c,shared.wrongtypeerr);
  16. goto cleanup;
  17. }
  18. }


  1. /* check if the element is too large or the list
  2. * becomes too long *before* executing zzlInsert. */
  3. if (zzlLength(zobj->ptr)+1 > server.zset_max_ziplist_entries ||
  4. sdslen(ele) > server.zset_max_ziplist_value ||
  5. !ziplistSafeToAdd(zobj->ptr, sdslen(ele)))
  6. {
  7. zsetConvert(zobj,OBJ_ENCODING_SKIPLIST);
  8. } else {
  9. zobj->ptr = zzlInsert(zobj->ptr,ele,score);
  10. if (newscore) *newscore = score;
  11. *flags |= ZADD_ADDED;
  12. return 1;
  13. }



  1. /* ZSETs use a specialized version of Skiplists */
  2. typedef struct zskiplistNode {
  3. sds ele; /* the member (element) string */
  4. double score; /* the member's score */
  5. struct zskiplistNode *backward; /* link in the reverse direction */
  6. struct zskiplistLevel {
  7. struct zskiplistNode *forward; /* next node at this level */
  8. unsigned long span; /* NOTE(review): presumably the number of nodes this forward link skips - confirm */
  9. } level[]; /* flexible array member: one forward/span pair per level */
  10. } zskiplistNode;
  /* The skiplist itself: entry points plus bookkeeping counters. */
  11. typedef struct zskiplist {
  12. struct zskiplistNode *header, *tail;
  13. unsigned long length; /* maintained element count, so length queries need no traversal */
  14. int level; /* NOTE(review): presumably the highest level currently in use - confirm */
  15. } zskiplist;
  /* A sorted set pairs a skiplist (ordered access) with a dict that maps an
   * element to its score, trading memory for O(1) score lookups. */
  16. typedef struct zset {
  17. dict *dict;
  18. zskiplist *zsl;
  19. } zset;

可以看到zset除了zskiplist之外还定义了dict,我们知道对于跳表来说可以实现插入、查找、删除的复杂度都是O(log N),除了这个之外zset还有一些比较简单的操作例如直接根据元素获取对应的分值或者判断某元素是否在zset中等,对于这些操作可以进一步提升性能,所以作者通过空间换时间的方式增加了一个dict来维护元素值和分值的关系,像ZSCORE的复杂度就是O(1),从而对跳表进行加速,对于计算zset长度的操作,因为zskiplist中也会维护计数,复杂度也是O(1)。

综上总结下就是:单元素操作大部分复杂度都是O(1),例如:HGET、HSET、HDEL、SADD、SREM、ZSCORE、LPUSH、LPOP、RPUSH、RPOP等,计算长度的复杂度也是O(1),例如:LLEN、HLEN、ZCARD等,对于zset的其他操作大部分复杂度都是O(log n),如:ZRANGEBYSCORE、ZADD、ZCOUNT、ZINCRBY、ZRANK、ZLEXCOUNT等,这些操作在大部分情况下都是比较高效的。

另外还有些比较危险的操作,例如:keys *、HGETALL、SMEMBERS、LRANGE、ZRANGE这些操作,复杂度都是O(n),需要对整个数据结构进行遍历,可能会带来非常大的开销,直接阻塞其他请求的执行,使用时务必谨慎,除非你知道自己在做什么,这些操作推荐使用Redis提供的游标进行操作,对应的有SCAN、HSCAN、SSCAN、ZSCAN等,这些操作都是每次返回一小批数据,然后基于游标再进行迭代,这样不会一次性查询造成其他的请求阻塞,CPU时间被轮换使用。



