redis设计与实现——字典

在字典中，一个key和一个value进行关联，从而组成键值对，但C语言并没有内置了这种数据结构，因为redis自行构建了。字典是哈希键的底层实现之一。

字典的实现

Redis的字典使用了哈希表作为底层实现，每个表内部含有多个哈希表节点。

哈希表节点

首先是键值对的定义，即每个dictEntry结构保存着一个键值对。

typedef struct dictEntry {
    void *key; // 键
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v; // 值
    struct dictEntry *next; // 指向下一个哈希表节点，形成链表
} dictEntry;

key-value中的值可以是一个指针，也可能是一个整数或者double。next则是用来以链表的形式解决哈希冲突的问题。

哈希表

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table; // 哈希表数组
    unsigned long size; // 哈希表大小
    unsigned long sizemask; // 哈希表掩码，用来计算索引值，等于size-1
    unsigned long used; // 已有节点的个数
} dictht;

字典

为了使用方便，redis的字典在上面哈希表再多封装一层。

typedef struct dict {
  	// 类型特定的函数
    dictType *type;
  	// 私有数据
    void *privdata;
  	// 两个哈希表，其中一个用来存储当前使用的，另一个则是用来做rehash
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

这里的type属性保存了一个指向dictType结构的指针，而每个dictType结构都包含了一组用于操作特定类型键值对的函数。而private属性则保存了需要传递给那些类型特定函数的可选参数：

typedef struct dictType {
  	// 计算哈希值的函数
    uint64_t (*hashFunction)(const void *key); 
  	// 复制key的函数
    void *(*keyDup)(void *privdata, const void *key);
  	// 复制value的函数
    void *(*valDup)(void *privdata, const void *obj);
 		// 对比key的函数
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
  	// 销毁key的函数
    void (*keyDestructor)(void *privdata, void *key);
  	// 销毁value的函数
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

至此，Redis的封装层级就是dict->dictht->dictEntry

哈希算法

当需要添加一个新的键值对到dict里面，Redis会根据key计算出哈希值和索引值，再把包含键值对的哈希节点放到哈希表数组的指定索引上。

先是调用dictAdd

int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *entry = dictAddRaw(d,key,NULL);

    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}

其中dictAddRaw会增加一个dictEntry，但不会设置value值，而是由用户自行决定如何设置。同时这个API也直接暴露给用户，用户可以自行调用，比如设置非指针值。

1 2	entry = dictAddRaw(dict,mykey,NULL); if (entry != NULL) dictSetSignedIntegerVal(entry,1000);

dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    dictht *ht;
		
  	// 判断是否需要rehashing
    if (dictIsRehashing(d)) _dictRehashStep(d);

    // 获取新元素index，如果已存在则返回NULL
    if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
        return NULL;

		// 如果需要rehasing，则插入到ht[1]中
    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
  	// 分配内存
    entry = zmalloc(sizeof(*entry));
  	// 插入到table的头部，这样就可以以O(1)的时间解决哈希冲突
    entry->next = ht->table[index];
    ht->table[index] = entry;
    ht->used++;

    /* Set the hash entry fields. */
    dictSetKey(d, entry, key); // 设置key
    return entry;
}

这里的关键是获取index，这样才能新增一个entry，并设置对应的key。准确来说，是先获取对应的hash值，再利用该hash值计算索引index。

// 使用dict设置的哈希函数，计算key的哈希值
#define dictHashKey(d, key) (d)->type->hashFunction(key)

// 如果key已经存在，则返回-1
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
    unsigned long idx, table;
    dictEntry *he;
    if (existing) *existing = NULL; // existing用来获取当前的entry

    // 在需要时扩展整个dict
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;
  	// 在两个hash表中查找是否存在相同key
    for (table = 0; table <= 1; table++) { 
      	// 使用哈希和sizemark掩码计算index
        idx = hash & d->ht[table].sizemask;
        // 遍历idx对应的table slot，判断该table不含有相同的key
        he = d->ht[table].table[idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                if (existing) *existing = he;
                return -1;
            }
            he = he->next;
        }
        if (!dictIsRehashing(d)) break; // 如果不是正在rehash，则直接break，不去ht[1]中寻找
    }
    return idx;
}

计算index的过程，首先是判断是否需要扩展dict，然后遍历两个哈希表，在dictEnrty数组中遍历，确保不含有相同的key。

解决键冲突

假设Redis计算得出k1和k2的索引值相同，则这就是发生了冲突。Redis使用链地址法解决冲突。每个哈希表节点都有一个next指针，在发生冲突时就使用next指针将k1和k2所在节点连接起来。同时，由于dictEntry节点组成链表没有指向尾部的指针，为了速度考虑，直接将新节点插入到头部。如上代码所示。

Rehash

首先来看扩展：正在rehashing的直接返回；第一次增加键值对的，直接扩展哈希表的大小到4；接下来是判断在什么情况下，redis会对哈希表进行扩展：

服务器没有执行BGSAVE或者BGREWRITEAOF命令时，哈希表的负载因子大于或等于1；
服务器正在执行BGSAVE或者BGREWRITEAOF命令时，哈希表的负载因子大于5；

之所以这样设计，是因为在子进程存在期间，操作系统是采用写时复制的技术，此时如果进行哈希扩展，就可能产生大量内存写入操作。

// 至于扩展dict也比较直接
static int _dictExpandIfNeeded(dict *d)
{
    // 正在rehashing的，直接返回
    if (dictIsRehashing(d)) return DICT_OK;

    // 第一次新增，则扩展到4；#define DICT_HT_INITIAL_SIZE     4
    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    // static int dict_can_resize = 1;
		// static unsigned int dict_force_resize_ratio = 5;
  
  	/*
  	在server.c里，会根据服务器是否存在有aof或者rdb的子进程，即是否在执行BGSAVE或者BGREWRITEAOF命令
  	void updateDictResizePolicy(void) {
        if (server.rdb_child_pid == -1 && server.aof_child_pid == -1)
            dictEnableResize();
        else
            dictDisableResize();
    }
  	*/
    if (d->ht[0].used >= d->ht[0].size &&
        (dict_can_resize ||
         d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
    {
        return dictExpand(d, d->ht[0].used*2);
    }
    return DICT_OK;
}

至于收缩，当哈希表的负载因子小于0.1时，Redis会进行收缩操作。这一个操作是在server.c里进行的。

#define HASHTABLE_MIN_FILL 10    /* Minimal hash table fill 10% */

int htNeedsResize(dict *dict) {
    long long size, used;

    size = dictSlots(dict);
    used = dictSize(dict);
    return (size > DICT_HT_INITIAL_SIZE &&
            (used*100/size < HASHTABLE_MIN_FILL));
}

渐进式rehash

由前面的结构体可以看到dict中有ht[0]和ht[1]，这样的设计可以保证在扩展或者收缩哈希表的时候可以将ht[0]里面的所有键值对rehash到ht[1]。而这一个步骤是分多次、渐进式地完成的，避免过大的计算量导致服务器在一段时间内停止服务。

在_dictExpandIfNeeded中，当满足扩展条件时会调用dictExpand。

int dictExpand(dict *d, unsigned long size)
{
    /* the size is invalid if it is smaller than the number of
     * elements already inside the hash table */
    if (dictIsRehashing(d) || d->ht[0].used > size)
        return DICT_ERR;

    dictht n; /* the new hash table */
  	
  	/*
  	static unsigned long _dictNextPower(unsigned long size)
    {
        unsigned long i = DICT_HT_INITIAL_SIZE; // 4

        if (size >= LONG_MAX) return LONG_MAX + 1LU;
        while(1) {
            if (i >= size)
                return i;
            i *= 2;
        }
    }
  	*/
    unsigned long realsize = _dictNextPower(size); // 获取哈希表容量，4，8，16，32，直到比size大

    /* Rehashing to the same table size is not useful. */
    if (realsize == d->ht[0].size) return DICT_ERR;

    /* Allocate the new hash table and initialize all pointers to NULL */
    n.size = realsize;
    n.sizemask = realsize-1;
    n.table = zcalloc(realsize*sizeof(dictEntry*)); // 分配新的表
    n.used = 0;

    /* Is this the first initialization? If so it's not really a rehashing
     * we just set the first hash table so that it can accept keys. */
    if (d->ht[0].table == NULL) {
        d->ht[0] = n;
        return DICT_OK;
    }

    /* Prepare a second hash table for incremental rehashing */
    d->ht[1] = n;
    d->rehashidx = 0; // 更新rehashidx变量，后续有用
    return DICT_OK;
}

在分配完dict之后，就可以进行rehash的操作了。在redis中，有两种rehash方式。

dictRehashMilliseconds：按照ms计时的rehash操作，是databasesCron中针对redis的DB进行rehash。serverCron后台定时任务会每次调用databasesCron()进而调用incrementallyRehash，每隔一段时间就会执行。

/* Rehash for an amount of time between ms milliseconds and ms+1 milliseconds */
int dictRehashMilliseconds(dict *d, int ms) {
    long long start = timeInMilliseconds();
    int rehashes = 0;

    while(dictRehash(d,100)) {
        rehashes += 100;
        if (timeInMilliseconds()-start > ms) break;
    }
    return rehashes;
}

在server.c里会调用这个函数，使用了CPU时间的1ms。该函数的作用，是对正在rehash的字典，每次执行1毫秒，每次循环100次的哈希表数据迁移。

int incrementallyRehash(int dbid) {
    /* Keys dictionary */
    if (dictIsRehashing(server.db[dbid].dict)) {
        dictRehashMilliseconds(server.db[dbid].dict,1);
        return 1; /* already used our millisecond for this loop... */
    }
    /* Expires */
    if (dictIsRehashing(server.db[dbid].expires)) {
        dictRehashMilliseconds(server.db[dbid].expires,1);
        return 1; /* already used our millisecond for this loop... */
    }
    return 0;
}

_dictRehashStep：这是一个单步的rehasing，在执行对dict的增删改查中都会被调用

1
2
3

static void _dictRehashStep(dict *d) {
    if (d->iterators == 0) dictRehash(d,1);
}

比如前面的添加键值对，或者查找操作。但相对其它的操作会在两个哈希表中进行，字典的添加只会在ht[1]中直接新增

 dictEntry *dictAddRaw(dict *d, void *key)   // 新增一个entry
 {
     if (dictIsRehashing(d)) _dictRehashStep(d); // 如果在rehash执行一步rehash
     // 正在rehashing的话，直接在ht[1]中添加
   	 ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
 }

dictEntry *dictFind(dict *d, const void *key)
{
    // ...
    if (dictIsRehashing(d)) _dictRehashStep(d);
    // ...
}

上面两个函数都调用了通用的rehash函数

int dictRehash(dict *d, int n) {
    int empty_visits = n*10; /* 最多读取n*10个空节点，避免rehash阻塞太久*/
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht[0].used != 0) {
        dictEntry *de, *nextde;

        /* 确保rehashidx不会溢出 */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);
      	/* 遍历找到非空节点 */
        while(d->ht[0].table[d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht[0].table[d->rehashidx];
        /* 将当前节点的所有key-value对转存到ht[1]中 */
        while(de) {
            uint64_t h;

            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        d->ht[0].table[d->rehashidx] = NULL; // 释放该节点
        d->rehashidx++;
    }

    /* 检查是否已经完全rehash */
    if (d->ht[0].used == 0) {
        zfree(d->ht[0].table);
        d->ht[0] = d->ht[1];
        _dictReset(&d->ht[1]);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}

因为哈希表可能存在大量的空节点，redis的做法是每次遍历10n个节点，如果还没找到非空节点即返回，这里的n是step数。这样就可以避免rehash的时候，阻塞太久。

另外，结合incrementallyRehash该函数来看，考虑到渐进式rehash在服务器比较空闲的时候将会长时间存在使用两个哈希表的时候。因此在redis的周期函数中，会花费1ms来辅助rehash。具体可以参考server.c/databasesCron()：

void databasesCron(void) {
    if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
        // ....
				
      	// 这一步有可能产生缩容
      	for (j = 0; j < dbs_per_call; j++) {
            tryResizeHashTables(resize_db % server.dbnum);
            resize_db++;
        }
        /* Rehash */
      	/* 前提是配置了activerehashing，允许服务器在周期函数中辅助进行渐进式rehash，默认值是1，server.activerehashing = CONFIG_DEFAULT_ACTIVE_REHASHING; */
        if (server.activerehashing) {
            for (j = 0; j < dbs_per_call; j++) {
                int work_done = incrementallyRehash(rehash_db);
                if (work_done) {
                    /* 如果已经进行了定时rehash，则停止循环，等待下一轮cron */
                    break;
                } else {
                    /* 否则，会移动到下一个redis db进行 */
                    rehash_db++;
                    rehash_db %= server.dbnum;
                }
            }
        }
    }
}

总结rehash的详细步骤：

为 ht[1] 分配空间，让字典同时持有 ht[0] 和 ht[1] 两个哈希表；
在字典中维持一个索引计数器变量 rehashidx ，并将它的值设置为 0 ，表示 rehash 工作正式开始；
在 rehash 进行期间，每次对字典执行添加、删除、查找或者更新操作时，程序除了执行指定的操作以外，还会顺带将 ht[0] 哈希表在 rehashidx 索引上的所有键值对 rehash 到 ht[1] ，当 rehash 工作完成之后，程序将 rehashidx 属性的值增一。另外，每一步都会遍历至多十个节点，以找到非空节点；
随着字典操作的不断执行，最终在某个时间点上，ht[0] 的所有键值对都会被 rehash 至 ht[1]，这时程序将 rehashidx 属性的值设为 -1 ，表示 rehash 操作已完成。

通过将rehash键值对的计算工作分摊到增删改查的操作中，避免了对服务器性能造成影响。

关于rehash，推荐一篇文章，主要讲的是线上遇到的在rehash期间，同时有两个hash表在使用，会使得redis内存使用量瞬间突增的问题：美团针对Redis Rehash机制的探索和实践