AOF 持久化策略

Redis为了解决AOF后台重写造成的数据不一致问题，设置了AOF重写缓冲区。即使设置了no-appendfsync-on-rewrite yes也会造成短暂的主进程阻塞。原因就在于子进程完成AOF重写之后，会发送一个信号给主进程，而父进程会在这个时候调用信号处理函数，主要是将新的AOF文件替换旧的AOF文件，那么在这段时间内，主进程是阻塞的。

简介

AOF 持久化和 RDB 持久化的最主要区别在于，前者记录了数据的变更，而后者是保存了数据本身。本篇主要讲的是AOF 持久化，了解 AOF 的数据组织方式和运作机制。Redis 主要在 aof.c 中实现 AOF 的操作。

同样，AOF 持久化也会涉及文件的读写，会用到数据结构 rio。关于 rio 已经在上一个篇章讲述，在此不做展开。

AOF 数据组织方式

假设 redis 内存有「name:Jhon」的键值对，那么进行 AOF 持久化后，AOF 文件有如下内容：

*2     # 2 个参数

$6     # 第一个参数长度为6

SELECT # 第一个参数

$1     # 第二参数长度为1

8      # 第二参数

*3     # 3 个参数

$3     # 第一个参数长度为3

SET    # 第一个参数

$4     # 第二参数长度为4

name   # 第二个参数

$4     # 第三个参数长度为4

Jhon   # 第三个参数

所以对上面的内容进行恢复，能得到熟悉的一条 Redis 命令：SELECT 8;SET name Jhon. 可以想象的是，Redis 遍历内存数据集中的每个 key-value 对，依次写入磁盘中；Redis 启动的时候，从 AOF 文件中读取数据，恢复数据。

AOF 持久化运作机制

和 redis RDB 持久化运作机制不同，redis AOF 有后台执行和边服务边备份两种方式。

1）AOF 后台执行的方式和 RDB 有类似的地方，fork 一个子进程，主进程仍进行服务，子进程执行AOF 持久化，数据被dump 到磁盘上。与 RDB 不同的是，后台子进程持久化过程中，主进程会记录期间的所有数据变更（主进程还在服务），并存储在 server.aof_rewrite_buf_blocks 中；后台子进程结束后，Redis 更新缓存追加到 AOF 文件中，是 RDB 持久化所不具备的。

来说说更新缓存这个东西。Redis 服务器产生数据变更的时候，譬如 set name Jhon，不仅仅会修改内存数据集，也会记录此更新（修改）操作，记录的方式就是上面所说的数据组织方式。

更新缓存可以存储在 server.aof_buf 中，你可以把它理解为一个小型临时中转站，所有累积的更新缓存都会先放入这里，它会在特定时机写入文件或者插入到 server.aof_rewrite_buf_blocks 链表（下面会详述）；server.aof_buf 中的数据在 propagate() 中添加，在涉及数据更新的地方都会调用 propagate() 以累积变更。更新缓存也可以存储在 server.aof_rewrite_buf_blocks，这是一个元素类型为 struct aofrwblock 的链表，你可以把它理解为一个仓库，当后台有 AOF 子进程的时候，会将累积的更新缓存（在 server.aof_buf 中）插入到链表中，而当 AOF 子进程结束，它会被整个写入到文件。两者是有关联的。

这里的意图即是不用每次出现数据变更的时候都触发一个写操作，可以将写操作先缓存到内存中，待到合适的时机写入到磁盘，如此避免频繁的写操作。当然，完全可以实现让数据变更及时更新到磁盘中。两种做法的好坏就是一种博弈了。

下面是后台执行的主要代码：

// 启动后台子进程，执行AOF 持久化操作。bgrewriteaofCommand()，startAppendOnly()，

// serverCron() 中会调用此函数

/* This is how rewriting of the append only file in background works:
 *
 * 1) The user calls BGREWRITEAOF
 * 2) Redis calls this function, that forks():
 *    2a) the child rewrites the append only file in a temp file.
 *    2b) the parent accumulates differences in server.aof_rewrite_buf.
 * 3) When the child finished '2a' exits.
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.aof_rewrite_buf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The new file is reopened as the new append only file. Profit!
 */

/* Start an AOF rewrite in a forked child process.
 *
 * The child writes the rewritten AOF to "temp-rewriteaof-bg-<pid>.aof" via
 * rewriteAppendOnlyFile() and leaves through exitFromChild(0) on success or
 * exitFromChild(1) on failure. The parent only records bookkeeping about the
 * rewrite and returns immediately.
 *
 * Returns REDIS_OK when the child was started, REDIS_ERR when a rewrite
 * child is already registered in server.aof_child_pid or fork() failed. */
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;
    long long start;

    /* A rewrite child is already running: refuse to start another one. */
    if (server.aof_child_pid != -1) return REDIS_ERR;

    start = ustime();
    if ((childpid = fork()) == 0) {
        char tmpfile[256];

        /* Child */
        closeListeningSockets(0);
        redisSetProcTitle("redis-aof-rewrite");
        /* Temp file name is keyed on the child's own pid. */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            /* Report the amount of private dirty memory, which the log
             * message attributes to copy-on-write. */
            size_t private_dirty = zmalloc_get_private_dirty();

            if (private_dirty) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            exitFromChild(0);
        } else {
            exitFromChild(1);
        }
    } else {
        /* Parent */
        server.stat_fork_time = ustime()-start;
        if (childpid == -1) {
            redisLog(REDIS_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }
        redisLog(REDIS_NOTICE,
            "Background append only file rewriting started by pid %d",
            (int) childpid);
        /* The rewrite has started, so clear any pending schedule request. */
        server.aof_rewrite_scheduled = 0;
        /* Remember when this rewrite started and which child runs it. */
        server.aof_rewrite_time_start = time(NULL);
        server.aof_child_pid = childpid;
        updateDictResizePolicy();
        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.aof_rewrite_buf will start
         * with a SELECT statement and it will be safe to merge. */
        server.aof_selected_db = -1;
        replicationScriptCacheFlush();
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

如上，子进程执行 AOF 持久化，父进程则会记录一些 AOF 的执行信息。下面来看看 AOF 持久化具体是怎么做的？

// AOF 持久化主函数。只在rewriteAppendOnlyFileBackground() 中会调用此函数

/* Write a sequence of commands able to fully rebuild the dataset into
 * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
 *
 * In order to minimize the number of commands needed in the rewritten
 * log Redis uses variadic commands when possible, such as RPUSH, SADD
 * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
 * are inserted using a single command. */

    /* Dump every non-empty database as a stream of commands into a private
     * temp file ("temp-rewriteaof-<pid>.aof"), then rename(2) it to
     * "filename" so the destination only ever appears fully written.
     *
     * For each key a type-specific write command is emitted, followed by a
     * PEXPIREAT when the key has an expire time. Keys whose expire time is
     * already in the past are skipped.
     *
     * Returns REDIS_OK on success. On any failure the temp file is removed
     * and REDIS_ERR is returned. */
    int rewriteAppendOnlyFile(char *filename) {
        dictIterator *di = NULL;
        dictEntry *de;
        rio aof;
        FILE *fp;
        char tmpfile[256];
        int j;
        int rc;
        int saved_errno;
        long long now = mstime();

        /* Note that we have to use a different temp name here compared to the
         * one used by rewriteAppendOnlyFileBackground() function. */
        snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
        fp = fopen(tmpfile,"w");
        if (!fp) {
            redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in "
                "rewriteAppendOnlyFile(): %s", strerror(errno));
            return REDIS_ERR;
        }

        rioInitWithFile(&aof,fp);
        /* Enable periodic syncing of the output when configured. */
        if (server.aof_rewrite_incremental_fsync)
            rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);

        for (j = 0; j < server.dbnum; j++) {
            char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
            redisDb *db = server.db+j;
            dict *d = db->dict;

            /* Nothing to emit for an empty database. */
            if (dictSize(d) == 0) continue;

            di = dictGetSafeIterator(d);
            if (!di) {
                fclose(fp);
                /* Do not leave a partial temp file behind. */
                unlink(tmpfile);
                return REDIS_ERR;
            }

            /* SELECT the new DB */
            if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
            if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;

            /* Iterate this DB writing every entry */
            while((de = dictNext(di)) != NULL) {
                sds keystr;
                robj key, *o;
                long long expiretime;

                keystr = dictGetKey(de);
                o = dictGetVal(de);
                /* Wrap the key string in a stack-allocated object. */
                initStaticStringObject(key,keystr);

                expiretime = getExpire(db,&key);

                /* If this key is already expired skip it */
                if (expiretime != -1 && expiretime < now) continue;

                /* Save the key and associated value */
                if (o->type == REDIS_STRING) {
                    /* Emit a SET command */
                    char cmd[]="*3\r\n$3\r\nSET\r\n";
                    if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                    /* Key and value */
                    if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                    if (rioWriteBulkObject(&aof,o) == 0) goto werr;
                } else if (o->type == REDIS_LIST) {
                    if (rewriteListObject(&aof,&key,o) == 0) goto werr;
                } else if (o->type == REDIS_SET) {
                    if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
                } else if (o->type == REDIS_ZSET) {
                    if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
                } else if (o->type == REDIS_HASH) {
                    if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
                } else {
                    redisPanic("Unknown object type");
                }

                /* Save the expire time */
                if (expiretime != -1) {
                    char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
                    if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                    if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                    if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
                }
            }
            dictReleaseIterator(di);
            /* Cleared so the error path below never releases it twice. */
            di = NULL;
        }

        /* Make sure data will not remain on the OS's output buffers. Any
         * failure here means the temp file may be incomplete, so it must not
         * be renamed into place. */
        if (fflush(fp) == EOF) goto werr;
        if (aof_fsync(fileno(fp)) == -1) goto werr;
        rc = fclose(fp);
        fp = NULL; /* The stream is gone whether or not fclose() succeeded. */
        if (rc == EOF) goto werr;

        /* Use RENAME to make sure the DB file is changed atomically only
         * if the generate DB file is ok. */
        if (rename(tmpfile,filename) == -1) {
            redisLog(REDIS_WARNING,"Error moving temp append only file on the "
                "final destination: %s", strerror(errno));
            unlink(tmpfile);
            return REDIS_ERR;
        }
        redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
        return REDIS_OK;

    werr:
        /* Capture errno before the cleanup calls can overwrite it. */
        saved_errno = errno;
        if (fp) fclose(fp);
        unlink(tmpfile);
        redisLog(REDIS_WARNING,"Write error writing append only file on disk: "
            "%s", strerror(saved_errno));
        if (di) dictReleaseIterator(di);
        return REDIS_ERR;
    }

刚才所说，AOF 在持久化结束后，持久化过程产生的数据变更也会追加到 AOF 文件中。如果有留意定时处理函数 serverCron()：父进程会在子进程结束后，将 AOF 持久化过程中产生的数据变更，追加到 AOF 文件。这就是 backgroundRewriteDoneHandler() 要做的：将 server.aof_rewrite_buf_blocks 追加到 AOF 文件。

// 后台子进程结束后，Redis 更新缓存server.aof_rewrite_buf_blocks 追加到AOF 文件中

// 在AOF 持久化结束后会执行这个函数， backgroundRewriteDoneHandler() 主要工作是

// 将server.aof_rewrite_buf_blocks，即AOF 缓存写入文件

/* A background append only file rewriting (BGREWRITEAOF) terminated its work.

* Handle this. */

    /* Excerpt of the handler run after a background AOF rewrite terminates.
     * The "......" lines mark code omitted from this excerpt; newfd and the
     * cleanup label are defined in the omitted parts. */
    void backgroundRewriteDoneHandler(int exitcode, int bysignal) {

    ......

    // Write the rewrite buffer (server.aof_rewrite_buf_blocks) to newfd.
    // On failure: log the error, close newfd and jump to cleanup.

    if (aofRewriteBufferWrite(newfd) == -1) {

        redisLog(REDIS_WARNING,

        "Error trying to flush the parent diff to the rewritten AOF: %s",

        strerror(errno));

        close(newfd);

        goto cleanup;

    }

    ......

    }

// 将累积的更新缓存server.aof_rewrite_buf_blocks 同步到磁盘

/* Write the buffer (possibly composed of multiple blocks) into the specified
 * fd. If a short write or any other error happens -1 is returned,
 * otherwise the number of bytes written is returned. */

    /* Write every non-empty block of server.aof_rewrite_buf_blocks to fd,
     * in list order.
     *
     * Returns the total number of bytes written, or -1 as soon as one
     * write(2) does not write the whole block. When write(2) returns 0,
     * errno is set to EIO so the caller has an error to report. */
    ssize_t aofRewriteBufferWrite(int fd) {
        listIter iter;
        listNode *node;
        ssize_t total = 0;

        listRewind(server.aof_rewrite_buf_blocks,&iter);
        for (node = listNext(&iter); node; node = listNext(&iter)) {
            aofrwblock *blk = listNodeValue(node);
            ssize_t written;

            /* Skip blocks that hold no data. */
            if (!blk->used) continue;

            written = write(fd,blk->buf,blk->used);
            if (written != blk->used) {
                if (written == 0) errno = EIO;
                return -1;
            }
            total += written;
        }
        return total;
    }

2）边服务边备份的方式，即 Redis 服务器会把所有的数据变更存储在 server.aof_buf 中，并在特定时机将更新缓存写入预设定的文件（server.aof_filename）。特定时机有三种：

进入事件循环之前
Redis 服务器定时程序 serverCron() 中
停止 AOF 策略的 stopAppendOnly() 中

Redis 无非是不想服务器突然崩溃终止，导致过多的数据丢失。Redis 默认是每隔固定时间进行一次边服务边备份，即隔固定时间将累积的变更写入文件。

下面是边服务边执行 AOF 持久化的主要代码：

// 同步磁盘；将所有累积的更新server.aof_buf 写入磁盘

/* Write the append only file buffer on disk.
 *
 * Since we are required to write the AOF before replying to the client,
 * and the only way the client socket can get a write is entering the
 * event loop, we accumulate all the AOF writes in a memory
 * buffer and write it on disk using this function just before entering
 * the event loop again.
 *
 * About the 'force' argument:
 *
 * When the fsync policy is set to 'everysec' we may delay the flush if there
 * is still an fsync() going on in the background thread, since for instance
 * on Linux write(2) will be blocked by the background fsync anyway.
 * When this happens we remember that there is some aof buffer to be
 * flushed ASAP, and will try to do that in the serverCron() function.
 *
 * However if force is set to 1 we'll write regardless of the background
 * fsync. */

/* Write server.aof_buf to server.aof_fd and fsync according to
 * server.aof_fsync.
 *
 * force: when zero and the policy is AOF_FSYNC_EVERYSEC, the write may be
 * postponed while a background fsync job is still pending. A non-zero
 * value writes regardless.
 *
 * A failed or short write is fatal: the error is logged and the process
 * exits with status 1. */
void flushAppendOnlyFile(int force) {
    const int everysec = (server.aof_fsync == AOF_FSYNC_EVERYSEC);
    size_t pending = sdslen(server.aof_buf);
    int fsync_busy = 0;
    int child_active;
    ssize_t written;

    /* Nothing buffered, nothing to do. */
    if (pending == 0) return;

    /* Only the everysec policy consults the background fsync job queue. */
    if (everysec)
        fsync_busy = (bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0);

    /* With this append fsync policy we do background fsyncing.
     * If the fsync is still in progress we can try to delay
     * the write for a couple of seconds. */
    if (everysec && !force && fsync_busy) {
        if (server.aof_flush_postponed_start == 0) {
            /* No previous write postponing, remember that we are
             * postponing the flush and return. */
            server.aof_flush_postponed_start = server.unixtime;
            return;
        }
        if (server.unixtime - server.aof_flush_postponed_start < 2) {
            /* We were already waiting for fsync to finish, but for less
             * than two seconds this is still ok. Postpone again. */
            return;
        }
        /* Otherwise fall through, and go write since we can't wait
         * over two seconds. */
        server.aof_delayed_fsync++;
        redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk"
            " is busy?). Writing the AOF buffer without waiting for fsync to "
            "complete, this may slow down Redis.");
    }

    /* If you are following this code path, then we are going to write so
     * reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
    written = write(server.aof_fd,server.aof_buf,pending);
    if (written != (signed)pending) {
        /* Ooops, we are in troubles. The best thing to do for now is
         * aborting instead of giving the illusion that everything is
         * working as expected. */
        if (written == -1) {
            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only"
                " file: %s",strerror(errno));
        } else {
            redisLog(REDIS_WARNING,"Exiting on short write while writing to "
                "the append-only file: %s (nwritten=%ld, "
                "expected=%ld)",
                strerror(errno),
                (long)written,
                (long)pending);

            /* Try to cut the partial record back off the file. */
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                redisLog(REDIS_WARNING, "Could not remove short write "
                    "from the append-only file. Redis may refuse "
                    "to load the AOF the next time it starts. "
                    "ftruncate: %s", strerror(errno));
            }
        }
        exit(1);
    }

    /* Account for the bytes just appended. */
    server.aof_current_size += written;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((pending+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    child_active = (server.aof_child_pid != -1 || server.rdb_child_pid != -1);
    if (server.aof_no_fsync_on_rewrite && child_active) return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        server.aof_last_fsync = server.unixtime;
    } else if (everysec && server.unixtime > server.aof_last_fsync) {
        /* Hand the fsync to the background unless one is already pending. */
        if (!fsync_busy) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

细说更新缓存

上面两次提到了「更新缓存」，它即是 Redis 累积的数据变更。

更新缓存可以存储在 server.aof_buf 中，也可以存储在 server.aof_rewrite_buf_blocks 链表中。它们的关系是：每一次数据变更记录都会写入 server.aof_buf 中，同时如果后台子进程在持久化，变更记录还会被写入 server.aof_rewrite_buf_blocks 中。server.aof_buf 会在特定时期写入指定文件，server.aof_rewrite_buf_blocks 会在后台持久化结束后追加到文件。

Redis 源码中是这么实现的：propagate()->feedAppendOnlyFile()->aofRewriteBufferAppend()

注意，feedAppendOnlyFile() 会把更新添加到server.aof_buf；接下来会有一个判断，如果存在 AOF 子进程，则调用aofRewriteBufferAppend() 将server.aof_buf 中的所有数据插入到 server.aof_rewrite_buf_blocks 链表。这样，就能够理解为什么在AOF 持久化子进程结束后，父进程会将 server.aof_rewrite_buf_blocks 追加到 AOF 文件了。

// 向AOF 和从机发布数据更新

/* Propagate the specified command (in the context of the specified database id)
 * to AOF and Slaves.
 *
 * flags are a bitwise OR of:
 * + REDIS_PROPAGATE_NONE (no propagation of command at all)
 * + REDIS_PROPAGATE_AOF (propagate into the AOF file if is enabled)
 * + REDIS_PROPAGATE_REPL (propagate into the replication link)
 */

/* Forward a command to the AOF and/or to the slaves, depending on flags.
 *
 * The AOF is fed only when server.aof_state is not REDIS_AOF_OFF and
 * REDIS_PROPAGATE_AOF is set in flags. The slaves are fed whenever
 * REDIS_PROPAGATE_REPL is set in flags. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    int aof_enabled = (server.aof_state != REDIS_AOF_OFF);

    if (aof_enabled && (flags & REDIS_PROPAGATE_AOF)) {
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    }

    if (flags & REDIS_PROPAGATE_REPL) {
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
    }
}

    // 将数据更新记录到AOF 缓存中

/* Serialize one command and hand it to the AOF buffers.
 *
 * The serialized text is prefixed with a SELECT when dictid differs from
 * server.aof_selected_db. EXPIRE/PEXPIRE/EXPIREAT are translated into
 * PEXPIREAT, and SETEX/PSETEX into SET followed by PEXPIREAT. Every other
 * command is emitted as is.
 *
 * The result is appended to server.aof_buf when the AOF state is
 * REDIS_AOF_ON, and also to the rewrite buffer when a rewrite child is
 * registered in server.aof_child_pid. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv,
                        int argc) {
    sds out = sdsempty();
    int is_expire = (cmd->proc == expireCommand ||
                     cmd->proc == pexpireCommand ||
                     cmd->proc == expireatCommand);
    int is_setex = (cmd->proc == setexCommand || cmd->proc == psetexCommand);

    /* The DB this command was targeting is not the same as the last command
     * we appended. To issue a SELECT command is needed. */
    if (dictid != server.aof_selected_db) {
        char dbnum[64];

        snprintf(dbnum,sizeof(dbnum),"%d",dictid);
        out = sdscatprintf(out,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(dbnum),dbnum);
        server.aof_selected_db = dictid;
    }

    if (is_expire) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else if (is_setex) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        robj *setargv[3];

        setargv[0] = createStringObject("SET",3);
        setargv[1] = argv[1];
        setargv[2] = argv[3];
        out = catAppendOnlyGenericCommand(out,3,setargv);
        /* Only the temporary "SET" object was created here. */
        decrRefCount(setargv[0]);
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else {
        /* All the other commands don't need translation or need the
         * same translation already operated in the command vector
         * for the replication itself. */
        out = catAppendOnlyGenericCommand(out,argc,argv);
    }

    /* Append to the AOF buffer. This will be flushed on disk just before
     * of re-entering the event loop, so before the client will get a
     * positive reply about the operation performed. */
    if (server.aof_state == REDIS_AOF_ON) {
        server.aof_buf = sdscatlen(server.aof_buf,out,sdslen(out));
    }

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. */
    if (server.aof_child_pid != -1) {
        aofRewriteBufferAppend((unsigned char*)out,sdslen(out));
    }

    sdsfree(out);
}

    // 将数据更新记录写入server.aof_rewrite_buf_blocks，此函数只由

    // feedAppendOnlyFile() 调用

    /* Append data to the AOF rewrite buffer, allocating new blocks if needed. */

/* Append len bytes starting at s to the AOF rewrite buffer.
 *
 * Data is first copied into the free space of the list's last block. When
 * bytes remain, a new block is allocated and added at the tail, and the
 * loop continues until everything is stored. Growth of the list is logged
 * at notice or warning level at fixed block-count intervals. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *tail = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *cur = tail ? tail->value : NULL;

    while (len) {
        /* If we already got at least an allocated block, try appending
         * at least some piece into it. */
        if (cur) {
            unsigned long chunk = len;

            if (cur->free < len) chunk = cur->free;
            if (chunk) { /* The current block is not already full. */
                memcpy(cur->buf+cur->used, s, chunk);
                cur->used += chunk;
                cur->free -= chunk;
                s += chunk;
                len -= chunk;
            }
        }

        /* Everything stored: no further block is needed. */
        if (!len) break;

        /* First block to allocate, or need another block. */
        {
            int numblocks;

            cur = zmalloc(sizeof(*cur));
            cur->free = AOF_RW_BUF_BLOCK_SIZE;
            cur->used = 0;
            listAddNodeTail(server.aof_rewrite_buf_blocks,cur);

            /* Log every time we cross more 10 or 100 blocks, respectively
             * as a notice or warning. */
            numblocks = listLength(server.aof_rewrite_buf_blocks);
            if (((numblocks+1) % 10) == 0) {
                int level = REDIS_NOTICE;

                if (((numblocks+1) % 100) == 0) level = REDIS_WARNING;
                redisLog(level,"Background AOF buffer size: %lu MB",
                    aofRewriteBufferSize()/(1024*1024));
            }
        }
    }
}

一幅可以缓解视力疲劳的图片——AOF 持久化运作机制：

两种数据落地的方式，就是 AOF 的两个主线。因此，redis AOF 持久化机制有两条主线：后台执行和边服务边备份，抓住这两点就能理解 redis AOF 了。

这里有一个疑问，两条主线都会涉及文件的写：后台执行会写一个AOF 文件，边服务边备份也会写一个，以哪个为准？

后台持久化的数据首先会被写入“temp-rewriteaof-bg-%d.aof”，其中“%d”是AOF 子进程 id；待 AOF 子进程结束后，“temp-rewriteaof-bg-%d.aof”会被以追加的方式打开，继而写入 server.aof_rewrite_buf_blocks 中的更新缓存，最后“temp-rewriteaof-bg-%d.aof”文件被命名为 server.aof_filename，所以之前的名为 server.aof_filename 的文件会被删除，也就是说边服务边备份写入的文件会被删除。边服务边备份的数据会被一直写入到 server.aof_filename文件中。

因此，确实会产生两个文件，但是最后都会变成 server.aof_filename 文件。这里可能还有一个疑问，既然有了后台持久化，为什么还要边服务边备份？边服务边备份时间长了会产生数据冗余甚至备份过旧的数据，而后台持久化可以消除这些东西。看，这里是 Redis 的双保险。

AOF 恢复过程

AOF 的数据恢复过程设计很巧妙，它模拟一个 Redis 的服务过程。Redis 首先虚拟一个客户端，读取 AOF 文件恢复 Redis 命令和参数；接着过程就和服务客户端一样执行命令相应的函数，从而恢复数据，这样做的目的无非是提高代码的复用率。这些过程主要在 loadAppendOnlyFile() 中实现。

// 加载AOF 文件，恢复数据

/* Replay the append log file. On success REDIS_OK is returned. On non fatal
 * error (the append only file is zero-length) REDIS_ERR is returned. On
 * fatal error an error message is logged and the program exits. */

/* Rebuild the dataset by replaying the commands stored in the AOF file.
 *
 * Each command is parsed from the file, looked up in the command table and
 * executed through a fake client, the same way a real client request would
 * run.
 *
 * Returns REDIS_OK after a full replay and REDIS_ERR when the file is
 * zero-length. When the file cannot be opened, is truncated, is malformed,
 * or contains an unknown command, the error is logged and the process
 * exits with status 1. */
int loadAppendOnlyFile(char *filename) {
    struct redisClient *fakeClient;
    FILE *fp = fopen(filename,"r");
    struct redis_stat sb;
    int old_aof_state = server.aof_state;
    long loops = 0;

    if (fp == NULL) {
        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file "
            "for reading: %s",strerror(errno));
        exit(1);
    }

    /* An empty file is a non fatal condition: there is nothing to load. */
    if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
        server.aof_current_size = 0;
        fclose(fp);
        return REDIS_ERR;
    }

    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
     * to the same file we're about to read. */
    server.aof_state = REDIS_AOF_OFF;

    fakeClient = createFakeClient();
    startLoading(fp);

    for (;;) {
        int argc, i;
        unsigned long arglen;
        robj **argv;
        char line[128];
        sds arg;
        struct redisCommand *cmd;

        /* Serve the clients from time to time */
        if (loops++ % 1000 == 0) {
            loadingProgress(ftello(fp));
            aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);
        }

        /* Read the "*<argc>" header. A clean EOF here ends the replay. */
        if (fgets(line,sizeof(line),fp) == NULL) {
            if (!feof(fp)) goto readerr;
            break;
        }
        if (line[0] != '*') goto fmterr;
        argc = atoi(line+1);
        if (argc < 1) goto fmterr;

        /* Read every "$<len>" header followed by the argument payload. */
        argv = zmalloc(sizeof(robj*)*argc);
        for (i = 0; i < argc; i++) {
            if (fgets(line,sizeof(line),fp) == NULL) goto readerr;
            if (line[0] != '$') goto fmterr;
            arglen = strtol(line+1,NULL,10);
            arg = sdsnewlen(NULL,arglen);
            if (arglen && fread(arg,arglen,1,fp) == 0) goto fmterr;
            argv[i] = createObject(REDIS_STRING,arg);
            if (fread(line,2,1,fp) == 0) goto fmterr; /* discard CRLF */
        }

        /* Command lookup */
        cmd = lookupCommand(argv[0]->ptr);
        if (!cmd) {
            redisLog(REDIS_WARNING,"Unknown command '%s' reading the "
                "append only file", (char*)argv[0]->ptr);
            exit(1);
        }

        /* Run the command in the context of a fake client */
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        cmd->proc(fakeClient);

        /* The fake client should not have a reply */
        redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
        /* The fake client should never get blocked */
        redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);

        /* Clean up. Command code may have changed argv/argc so we use the
         * argv/argc of the client instead of the local variables. */
        for (i = 0; i < fakeClient->argc; i++) {
            decrRefCount(fakeClient->argv[i]);
        }
        zfree(fakeClient->argv);
    }

    /* This point can only be reached when EOF is reached without errors.
     * If the client is in the middle of a MULTI/EXEC, log error and quit. */
    if (fakeClient->flags & REDIS_MULTI) goto readerr;

    fclose(fp);
    freeFakeClient(fakeClient);
    /* Put back the AOF state that was disabled for the replay. */
    server.aof_state = old_aof_state;
    stopLoading();
    /* Refresh the recorded AOF size and use it as the rewrite base size. */
    aofUpdateCurrentSize();
    server.aof_rewrite_base_size = server.aof_current_size;
    return REDIS_OK;

readerr:
    if (feof(fp)) {
        redisLog(REDIS_WARNING,"Unexpected end of file reading the append "
            "only file");
    } else {
        redisLog(REDIS_WARNING,"Unrecoverable error reading the append only "
            "file: %s", strerror(errno));
    }
    exit(1);

fmterr:
    redisLog(REDIS_WARNING,"Bad file format reading the append only file: "
        "make a backup of your AOF file, then use ./redis-check-aof --fix "
        "<filename>");
    exit(1);
}

AOF 的适用场景

如果对数据比较关心，分秒必争，可以用 AOF 持久化，而且AOF 文件很容易进行分析。