Redis 源码走读（一）事件驱动机制与命令处理

eventloop

从 server.c 的 main 方法看起

int main(int argc, char **argv) {

.......

    aeSetBeforeSleepProc(server.el,beforeSleep);

    aeSetAfterSleepProc(server.el,afterSleep);

    aeMain(server.el);

    aeDeleteEventLoop(server.el);

    return ;

}

aeMain.c

//在死循环中调用 aeProcessEvents 方法，处理可以执行的 time event 与 file event

// 在 server.c 的 main 函数中会被调用

void aeMain(aeEventLoop *eventLoop) {

    eventLoop->stop = ;

    while (!eventLoop->stop) {

        if (eventLoop->beforesleep != NULL)

            eventLoop->beforesleep(eventLoop);

        aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP);

    }

}

/* Process every pending time event, then every pending file event

 * (that may be registered by time event callbacks just processed).

 * Without special flags the function sleeps until some file event

 * fires, or when the next time event occurs (if any).

 *

 * If flags is 0, the function does nothing and returns.

 * if flags has AE_ALL_EVENTS set, all the kind of events are processed.

 * if flags has AE_FILE_EVENTS set, file events are processed.

 * if flags has AE_TIME_EVENTS set, time events are processed.

 * if flags has AE_DONT_WAIT set the function returns ASAP until all

 * if flags has AE_CALL_AFTER_SLEEP set, the aftersleep callback is called.

 * the events that's possible to process without to wait are processed.

 *

 * The function returns the number of events processed. */

int aeProcessEvents(aeEventLoop *eventLoop, int flags)

{

........

    /* Note that we want call select() even if there are no

     * file events to process as long as we want to process time

     * events, in order to sleep until the next time event is ready

     * to fire. */

    //优先执行 time event

    if (eventLoop->maxfd != - ||

        ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {

        int j;

        aeTimeEvent *shortest = NULL;

        struct timeval tv, *tvp;

        if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))

            //找到time event 链表里，最近的 time event

            shortest = aeSearchNearestTimer(eventLoop);

        //计算从现在起到这个time event 被执行，要等待多久

        if (shortest) {

            long now_sec, now_ms;

            aeGetTime(&now_sec, &now_ms);

            tvp = &tv;

            /* How many milliseconds we need to wait for the next

             * time event to fire? */

            long long ms =

                (shortest->when_sec - now_sec)* +

                shortest->when_ms - now_ms;

            if (ms > ) {

                tvp->tv_sec = ms/;

                tvp->tv_usec = (ms % )*;

            } else {

                tvp->tv_sec = ;

                tvp->tv_usec = ;

            }

        } else {

            /* If we have to check for events but need to return

             * ASAP because of AE_DONT_WAIT we need to set the timeout

             * to zero */

            if (flags & AE_DONT_WAIT) {

                tv.tv_sec = tv.tv_usec = ;

                tvp = &tv;

            } else {

                /* Otherwise we can block */

                tvp = NULL; /* wait forever */

            }

        }

        /* Call the multiplexing API, will return only on timeout or when

         * some event fires. */

        //调用 IO 多路复用的代码，找到可读写的 file event

        numevents = aeApiPoll(eventLoop, tvp);

        /* After sleep callback. */

        if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP)

            eventLoop->aftersleep(eventLoop);

        //遍历 event loop 的 fired 数组对应的 fd

        for (j = ; j < numevents; j++) {

            aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];

            int mask = eventLoop->fired[j].mask;//记录了事件类型：read/write

            int fd = eventLoop->fired[j].fd;//事件的 fd

            int fired = ; /* Number of events fired for current fd. */

            /* Normally we execute the readable event first, and the writable

             * event laster. This is useful as sometimes we may be able

             * to serve the reply of a query immediately after processing the

             * query.

             *

             * However if AE_BARRIER is set in the mask, our application is

             * asking us to do the reverse: never fire the writable event

             * after the readable. In such a case, we invert the calls.

             * This is useful when, for instance, we want to do things

             * in the beforeSleep() hook, like fsynching a file to disk,

             * before replying to a client. */

            int invert = fe->mask & AE_BARRIER;

        /* Note the "fe->mask & mask & ..." code: maybe an already

             * processed event removed an element that fired and we still

             * didn't processed, so we check if the event is still valid.

             *

             * Fire the readable event if the call sequence is not

             * inverted. */

            if (!invert && fe->mask & mask & AE_READABLE) {

                fe->rfileProc(eventLoop,fd,fe->clientData,mask);

                fired++;

            }

            /* Fire the writable event. */

            if (fe->mask & mask & AE_WRITABLE) {

                if (!fired || fe->wfileProc != fe->rfileProc) {

                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);

                    fired++;

                }

            }

            /* If we have to invert the call, fire the readable event now

             * after the writable one. */

            if (invert && fe->mask & mask & AE_READABLE) {

                if (!fired || fe->wfileProc != fe->rfileProc) {

                    fe->rfileProc(eventLoop,fd,fe->clientData,mask);

                    fired++;

                }

            }

            processed++;

        }

    }

    /* Check time events */

    if (flags & AE_TIME_EVENTS)

        processed += processTimeEvents(eventLoop);

    return processed; /* return the number of processed file/time events */

}

标准的事件驱动框架，在死循环中调用aeProcessEvents方法

aeProcessEvents 方法比较长，里面会处理两种事件TimeEvent 与 FileEvent，本文关注的重点是 FileEvent

aeProcessEvents 调用 aeApiPoll 方法来查找监听的 fd 上有哪些是可用的，找到可用的 fd 之后，根据 fd 的事件类型，决定调用 wfileProc 还是rfileProc 来处理相关的事件，本文里我们关心的是 client 发来的 command 会被如何处理，那就是rfileProc了，rfileProc的设置过程在后文中被提及

aeApiPoll 在多个文件中被实现，Redis 用条件编译的手法决定采用哪种实现，很有意思

/* Include the best multiplexing layer supported by this system.

 * The following should be ordered by performances, descending. */

 //用宏实现编译期重载，很稳

#ifdef HAVE_EVPORT

#include "ae_evport.c"

#else

    #ifdef HAVE_EPOLL

    #include "ae_epoll.c"

    #else

        #ifdef HAVE_KQUEUE

        #include "ae_kqueue.c"

        #else

        #include "ae_select.c"

        #endif

    #endif

#endif

就看最经典的 epoll 好了：

typedef struct aeApiState {

    int epfd;

    struct epoll_event *events;

} aeApiState;

//创建eventloop

static int aeApiCreate(aeEventLoop *eventLoop) {

    aeApiState *state = zmalloc(sizeof(aeApiState));

    if (!state) return -;

    state->events = zmalloc(sizeof(struct epoll_event)*eventLoop->setsize);

    if (!state->events) {

        zfree(state);

        return -;

    }

    state->epfd = epoll_create(); /* 1024 is just a hint for the kernel */

    if (state->epfd == -) {

        zfree(state->events);

        zfree(state);

        return -;

    }

    eventLoop->apidata = state;

    return ;

}

static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {

    aeApiState *state = eventLoop->apidata;

    struct epoll_event ee = {}; /* avoid valgrind warning */

    /* If the fd was already monitored for some event, we need a MOD

     * operation. Otherwise we need an ADD operation. */

    int op = eventLoop->events[fd].mask == AE_NONE ?

            EPOLL_CTL_ADD : EPOLL_CTL_MOD;//epoll_ctl函数的 op 参数的可能的取值：EPOLL_CTL_ADD 注册、EPOLL_CTL_MOD 修 改、EPOLL_CTL_DEL 删除

    ee.events = ;

    //同时修改 eventLoop 里 event 的 mask 标记，和关联的 epoll fd 所监听的事件集合

    mask |= eventLoop->events[fd].mask; /* Merge old events */

    if (mask & AE_READABLE) ee.events |= EPOLLIN;

    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;

    ee.data.fd = fd;

    if (epoll_ctl(state->epfd,op,fd,&ee) == -) return -;

    return ;

}

//传入的 tvp 是 epoll 超时时间，如果 tvp 为 null，则永久阻塞

static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {

    aeApiState *state = eventLoop->apidata;

    int retval, numevents = ;

    retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,

            tvp ? (tvp->tv_sec* + tvp->tv_usec/) : -);

    if (retval > ) {

        int j;

        numevents = retval;

        //遍历可读写的 fd

        for (j = ; j < numevents; j++) {

            int mask = ;

            struct epoll_event *e = state->events+j;

            if (e->events & EPOLLIN) mask |= AE_READABLE;

            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;

            if (e->events & EPOLLERR) mask |= AE_WRITABLE;

            if (e->events & EPOLLHUP) mask |= AE_WRITABLE;

            //设置 eventLoop.fired 数组里的元素，这些元素代表可读写的 fd

            eventLoop->fired[j].fd = e->data.fd;

            eventLoop->fired[j].mask = mask;

        }

    }

    return numevents;

}

代码不算复杂，实际上对系统调用做了一层简单的封装

调用 epoll_ctl 方法来注册监听 fd

调用 epoll_wait 方法来等待，直到被监听的 fd 上有事件发生为止

比较有趣的做法是aeFileEvent 结构体里定义了一个 mask 属性来记录这个 fd 被监听的事件，应该是为了便于后续查找。

新 client 建立连接

networking.c

client *createClient(int fd) {

    client *c = zmalloc(sizeof(client));

    //fd == -1，说明这是一个用于执行 lua 脚本的无连接的伪客户端，可以省去一些开销

    /* passing -1 as fd it is possible to create a non connected client.

     * This is useful since all the commands needs to be executed

     * in the context of a client. When commands are executed in other

     * contexts (for instance a Lua script) we need a non connected client. */

    if (fd != -) {

        anetNonBlock(NULL,fd);//将这个 fd 设为 non block 模式

        anetEnableTcpNoDelay(NULL,fd);//调用 setsockopt 方法，禁止使用nagle 算法，确保数据包能尽可能快速的发出去

        if (server.tcpkeepalive)

            anetKeepAlive(NULL,fd,server.tcpkeepalive);

        // 给这个 client 关联的 fd 注册 read 事件处理函数：readQueryFromClient，其定义在文件尾部

        if (aeCreateFileEvent(server.el,fd,AE_READABLE,

            readQueryFromClient, c) == AE_ERR)

        {

            close(fd);

            zfree(c);

            return NULL;

        }

    }

调用 aeCreateFileEvent 方法给这个 fd 注册 read 事件处理函数 readQueryFromClient，也就是设置到这个 fd 的 rfileProc 属性里

int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,

        aeFileProc *proc, void *clientData)

{

    if (fd >= eventLoop->setsize) {

        errno = ERANGE;

        return AE_ERR;

    }

    aeFileEvent *fe = &eventLoop->events[fd];

    if (aeApiAddEvent(eventLoop, fd, mask) == -)

        return AE_ERR;

    fe->mask |= mask;

    if (mask & AE_READABLE) fe->rfileProc = proc;

    if (mask & AE_WRITABLE) fe->wfileProc = proc;

    fe->clientData = clientData;

    if (fd > eventLoop->maxfd)

        eventLoop->maxfd = fd;

    return AE_OK;

}

当 client 发送 command 过来的时候，eventloop 会发现这个 fd 可读，然后调用 readQueryFromClient 进行处理

处理client 发送的 command

//回调函数，这个函数被触发的时候，说明 client 触发了 read 事件

void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {

.....

    /* Time to process the buffer. If the client is a master we need to

     * compute the difference between the applied offset before and after

     * processing the buffer, to understand how much of the replication stream

     * was actually applied to the master state: this quantity, and its

     * corresponding part of the replication stream, will be propagated to

     * the sub-slaves and to the replication backlog. */

    if (!(c->flags & CLIENT_MASTER)) {

        processInputBuffer(c);//非 master

    } else {

        //本机为 master，除了处理 buffer 里的命令，还要解决主从复制的问题

        size_t prev_offset = c->reploff;

        processInputBuffer(c);

        size_t applied = c->reploff - prev_offset;

        if (applied) {

            replicationFeedSlavesFromMasterStream(server.slaves,

                    c->pending_querybuf, applied);

            sdsrange(c->pending_querybuf,applied,-);

        }

    }

}

当 fd 可读时，eventloop 会触发 readQueryFromClient 这个回调函数，再调用 processInputBuffer 函数

/* This function is called every time, in the client structure 'c', there is

 * more query buffer to process, because we read more data from the socket

 * or because a client was blocked and later reactivated, so there could be

 * pending query buffer, already representing a full command, to process. */

void processInputBuffer(client *c) {

.....

        if (c->reqtype == PROTO_REQ_INLINE) {

            if (processInlineBuffer(c) != C_OK) break;

        } else if (c->reqtype == PROTO_REQ_MULTIBULK) {

            if (processMultibulkBuffer(c) != C_OK) break;

        } else {

            serverPanic("Unknown request type");

        }

        /* Multibulk processing could see a <= 0 length. */

        if (c->argc == ) {

            resetClient(c);

        } else {

            /* Only reset the client when the command was executed. */

            //终于开始执行 command 了

            if (processCommand(c) == C_OK) {

                if (c->flags & CLIENT_MASTER && !(c->flags & CLIENT_MULTI)) {

                    /* Update the applied replication offset of our master. */

                    c->reploff = c->read_reploff - sdslen(c->querybuf);

                }

                /* Don't reset the client structure for clients blocked in a

                 * module blocking command, so that the reply callback will

                 * still be able to access the client argv and argc field.

                 * The client will be reset in unblockClientFromModule(). */

                if (!(c->flags & CLIENT_BLOCKED) || c->btype != BLOCKED_MODULE)

                    resetClient(c);

            }

            /* freeMemoryIfNeeded may flush slave output buffers. This may

             * result into a slave, that may be the active client, to be

             * freed. */

            if (server.current_client == NULL) break;

        }

    }

    server.current_client = NULL;

}

调用processCommand 方法，顾名思义，里面会对 client 发来的指令做处理

其实现位于server.c 里

/* If this function gets called we already read a whole

 * command, arguments are in the client argv/argc fields.

 * processCommand() execute the command or prepare the

 * server for a bulk read from the client.

 *

 * If C_OK is returned the client is still alive and valid and

 * other operations can be performed by the caller. Otherwise

 * if C_ERR is returned the client was destroyed (i.e. after QUIT). */

int processCommand(client *c) {

......

    /* Now lookup the command and check ASAP about trivial error conditions

     * such as wrong arity, bad command name and so forth. */

    // 从 command dict 里查找对应的 command 实现，

    c->cmd = c->lastcmd = lookupCommand(c->argv[]->ptr);

    //检查 command 是否存在，以及参数的数量是否正确

    if (!c->cmd) {

        flagTransaction(c);

        addReplyErrorFormat(c,"unknown command '%s'",

            (char*)c->argv[]->ptr);

        return C_OK;

    } else if ((c->cmd->arity >  && c->cmd->arity != c->argc) ||

               (c->argc < -c->cmd->arity)) {

        flagTransaction(c);

        addReplyErrorFormat(c,"wrong number of arguments for '%s' command",

            c->cmd->name);

        return C_OK;

    }

.....

    //前面是检查参数和处理各种异常情况

    /* Exec the command */

    //如果处在 multi 命令开启的事务环境中

    if (c->flags & CLIENT_MULTI &&

        c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&

        c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)

    {

        //把命令放到 queue 里

        queueMultiCommand(c);

        addReply(c,shared.queued);

    } else {

        //执行非事务，普通命令，实现位于本文件的2200多行

        call(c,CMD_CALL_FULL);

        c->woff = server.master_repl_offset;

        if (listLength(server.ready_keys))

            handleClientsBlockedOnKeys();

    }

    return C_OK;

}

这个方法有两个关键点：

1. 调用 lookupCommand 方法查找 client 提交的 command 对应的实现（redis server 启动的时候会初始化一个 dict，里面存放了 command 名称到实现函数的映射关系，去这个 dict 里查就好了）

2. 执行函数，我们先不关注事务，只看最简单的普通命令，那么会调用call 方法

其实现位于 server.c 里

void call(client *c, int flags) {

......

    /* Call the command. */

    dirty = server.dirty;

    start = ustime();

    c->cmd->proc(c);//执行命令

    duration = ustime()-start;//计算命令执行时间

    dirty = server.dirty-dirty;

    if (dirty < ) dirty = ;

....

}

主要是用 cmd 的 proc 属性，一个函数指针来完成实际操作

至于 cmd 和它的 proc 属性，是在上一步的 lookupCommand 方法里被设置的。

例如最简单的 get 方法，就对应于getCommand 这个方法：

    {"get",getCommand,,"rF",,NULL,,,,,},

其具体实现位于t_string.c 里，细节暂时就不跟进了。

现在我们就大致上能理解client 发送的 command 的流转过程了。