关于select与epoll的区别,网上的文章已是一大堆。不过别人的终究是别人的,总得自己去理解才更深刻。于是在阅读了大量的文章后,再装模作样地看了下源码,写下了自己的一些理解。

  在开始之前,要明白linux中分用户空间、内核空间,这相当于两块不能直接相互访问的内存。而用户程序要访问设备,包括网络、读写文件,都需要调用内核的相关函数。而调用内核相关函数,则往往需要从用户空间往内核拷贝一些数据,反之亦然。当调用非常频繁,这个拷贝的消耗也是不能忽略的。具体请参考:http://www.kerneltravel.net/jiaoliu/005.htm

  select相关函数的源代码http://lxr.free-electrons.com/source/fs/select.c

  epoll相关函数的源代码http://lxr.free-electrons.com/source/fs/eventpoll.c

  • select过程
  1. select函数为入口,完成超时结构体的copy,并调用core_sys_select处理文件描述符
/*
 * select() system call entry point.
 *
 * When the caller passes a timeout (tvp != NULL) the timeval is copied in
 * from user space and converted into an end-time timespec by
 * poll_select_set_timeout().  The fd sets themselves are handled by
 * core_sys_select(); afterwards poll_select_copy_remaining() post-processes
 * the timeout for user space and produces the final return value.
 *
 * Returns -EFAULT if the timeval cannot be copied from user space, -EINVAL
 * if poll_select_set_timeout() rejects it, otherwise the value produced by
 * poll_select_copy_remaining().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		/* A timeout was given: copy the time structure from user to kernel space. */
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		/* Normalise sec/usec (usec may exceed one second) into *to. */
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	/* Copy the fd sets in, then run do_select() on them. */
	ret = core_sys_select(n, inp, outp, exp, to);
	/*
	 * Write the processed timeout information back to user space.
	 * NOTE(review): the third argument (1) is taken to mean "tvp is a
	 * struct timeval" -- confirm against the helper's prototype.
	 */
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}
  2. core_sys_select将文件描述符copy到内核空间,调用do_select进行处理,完成后再拷贝回用户空间
/*
 * Copy the three user fd sets into kernel memory, run do_select() on them
 * and copy the three result sets back to user space.
 *
 * @n:        number of descriptors to examine; clamped to the current
 *            fdtable's max_fds
 * @inp/@outp/@exp: user-space read/write/exception sets (handed to
 *            get_fd_set(), which accepts NULL)
 * @end_time: timeout forwarded unchanged to do_select()
 *
 * Returns the do_select() result (number of ready bits) on success, 0 when
 * nothing is ready and no signal is pending, -EINVAL for n < 0, -ENOMEM if
 * the bitmap allocation fails, -ERESTARTNOHAND if nothing is ready and a
 * signal is pending, or -EFAULT if a user copy fails.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
		    fd_set __user *exp, struct timespec *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	unsigned int size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	/* Carve the single buffer into six consecutive bitmaps of `size` bytes. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	/* get_fd_set() only copies the fd sets from user space to kernel space. */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: report a restartable error only if a signal is pending. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	/* set_fd_set() copies the result fd sets from kernel space back to user space. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	/* Only the kmalloc'ed buffer needs freeing, not the on-stack array. */
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

/*
 * Fill the kernel bitmap @fdset from the user bitmap @ufdset.
 *
 * @nr is a descriptor count on entry and is converted to a byte count with
 * FDS_BYTES().  A NULL @ufdset yields an all-zero kernel bitmap.
 *
 * Returns 0 on success or -EFAULT if copy_from_user() fails.
 */
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	nr = FDS_BYTES(nr);
	if (ufdset)
		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;

	memset(fdset, 0, nr);
	return 0;
}
  3. do_select先设置设备事件唤醒函数,初始化等待队列,然后遍历所有文件描述符查找事件。如果找不到,进程休眠,直到被设备唤醒或超时,然后再去遍历所有文件描述符重新查找事件。
/*
 * Core of select(): poll every descriptor named in @fds, and sleep and
 * rescan until at least one is ready, the timeout expires, a signal is
 * pending or the wait table reports an error.
 *
 * @n:        upper bound on descriptors; replaced by max_select_fd()'s result
 * @fds:      the in/out/ex request bitmaps and res_in/res_out/res_ex results
 * @end_time: timeout, or NULL for none; an all-zero value means "poll once,
 *            never sleep"
 *
 * Returns the number of ready bits recorded in the result bitmaps, 0 on
 * timeout or pending signal with nothing ready, a negative value from
 * max_select_fd(), or table.error.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;	/* note: this is the wait-queue table */
	poll_table *wait;
	int retval, i, timed_out = 0;
	unsigned long slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_end = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	/*
	 * Initialise the wait-queue bookkeeping and the callback pointer the
	 * poll methods use.  Once the task sleeps, a device that gets an
	 * event uses that registration to wake the current task.
	 */
	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		/* Zero timeout: do not register on any wait queue, scan only once. */
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {	/* loop so the fds can be rescanned after each wake-up */
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* Walk every descriptor looking for read, write or exception events. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* No fd requested in this word: skip it whole. */
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * Use the file's poll method if it has
					 * one (per the article, sock_poll for
					 * sockets); if the driver provides
					 * none, DEFAULT_POLLMASK stands.
					 */
					if (f_op->poll) {
						wait_key_set(wait, in, out,
							     bit, busy_flag);
						/*
						 * Fetch the current device
						 * state.  `wait` is passed but
						 * this does not block; it only
						 * registers the callback.
						 */
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					/* Test the returned mask against each requested event class. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		/* Registration is only needed on the first pass. */
		wait->_qproc = NULL;
		/* If there is already a result (or nothing to wait for), return. */
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;
			}
			if (!busy_loop_timeout(busy_end))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		/* Block here until woken up or until the timeout expires. */
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			/*
			 * Timed out.  The enclosing for (;;) exists so the
			 * fds are checked one more time after the timeout.
			 */
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}
  • epoll过程
  1. epoll_create创建一个epoll结构(struct eventpoll),并初始化等待队列、就绪链表以及保存被监听文件的红黑树。其实这是创建一个匿名inode文件(anon_inode_getfile),其数据结构位于内核空间,用户程序只拿到与之对应的文件描述符(注意:这里并没有使用mmap)。
/*
 * epoll_create1() system call: allocate a struct eventpoll, reserve a file
 * descriptor, create the "[eventpoll]" anonymous-inode file whose private
 * data is the eventpoll, and install that file on the descriptor.
 *
 * @flags: only EPOLL_CLOEXEC is accepted.
 *
 * Returns the new descriptor on success, -EINVAL for unknown flag bits, or
 * the error from ep_alloc(), get_unused_fd_flags() or anon_inode_getfile().
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));	/* reserve a file descriptor */
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

/*
 * Allocate and initialise a struct eventpoll and store it in *@pep.
 *
 * Takes a reference on the current user (get_current_user()) which is kept
 * in ep->user on success and dropped with free_uid() on failure.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);	/* zeroed allocation in kernel memory */
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);		/* wait queue that ep_poll() sleeps on */
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);		/* ready list */
	ep->rbr = RB_ROOT;			/* empty RB tree (searched by ep_find()) */
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}
  2. epoll_ctl来控制epoll结构。即负责epoll中监听链表的增、删、查、改。注意这里可能会产生一次用户空间到内核空间的拷贝。
/*
 * epoll_ctl() system call: add, remove or modify the entry for target
 * descriptor @fd inside the epoll instance referred to by @epfd.
 *
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @event: user-space event description; copied in only when
 *         ep_op_has_event(op) is true
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or:
 *   -EFAULT  the event could not be copied from user space
 *   -EBADF   @epfd or @fd is not an open descriptor
 *   -EPERM   the target file has no poll method
 *   -EINVAL  @epfd is not an epoll file, @fd is the same file as @epfd,
 *            or @op is none of the three operations
 *   -ELOOP   ep_loop_check() reported a problem for a nested epoll add
 *   -EEXIST  ADD of an entry that already exists
 *   -ENOENT  DEL/MOD of an entry that does not exist
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))	/* a user->kernel copy may happen here */
		goto error_return;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tf.file->f_op->poll)
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better be handled here, than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	mutex_lock_nested(&ep->mtx, 0);
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
						is_file_epoll(tf.file)) {
			/*
			 * Nested case: drop ep->mtx, take the global epmutex
			 * first, then re-take ep->mtx further down.
			 */
			full_check = 1;
			mutex_unlock(&ep->mtx);
			mutex_lock(&epmutex);
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0) {
					clear_tfile_check_list();
					goto error_tgt_fput;
				}
			} else
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			mutex_lock_nested(&ep->mtx, 0);
			if (is_file_epoll(tf.file)) {
				/* Target is itself an epoll file: lock it too, as subclass 1. */
				tep = tf.file->private_data;
				mutex_lock_nested(&tep->mtx, 1);
			}
		}
	}

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	/* Stays -EINVAL when op matches none of the cases below. */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* POLLERR and POLLHUP are always added to the event mask. */
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check)
		mutex_unlock(&epmutex);

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}
  3. epoll_wait只做一些容错预处理,然后调用ep_poll
/*
 * epoll_wait() system call: validate the arguments, resolve @epfd to its
 * struct eventpoll and hand the actual waiting to ep_poll().
 *
 * @events:    user buffer that receives the ready events
 * @maxevents: capacity of @events; must be in 1..EP_MAX_EVENTS
 * @timeout:   forwarded unchanged to ep_poll()
 *
 * Returns ep_poll()'s result, -EINVAL for a bad @maxevents or a non-epoll
 * @epfd, -EFAULT if @events is not a writable user area, or -EBADF if
 * @epfd is not an open descriptor.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fdput(f);
	return error;
}
  4. ep_poll初始化等待队列,并将唤醒回调设置为往就绪队列添加设备,再唤醒进程。这样,进程只需要检测就绪队列是否为空,如果为空,则休眠直到超时或被唤醒。
/*
 * Wait for events on @ep and transfer them to the user buffer.
 *
 * @events/@maxevents: user buffer and its capacity, passed on to
 *                     ep_send_events()
 * @timeout: > 0 converts via ep_set_mstimeout() into an absolute expiry;
 *           == 0 skips the wait loop entirely (non-blocking check);
 *           < 0 leaves `to` NULL, i.e. no expiry is handed to
 *           schedule_hrtimeout_range()
 *
 * Returns ep_send_events()'s result, 0 if nothing became available before
 * the timeout, or -EINTR if a signal was pending while waiting.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	long slack = 0;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);

	if (!ep_events_available(ep)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		/*
		 * Queue the current task on ep->wq.  When a device has an
		 * event it is first added to the ready list and then the
		 * task is woken up.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Drop the lock across the sleep, re-take it afterwards. */
			spin_unlock_irqrestore(&ep->lock, flags);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))	/* go to sleep */
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);	/* leave the wait queue */

		set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

/*
 * Report whether @ep has events pending: true when the ready list is
 * non-empty or when ovflist is not the EP_UNACTIVE_PTR sentinel.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}

  总结一下,select和epoll的流程如下:

如果要比性能,那么大概有以下的区别:

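  • 每一次select,都需要把文件描述符集合在用户空间与内核空间之间拷贝两次(调用时拷入,返回时拷出);而epoll只在epoll_ctl添加或修改文件描述符时拷入一次,之后每次epoll_wait只需把已就绪的事件拷贝到用户空间(注意:epoll并没有使用mmap与用户空间共享内存,就绪事件是由ep_send_events写入用户缓冲区的)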
  • 每次select,都需要遍历所有的文件描述符(如果第一次未有事件,则是遍历两次);而epoll只是查询一下就绪列表是否为空。

  一句话,select是你每天起床都去各个快递公司问是否有自己的快递,而epoll是每天起床到门口的邮箱查下是否有自己的快递。

select与epoll分析的更多相关文章

  1. select, poll, epoll的实现分析

    select, poll, epoll都是Linux上的IO多路复用机制.知其然知其所以然,为了更好地理解其底层实现,这几天我阅读了这三个系统调用的源码. 以下源代码摘自Linux4.4.0内核. 预 ...

  2. Linux下select&poll&epoll的实现原理(一)

    最近简单看了一把 linux-3.10.25 kernel中select/poll/epoll这个几个IO事件检测API的实现.此处做一些记录.其基本的原理是相同的,流程如下 先依次调用fd对应的st ...

  3. 多进程、协程、事件驱动及select poll epoll

    目录 -多线程使用场景 -多进程 --简单的一个多进程例子 --进程间数据的交互实现方法 ---通过Queues和Pipe可以实现进程间数据的传递,但是不能实现数据的共享 ---Queues ---P ...

  4. [转]谈谈select, iocp, epoll,kqueue及各种网络I/O复用机制

    参考原文:再谈select, iocp, epoll,kqueue及各种I/O复用机制 一.I/O模型概述 介绍几种常见的I/O模型及其区别,如下: blocking I/O nonblocking ...

  5. 【转】select和epoll模型的差异

    http://www.cppblog.com/converse/archive/2008/10/12/63836.html epoll为什么这么快 epoll是多路复用IO(I/O Multiplex ...

  6. select.poll,epoll的区别与应用

    先讲讲同步I/O的五大模型 阻塞式I/O, 非阻塞式I/O, I/O复用,信号驱动I/O(SIGIO),异步I/O模型 而select/poll/epoll属于I/O复用模型 select函数 该函数 ...

  7. select poll epoll三者之间的比较

    一.概述 说到Linux下的IO复用,系统提供了三个系统调用,分别是select poll epoll.那么这三者之间有什么不同呢,什么时候使用三个之间的其中一个呢? 下面,我将从系统调用原型来分析其 ...

  8. Linux下select&poll&epoll的实现原理(一)【转】

    转自:http://www.cnblogs.com/lanyuliuyun/p/5011526.html 最近简单看了一把 linux-3.10.25 kernel中select/poll/epoll ...

  9. Java IO 学习(二)select/poll/epoll

    如上文所说,select/poll/epoll本质上都是同步阻塞的,但是由于实现了IO多路复用,在处理聊天室这种需要处理大量长连接但是每个连接上数据事件较少的场景时,相比最原始的为每个连接新开一个线程 ...

随机推荐

  1. CSS3:优雅地绘制不规则ICON

    早上在w3ctech上看到 中国第二届CSS Conf总结  的时候,真是开心极了: 自从去年在慕课网上看了第一届CSS conf 视频之后,整个人都震惊了,原来还有大会是专门用来讨论CSS的,而且分 ...

  2. Android Sensor Test

    魅蓝note可用 [{Sensor name="MPL Gyroscope", vendor="Invensense", version=1, type=4, ...

  3. repo的小结

    repo仅仅是google用Python脚本写的调用git的一个脚本,主要是用来下载.管理Android项目的软件仓库. 1. 下载 repo 的地址: http://android.git.kern ...

  4. LF will be replaced by CRLF问题解决方法

    [GIT] warning: LF will be replaced by CRLF问题解决方法 开发环境: 操作系统: windows xp ruby 1.9.2 rails 3.1.3 git v ...

  5. 文件操作2 cp mv rm

    1.cp命令 [root@rusky /]# cp 123 /test  #在linux系统中,如果文件123已经存在,则提示用户确认,在unix系统中则不提示,除非使用参数-i 交互式操作.cp: ...

  6. 部署hibernate框架项目时出现问题:The type java.lang.Object cannot be resolved. It is indirectly referenced from required .class files.

    基本情况: (这些其实关系不大)我是直接impor导入HibernateDemo项目到eclipse中的,该项目的hibernate版本是3.6.7.Final版,使用了Hibernate Tools ...

  7. URL与URI的区别

    URI—Universal Resource Identifier通用资源标志符Web上可用的每种资源如HTML文档.图像.视频片段.程序等都是一个来URI来定位的URI一般由三部组成①访问资源的命名 ...

  8. (转) 将VB.NET网站转换成C#的全过程

    在学习URL重写过程中碰到个是VB写的源码,看起来总是不爽的就GOOLE了下 感觉这个文章写的不错 原文地址 http://www.cnblogs.com/cngunner/archive/2006/ ...

  9. (转)JQuery处理json与ajax返回JSON实例

    son数据是一种经型的实时数据交互的数据存储方法,使用到最多的应该是ajax与json配合使用了,下面我来给大家介绍jquery处理json数据方法. 一.JSON的一些基础知识. JSON中对象通过 ...

  10. 从Ueditor跨域上传,总结的一次跨域上传的爬坑经历

    项目内其中一个管理后台需要发布文章,需要一个富文本编辑器,经过一番选择后,最终选择了百度的Ueditor. 由于上传的文件是上传到另一台专门存放图片等静态资源的服务器上面的,所以就涉及到了跨域上传. ...