Linux NAPI/非NAPI

概述

本文主要介绍二层收包流程，包括NAPI与非NAPI方式；

NAPI方式

数据包到来，第一个数据包产生硬件中断，中断处理程序将设备的napi_struct结构挂在当前cpu的待收包设备链表softnet_data->poll_list中，并触发软中断，软中断执行过程中，遍历softnet_data->poll_list中的所有设备，依次调用其收包函数napi_struct->poll，处理收包过程；

非NAPI方式

每个数据包到来，都会产生硬件中断，中断处理程序将收到的包放入当前cpu的收包队列softnet_data->input_pkt_queue中，并且将非napi设备对应的虚拟设备napi结构softnet_data->backlog挂在当前cpu的待收包设备链表softnet_data->poll_list中，并触发软中断，软中断处理过程中，会调用backlog的回调处理函数process_backlog，将收包队列input_pkt_queue合并到softnet_data->process_queue后面，并依次处理该队列中的数据包；

NAPI方式收包流程

中断上半部

以e100为例：

e100_intr(中断处理程序)->__napi_schedule->____napi_schedule(将设备对应的napi结构加入到当前cpu的待收包处理队列softnet_data->poll_list中，并触发软中断)

数据包到来，第一包产生中断，中断处理程序得到执行，其中关键步骤为调用__napi_schedule(&nic->napi)将设备对应的napi加入到当前cpu的softnet_data->poll_list中；

 static irqreturn_t e100_intr(int irq, void *dev_id)

 {

     struct net_device *netdev = dev_id;

     struct nic *nic = netdev_priv(netdev);

     u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);

     netif_printk(nic, intr, KERN_DEBUG, nic->netdev,

              "stat_ack = 0x%02X\n", stat_ack);

     if (stat_ack == stat_ack_not_ours ||    /* Not our interrupt */

        stat_ack == stat_ack_not_present)    /* Hardware is ejected */

         return IRQ_NONE;

     /* Ack interrupt(s) */

     iowrite8(stat_ack, &nic->csr->scb.stat_ack);

     /* We hit Receive No Resource (RNR); restart RU after cleaning */

     if (stat_ack & stat_ack_rnr)

         nic->ru_running = RU_SUSPENDED;

     if (likely(napi_schedule_prep(&nic->napi))) {

         e100_disable_irq(nic);

         //将该网络设备加入到sd的poll_list中

         __napi_schedule(&nic->napi);

     }

     return IRQ_HANDLED;

 }

将设备对应的napi结构加入到当前cpu的softnet_data->poll_list中，并触发收包软中断；

 /*
  * Schedule NAPI instance @n on the CPU this code is running on.
  *
  * The call to ____napi_schedule() is bracketed by
  * local_irq_save()/local_irq_restore().
  */
 void __napi_schedule(struct napi_struct *n)
 {
     unsigned long irq_state;
     struct softnet_data *cpu_sd;

     local_irq_save(irq_state);

     cpu_sd = this_cpu_ptr(&softnet_data);
     ____napi_schedule(cpu_sd, n);

     local_irq_restore(irq_state);
 }

 /*
  * Append @napi to the tail of @sd->poll_list, then raise the
  * receive softirq (NET_RX_SOFTIRQ).
  */
 static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi)
 {
     struct list_head *pending = &sd->poll_list;

     list_add_tail(&napi->poll_list, pending);
     __raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }

中断下半部

net_rx_action(软中断收包处理程序)->napi_poll(执行设备包处理回调napi_struct->poll)

收包软中断处理程序，软中断触发，说明有设备的数据包到达，此时本处理程序遍历softnet_data->poll_list中的待收包设备，并执行napi中的poll调度，关键代码napi_poll(n, &repoll);

 /*
  * Receive softirq handler.
  *
  * @h: softirq action descriptor (not used in this body)
  *
  * Takes over the current CPU's sd->poll_list, polls each NAPI instance
  * on it through napi_poll(), and stops when either the packet budget
  * (netdev_budget) or the time window (netdev_budget_usecs) is used up.
  * Whatever is left over is put back on sd->poll_list and the softirq is
  * raised again.
  */
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
     struct softnet_data *sd = this_cpu_ptr(&softnet_data);
     unsigned long time_limit = jiffies +
         usecs_to_jiffies(netdev_budget_usecs);
     int budget = netdev_budget;
     LIST_HEAD(list);
     LIST_HEAD(repoll);

     /*
      * Move this CPU's pending poll_list onto the private list and
      * leave poll_list re-initialised (empty), with interrupts off
      * while the list is touched.
      */
     local_irq_disable();
     list_splice_init(&sd->poll_list, &list);
     local_irq_enable();

     /* Walk the private list */
     for (;;) {
         struct napi_struct *n;

         /* Private list drained: leave the loop */
         if (list_empty(&list)) {
             /* No RPS IPI waiting and nothing to repoll: skip the merge below */
             if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                 goto out;
             break;
         }

         /* Take the NAPI instance at the head of the list */
         n = list_first_entry(&list, struct napi_struct, poll_list);

         /*
          * Poll it and charge the returned work against the budget;
          * napi_poll() gets &repoll so it can queue the instance
          * there when it still has work left.
          */
         budget -= napi_poll(n, &repoll);

         /*
          * If the softirq window is exhausted then punt: stop once the
          * overall budget is used up (<= 0) or time_limit has been
          * reached, and account for it in time_squeeze.
          */
         if (unlikely(budget <= 0 ||
                      time_after_eq(jiffies, time_limit))) {
             sd->time_squeeze++;
             break;
         }
     }

     /* Interrupts off while sd->poll_list is rebuilt */
     local_irq_disable();

     /*
      * Rebuild sd->poll_list: entries not yet polled come first, then
      * entries scheduled while we were running, then the repoll entries.
      */
     list_splice_tail_init(&sd->poll_list, &list);
     list_splice_tail(&repoll, &list);
     list_splice(&list, &sd->poll_list);

     /* Still work pending: raise the receive softirq again */
     if (!list_empty(&sd->poll_list))
         __raise_softirq_irqoff(NET_RX_SOFTIRQ);

     /* Per its name, this helper also re-enables interrupts */
     net_rps_action_and_irq_enable(sd);

 out:
     __kfree_skb_flush();
 }

 /*
  * One adjacency record for a net_device, chained through @list.
  * NOTE(review): the code that creates and walks these records is not
  * part of this excerpt; field notes below restate the original comments.
  */

 struct netdev_adjacent {

     /* presumably the adjacent device this record refers to - confirm against users */

     struct net_device *dev;

     /* upper master flag, there can only be one master device per list */

     bool master;

     /* counter for the number of times this device was added to us */

     u16 ref_nr;

     /* private field for the users */

     void *private;

     /* list linkage for this record */

     struct list_head list;

     /* RCU head; presumably used for deferred freeing - TODO confirm */

     struct rcu_head rcu;

 };

调用设备对应的napi_struct->poll回调接收数据包，接收数量要根据配额进行限制，关键代码为 work = n->poll(n, weight);

 /*
  * Run one NAPI instance's ->poll() callback.
  *
  * @n:      the NAPI instance to poll; it is unlinked from whatever list
  *          it is currently on
  * @repoll: list the instance is appended to when it consumed its whole
  *          weight and therefore has to be polled again
  *
  * Returns the amount of work reported by ->poll(), or 0 when the
  * instance was not in the scheduled state and ->poll() was not called.
  */
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
     void *have;
     int work, weight;

     /* Unlink the instance from the list it is on */
     list_del_init(&n->poll_list);

     have = netpoll_poll_lock(n);

     /* Per-poll quota of this instance */
     weight = n->weight;

     /* This NAPI_STATE_SCHED test is for avoiding a race
      * with netpoll's poll_napi().  Only the entity which
      * obtains the lock and sees NAPI_STATE_SCHED set will
      * actually make the ->poll() call.  Therefore we avoid
      * accidentally calling ->poll() when NAPI is not scheduled.
      */
     work = 0;

     /* Only poll when the instance is in the scheduled state */
     if (test_bit(NAPI_STATE_SCHED, &n->state)) {
         /* Invoke the device's poll callback to receive packets */
         work = n->poll(n, weight);
         trace_napi_poll(n, work, weight);
     }

     WARN_ON_ONCE(work > weight);

     /* Less than the quota was used: nothing left to repoll */
     if (likely(work < weight))
         goto out_unlock;

     /* From here on the whole weight was consumed */

     /* Drivers must not modify the NAPI state if they
      * consume the entire weight.  In such cases this code
      * still "owns" the NAPI instance and therefore can
      * move the instance around on the list at-will.
      */

     /* A disable is pending: complete the instance instead of requeueing */
     if (unlikely(napi_disable_pending(n))) {
         napi_complete(n);
         goto out_unlock;
     }

     if (n->gro_list) {
         /* flush too old packets
          * If HZ < 1000, flush all packets.
          */
         napi_gro_flush(n, HZ >= 1000);
     }

     /* Some drivers may have called napi_schedule
      * prior to exhausting their budget.
      */
     if (unlikely(!list_empty(&n->poll_list))) {
         pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                      n->dev ? n->dev->name : "backlog");
         goto out_unlock;
     }

     /* Not finished: queue the instance on the repoll list */
     list_add_tail(&n->poll_list, repoll);

 out_unlock:
     netpoll_poll_unlock(have);

     return work;
 }

非NAPI方式收包流程

中断上半部

netif_rx(中断处理程序最终会调用此函数处理收到的包)->netif_rx_internal->enqueue_to_backlog(将收到的包加入到当前cpu的softnet_data->input_pkt_queue中，并将默认设备backlog加入到softnet_data结构的poll_list链表)

中断处理程序会调用netif_rx来将数据包加入到收包队列中，关键代码：enqueue_to_backlog(skb, get_cpu(), &qtail); 注意：每个数据包都会产生中断；

 /*
  * Entry point for handing a received buffer to the stack.
  *
  * @skb: the received buffer
  *
  * Emits the netif_rx_entry trace event and returns whatever
  * netif_rx_internal() returns for @skb.
  */
 int netif_rx(struct sk_buff *skb)
 {
     int status;

     trace_netif_rx_entry(skb);

     status = netif_rx_internal(skb);
     return status;
 }

 /*
  * Queue @skb on a CPU's backlog via enqueue_to_backlog().
  *
  * With CONFIG_RPS and rps_needed enabled, the target CPU comes from
  * get_rps_cpu(); a negative result falls back to the current CPU.
  * Otherwise the packet is queued on the current CPU.
  *
  * Returns the result of enqueue_to_backlog().
  */
 static int netif_rx_internal(struct sk_buff *skb)
 {
     int ret;

     net_timestamp_check(netdev_tstamp_prequeue, skb);

     trace_netif_rx(skb);

 #ifdef CONFIG_RPS
     if (static_key_false(&rps_needed)) {
         struct rps_dev_flow voidflow, *rflow = &voidflow;
         int cpu;

         preempt_disable();
         rcu_read_lock();

         cpu = get_rps_cpu(skb->dev, skb, &rflow);
         /* No CPU selected by RPS: use the one we are running on */
         if (cpu < 0)
             cpu = smp_processor_id();

         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

         rcu_read_unlock();
         preempt_enable();
     } else
 #endif
     {
         unsigned int qtail;

         /* get_cpu()/put_cpu() bracket the enqueue on the current CPU */
         ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
         put_cpu();
     }

     return ret;
 }

enqueue_to_backlog将skb加入到当前cpu的softnet_data->input_pkt_queue中，并将softnet_data->backlog结构加入到softnet_data->poll_list链表中，并触发收包软中断；

 /*
  * Queue @skb on the input_pkt_queue of @cpu's softnet_data.
  *
  * @skb:   buffer to queue
  * @cpu:   CPU whose softnet_data receives the buffer
  * @qtail: passed through to input_queue_tail_incr_save()
  *
  * When the queue was empty, the backlog NAPI instance is marked
  * scheduled and, unless rps_ipi_queued() reports otherwise, put on the
  * poll list through ____napi_schedule().
  *
  * Returns NET_RX_SUCCESS when the buffer was queued.  Returns
  * NET_RX_DROP (after counting the drop and freeing @skb) when the
  * device is not running, the queue is longer than netdev_max_backlog,
  * or skb_flow_limit() rejects the buffer.
  */
 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                               unsigned int *qtail)
 {
     struct softnet_data *sd = &per_cpu(softnet_data, cpu);
     unsigned long irqflags;
     unsigned int queued;

     local_irq_save(irqflags);
     rps_lock(sd);

     /* Device must be running */
     if (!netif_running(skb->dev))
         goto drop;

     /* Current queue length decides between enqueue and drop */
     queued = skb_queue_len(&sd->input_pkt_queue);
     if (queued > netdev_max_backlog || skb_flow_limit(skb, queued))
         goto drop;

     /* Schedule NAPI for backlog device
      * We can use non atomic operation since we own the queue lock.
      * Only needed when the queue was empty; a non-empty queue means
      * the backlog has already been scheduled.
      */
     if (!queued &&
         !__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state) &&
         !rps_ipi_queued(sd))
         ____napi_schedule(sd, &sd->backlog);

     /* Append the buffer to the input queue */
     __skb_queue_tail(&sd->input_pkt_queue, skb);
     input_queue_tail_incr_save(sd, qtail);

     rps_unlock(sd);
     local_irq_restore(irqflags);

     return NET_RX_SUCCESS;

 drop:
     /* Count the drop, release the lock, then free the buffer */
     sd->dropped++;

     rps_unlock(sd);
     local_irq_restore(irqflags);

     atomic_long_inc(&skb->dev->rx_dropped);
     kfree_skb(skb);

     return NET_RX_DROP;
 }

中断下半部

net_rx_action(软中断收包处理程序)->napi_poll(执行非napi回调函数process_backlog)

net_rx_action与napi方式相同，这里略过，主要看下其poll回调函数，其将数据包从队列中移出，调用__netif_receive_skb传递到上层，后续介绍传递流程，此处略过：

 /*
  * ->poll() callback of the per-CPU backlog NAPI instance.
  *
  * @napi:  the backlog instance embedded in a softnet_data
  * @quota: maximum number of packets to process in this call
  *
  * Dequeues packets from sd->process_queue and passes each one to
  * __netif_receive_skb().  When process_queue runs dry, input_pkt_queue
  * is spliced onto it and processing continues; when input_pkt_queue is
  * empty too, the NAPI state is reset and the loop ends.
  *
  * Returns the number of packets processed (at most @quota).
  */
 static int process_backlog(struct napi_struct *napi, int quota)
 {
     struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
     bool again = true;
     int work = 0;

     /* Check if we have pending ipi, its better to send them now,
      * not waiting net_rx_action() end.
      */
     if (sd_has_rps_ipi_waiting(sd)) {
         local_irq_disable();
         net_rps_action_and_irq_enable(sd);
     }

     /* Set the receive quota of the backlog instance */
     napi->weight = dev_rx_weight;

     while (again) {
         struct sk_buff *skb;

         /* Take packets off the process queue and pass them up */
         while ((skb = __skb_dequeue(&sd->process_queue))) {
             rcu_read_lock();
             __netif_receive_skb(skb);
             rcu_read_unlock();

             input_queue_head_incr(sd);

             /* Quota reached: stop here */
             if (++work >= quota)
                 return work;
         }

         local_irq_disable();
         rps_lock(sd);

         /* Input queue empty: nothing left to process */
         if (skb_queue_empty(&sd->input_pkt_queue)) {
             /*
              * Inline a custom version of __napi_complete().
              * only current cpu owns and manipulates this napi,
              * and NAPI_STATE_SCHED is the only possible flag set
              * on backlog.
              * We can use a plain write instead of clear_bit(),
              * and we dont need an smp_mb() memory barrier.
              */
             /* Reset the state: processing is complete */
             napi->state = 0;
             again = false;
         } else {
             /* Move the input queue onto the process queue and loop again */
             skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                        &sd->process_queue);
         }

         rps_unlock(sd);
         local_irq_enable();
     }

     /* Number of packets actually processed */
     return work;
 }