linux内核数据包转发流程（三）网卡帧接收分析

每一个cpu都有队列来处理接收到的帧，都有其数据结构来处理入口和出口流量，因此，不同cpu之间没有必要使用上锁机制。此队列数据结构为softnet_data(定义在include/linux/netdevice.h中):

/*

 * Incoming packets are placed on per-cpu queues so that

 * no locking is needed.

 */

struct softnet_data

{

struct Qdisc *output_queue;

struct sk_buff_headinput_pkt_queue;//有数据要传输的设备列表

struct list_headpoll_list; //双向链表，当中的设备有输入帧等着被处理。

struct sk_buff*completion_queue;//缓冲区列表，当中缓冲区已成功传输，能够释放掉

struct napi_structbacklog;

};

此结构字段可用于传输和接收。换而言之，NET_RX_SOFTIRQ和NET_TX_SOFTIRQ软IRQ都引用此结构。入口帧会排入input_pkt_queue(NAPI有所不同)。

softnet_data是在net_dev_init函数中初始化的：

/*

 *       This is called single threaded during boot, so no need

 *       to take the rtnl semaphore.

 */

static int __init net_dev_init(void)

{

/* Excerpt only: each "......" line marks code the article left out. */

int i, rc = -ENOMEM;

......

/*

* Initialise the packet receive queues.

*/

for_each_possible_cpu(i) {

struct softnet_data *queue;

/* softnet_data instance belonging to CPU i */

queue = &per_cpu(softnet_data, i);

skb_queue_head_init(&queue->input_pkt_queue);

queue->completion_queue = NULL;

INIT_LIST_HEAD(&queue->poll_list);

/* The backlog napi_struct is polled by process_backlog(), with weight_p as its weight. */

queue->backlog.poll = process_backlog;

queue->backlog.weight = weight_p;

queue->backlog.gro_list = NULL;

queue->backlog.gro_count = 0;

}

......

/* Install net_tx_action/net_rx_action as the handlers of the TX and RX softirqs. */

open_softirq(NET_TX_SOFTIRQ, net_tx_action);

open_softirq(NET_RX_SOFTIRQ, net_rx_action);

......

}

非NAPI设备驱动会为其所接收的每个帧产生一个中断事件，在高流量负载下，会花掉大量时间处理中断事件，造成资源浪费。而NAPI驱动混合了中断事件和轮询，在高流量负载下其性能会比旧方法要好。

NAPI主要思想是混合使用中断事件和轮询，而不是只使用中断事件驱动模型。当收到新的帧时，关中断，然后一次性处理完入口队列中的全部帧。从内核观点来看，NAPI方法由于中断事件少了，降低了cpu负载。

使用非NAPI的驱动程序的xx_rx()函数一般如下：

void xx_rx()

{

struct sk_buff *skb;

skb = dev_alloc_skb(pkt_len + 5);

if (skb != NULL) {

skb_reserve(skb, 2);/* Align IP on 16 byte boundaries */

/*memcpy(skb_put(skb, 2), pkt, pkt_len);*/ //copy data to skb

skb->protocol = eth_type_trans(skb, dev);

netif_rx(skb);

}

}

第一步是分配一个缓存区来保存报文。注意缓存分配函数 (dev_alloc_skb) 需要知道数据长度。

第二步将报文数据复制到缓存区; skb_put 函数更新缓存中的数据末尾指针并返回指向新建空间的指针。

第三步提取协议标识及获取其它信息。

最后调用netif_rx(skb)做进一步处理，该函数一般定义在net/core/dev.c中。

/*
 * Queue a received frame on the current CPU's input_pkt_queue.
 *
 * Returns NET_RX_SUCCESS when the skb was queued, NET_RX_DROP when netpoll
 * claimed it or when the queue is over netdev_max_backlog (the skb is freed
 * in the latter case).
 */
int netif_rx(struct sk_buff *skb)
{
	struct softnet_data *sd;
	unsigned long irq_state;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	local_irq_save(irq_state);
	sd = &__get_cpu_var(softnet_data);
	__get_cpu_var(netdev_rx_stat).total++;

	/*
	 * No room left: count the drop and free the frame.
	 * NOTE(review): the source article cites 300 as a typical value of
	 * netdev_max_backlog - verify for the kernel version at hand.
	 */
	if (sd->input_pkt_queue.qlen > netdev_max_backlog) {
		__get_cpu_var(netdev_rx_stat).dropped++;
		local_irq_restore(irq_state);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/*
	 * The backlog is scheduled only when the queue is empty; a non-empty
	 * queue means it has been scheduled already, so doing it again would
	 * be pointless.
	 */
	if (!sd->input_pkt_queue.qlen)
		napi_schedule(&sd->backlog);

	/* The key step: append the skb to input_pkt_queue. */
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	local_irq_restore(irq_state);
	return NET_RX_SUCCESS;
}
EXPORT_SYMBOL(netif_rx);

/*
 * Schedule NAPI instance @n for polling, provided napi_schedule_prep()
 * allows it; otherwise do nothing.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (!napi_schedule_prep(n))
		return;

	__napi_schedule(n);
}

/*
 * Put NAPI instance @n on the current CPU's poll list and raise
 * NET_RX_SOFTIRQ. Runs with local interrupts saved and disabled.
 */
void __napi_schedule(struct napi_struct *n)
{
	struct list_head *pending;
	unsigned long irq_state;

	local_irq_save(irq_state);

	/* Append the instance to the poll list, where it waits for its frames to be processed. */
	pending = &__get_cpu_var(softnet_data).poll_list;
	list_add_tail(&n->poll_list, pending);

	/* Finally raise the receive softirq. */
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	local_irq_restore(irq_state);
}
EXPORT_SYMBOL(__napi_schedule);

至此中断的上半部完成，其它的工作交由下半部来实现。napi_schedule(&queue->backlog)函数将有等待的接收数据包的NIC链入softnet_data的poll_list队列，然后触发软中断，让下半部去完成数据的处理工作。

而使用NAPI的设备在接收数据时直接触发软中断，不需要通过netif_rx()函数设置好接收队列再触发软中断。比如e100硬中断处理函数为：

/*
 * e100 hard-IRQ handler: reads and acknowledges the interrupt status and, if
 * napi_schedule_prep() succeeds, calls e100_disable_irq() and schedules
 * nic->napi for polling.
 *
 * Returns IRQ_NONE when the status byte says the interrupt is not ours or the
 * hardware is gone, IRQ_HANDLED otherwise.
 */
static irqreturn_t e100_intr(int irq, void *dev_id)

{

	struct net_device *netdev = dev_id;

	struct nic *nic = netdev_priv(netdev);

	u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);

	DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X\n", stat_ack);

	if (stat_ack == stat_ack_not_ours ||	/* Not our interrupt */

	   stat_ack == stat_ack_not_present)	/* Hardware is ejected */

		return IRQ_NONE;

	/* Ack interrupt(s) */

	iowrite8(stat_ack, &nic->csr->scb.stat_ack);

	/* We hit Receive No Resource (RNR); restart RU after cleaning */

	if (stat_ack & stat_ack_rnr)

		nic->ru_running = RU_SUSPENDED;

	if (likely(napi_schedule_prep(&nic->napi))) {

		e100_disable_irq(nic);

		__napi_schedule(&nic->napi);// the receive softirq is raised here

	}

	return IRQ_HANDLED;

}

在前面我们已经知道在net_dev_init()函数中注册了收包软中断函数net_rx_action(),当软中断被触发之后，该函数将被调用。

net_rx_action()函数为：

/*
 * Receive softirq handler: walks the current CPU's poll_list and calls the
 * ->poll() callback of each queued NAPI instance.
 *
 * The loop ends when the list is empty. It is cut short when the budget
 * (initialised from netdev_budget) is used up or jiffies has passed
 * time_limit; in that case time_squeeze is incremented and NET_RX_SOFTIRQ is
 * raised again.
 */
static void net_rx_action(struct softirq_action *h)

{

	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;

	unsigned long time_limit = jiffies + 2;

	int budget = netdev_budget;

	void *have;

	local_irq_disable();

	while (!list_empty(list)) {

		struct napi_struct *n;

		int work, weight;

		/* If softirq window is exhausted then punt.

		 * Allow this to run for 2 jiffies since which will allow

		 * an average latency of 1.5/HZ.

		 */

		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))// entries are still queued: bail out; softnet_break re-raises the softirq

			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this

		 * access is safe because interrupts can only add new

		 * entries to the tail of this list, and only ->poll()

		 * calls can remove this head entry from the list.

		 */

		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race

		 * with netpoll's poll_napi().  Only the entity which

		 * obtains the lock and sees NAPI_STATE_SCHED set will

		 * actually make the ->poll() call.  Therefore we avoid

		 * accidentally calling ->poll() when NAPI is not scheduled.

		 */

		work = 0;

		if (test_bit(NAPI_STATE_SCHED, &n->state)) {

			work = n->poll(n, weight);// run the poll callback; its return value is the work done

			trace_napi_poll(n);

		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they

		 * consume the entire weight.  In such cases this code

		 * still "owns" the NAPI instance and therefore can

		 * move the instance around on the list at-will.

		 */

		if (unlikely(work == weight)) {// whole weight consumed: complete the instance if a disable is pending, else move it to the list tail

			if (unlikely(napi_disable_pending(n))) {

				local_irq_enable();

				napi_complete(n);

				local_irq_disable();

			} else

				list_move_tail(&n->poll_list, list);

		}

		netpoll_poll_unlock(have);

	}

out:

	local_irq_enable();

#ifdef CONFIG_NET_DMA

	/*

	 * There may not be any more sk_buffs coming right now, so push

	 * any pending DMA copies to hardware

	 */

	dma_issue_pending_all();

#endif

	return;

softnet_break:

	/* Reached with local interrupts disabled: count the squeeze and re-raise the softirq. */

	__get_cpu_var(netdev_rx_stat).time_squeeze++;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	goto out;

}

由上可见，下半部的主要工作是遍历有数据帧等待接收的设备链表，对于每一个设备，运行它对应的poll函数。

对非NAPI设备来说，poll函数在net_dev_init()函数中初始化为process_backlog()。

process_backlog()函数定义为：

/*
 * Poll callback of the per-CPU backlog napi_struct: dequeues skbs from the
 * current CPU's input_pkt_queue and passes each one to netif_receive_skb().
 *
 * @napi:  the backlog NAPI instance being polled
 * @quota: upper bound on the number of skbs handled in this call
 *
 * Returns the number of skbs passed to netif_receive_skb().
 */
static int process_backlog(struct napi_struct *napi, int quota)

{

	int work = 0;

	struct softnet_data *queue = &__get_cpu_var(softnet_data);

	unsigned long start_time = jiffies;

	/* The weight is reloaded from weight_p on every call. */

	napi->weight = weight_p;

	do {

		struct sk_buff *skb;

		/* The queue is only touched with local interrupts disabled. */

		local_irq_disable();

		skb = __skb_dequeue(&queue->input_pkt_queue);

		if (!skb) {

			/* Queue empty: complete the NAPI instance before interrupts come back on. */

			__napi_complete(napi);

			local_irq_enable();

			break;

		}

		local_irq_enable();

		netif_receive_skb(skb);

	/* Stop once quota skbs were handled or as soon as jiffies moves past start_time. */

	} while (++work < quota && jiffies == start_time);

	return work;

}

对NAPI设备来说，驱动程序必须提供一个poll方法,poll 方法有以下原型:

/*
 * Poll callback of a NAPI instance. It is invoked as n->poll(n, weight), so
 * the second argument is passed by value; the return value is the amount of
 * work done, which must not exceed that argument.
 */
int (*poll)(struct napi_struct *napi, int budget);

在初始化时需要加入该方法：

/* Done at initialisation time: hands xx_poll to netif_napi_add() for nic->napi. XX_NAPI_WEIGHT is presumably the weight of that instance - confirm against netif_napi_add(). */
netif_napi_add(netdev, &nic->napi, xx_poll, XX_NAPI_WEIGHT);

NAPI驱动的 poll 方法实现一般如下（借用《Linux设备驱动程序》中的代码，该示例使用的是旧版接口，与上面的内核版本没有完全对上，这里不再改写）:

/*
 * Poll method of the example driver (old-style interface taking the
 * net_device and a pointer to the budget).
 *
 * Delivers at most min(dev->quota, *budget) packets from the driver's private
 * receive queue to netif_receive_skb(), then charges the delivered packets
 * against both *budget and dev->quota.
 *
 * Returns 0 when the receive queue was drained (polling is completed and
 * receive interrupts are switched back on), 1 when packets are left over.
 */
static int xx_poll(struct net_device *dev, int *budget)
{
    struct xx_priv *priv = netdev_priv(dev);
    int limit = min(dev->quota, *budget);
    int delivered = 0;

    while (delivered < limit && priv->rx_queue) {
        struct xx_packet *pkt = xx_dequeue_buf(dev);
        struct sk_buff *skb = dev_alloc_skb(pkt->datalen + 2);

        if (skb) {
            memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);
            skb->dev = dev;
            skb->protocol = eth_type_trans(skb, dev);
            skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
            netif_receive_skb(skb);

            /* Maintain stats */
            delivered++;
            priv->stats.rx_packets++;
            priv->stats.rx_bytes += pkt->datalen;
        } else {
            /* No skb: the packet is dropped and does not count as delivered. */
            if (printk_ratelimit())
                printk(KERN_NOTICE "xx: packet dropped\n");
            priv->stats.rx_dropped++;
        }

        /* The driver buffer is released on both paths. */
        xx_release_buffer(pkt);
    }

    *budget -= delivered;
    dev->quota -= delivered;

    /* We couldn't process everything. */
    if (priv->rx_queue)
        return 1;

    /* All packets processed: tell the kernel and re-enable interrupts. */
    netif_rx_complete(dev);
    xx_rx_ints(dev, 1);
    return 0;
}

NAPI驱动提供自己的poll函数和私有队列。

无论是非NAPI还是NAPI，它们的poll函数最后都会调用netif_receive_skb(skb)来处理接收到的帧。该函数会向各个已注册的协议例程发送一个skb，之后数据进入Linux内核协议栈处理。

linux内核数据包转发流程（三）网卡帧接收分析的更多相关文章

linux内核数据包转发流程（一）：网络设备驱动
[版权声明:转载请保留出处:blog.csdn.net/gentleliu.邮箱:shallnew*163.com] 网卡驱动为每一个新的接口在一个全局的网络设备列表里插入一个数据结构.每一个接口由一 ...
linux内核数据包转发流程（二）：中断
[版权声明:转载请保留出处:blog.csdn.net/gentleliu.邮箱:shallnew*163.com] 内核在处理2层数据包之前,必须先处理中断系统.设立中断系统,才有可能每秒处理成千的 ...
Linux内核数据包的发送传输
本文主要讲解了Linux内核数据包的传输流程,使用的内核的版本是2.6.32.27 为了方便理解,本文采用整体流程图加伪代码的方式从内核高层面上梳理了二层数据包发送传输的流程,希望可以对大家有所帮助. ...
[Docker]Docker与Linux ip_forward数据包转发
背景今天在一台新虚拟机上需要临时启动一个consul服务,安装Docker后使用docker启动,但是在执行启动命令后发现docker有一个警告: WARNING: IPv4 forwarding ...
LINUX下的远端主机登入校园网络注册网络数据包转发和捕获
第一部分:LINUX 下的远端主机登入和校园网注册校园网内目的主机远程管理登入程序本程序为校园网内远程登入,管理功能,该程序分服务器端和客户端两部分:服务器端为remote_server_udp. ...
Linux内核网络数据包处理流程
Linux内核网络数据包处理流程 from kernel-4.9: 0. Linux内核网络数据包处理流程 - 网络硬件网卡工作在物理层和数据链路层,主要由PHY/MAC芯片.Tx/Rx FIFO. ...
Linux内核网络报文简单流程
转:http://blog.csdn.net/adamska0104/article/details/45397177 Linux内核网络报文简单流程2014-08-12 10:05:09 分类: L ...
Linux网络 - 数据包的接收过程【转】
转自:https://segmentfault.com/a/1190000008836467 本文将介绍在Linux系统中,数据包是如何一步一步从网卡传到进程手中的. 如果英文没有问题,强烈建议阅读后 ...
[转帖]Linux内核为大规模支持100Gb/s网卡准备好了吗？并没有
Linux内核为大规模支持100Gb/s网卡准备好了吗?并没有之前用千兆的机器下载速度一般只能到 50MB 左右没法更高万兆的话可能也就是 200MB左右的速度很难更高不知道后续的服 ...

随机推荐

spring配置日志
原文:http://blog.csdn.net/xiejx618/article/details/41698913 参考:http://spring.io/blog/2009/12/04/loggin ...
Android 框架炼成教你怎样写组件间通信框架EventBus
转载请标明出处:http://blog.csdn.net/lmj623565791/article/details/41096639 .本文出自:[张鸿洋的博客] 1.概述关于Eventbus的介绍 ...
yum 安装软件时报错
报错信息 Another app is currently holding the yum lock; waiting for it to exit 处理方法 rm -rf /var/run/yum. ...
POJ 1422 Air Raid（二分图匹配最小路径覆盖）
POJ 1422 Air Raid 题目链接题意:给定一个有向图,在这个图上的某些点上放伞兵,能够使伞兵能够走到图上全部的点.且每一个点仅仅被一个伞兵走一次.问至少放多少伞兵思路:二分图的最小路径 ...
SSM框架理解（转）
SSM框架理解最近两星期一直在学JavaEE的MVC框架,因为之前学校开的JavaEE课程就一直学的吊儿郎当的,所以现在真正需要掌握就非常手忙脚乱,在此记录下这段时间学习的感悟,如有错误,希望大牛毫 ...
HDU 2825 AC自动机+DP
题意:一个密码,长度为 n,然后有m个magic words,这个密码至少由k个magic words组成. 问这个密码可能出现的总数. 思路:首先构造AC自动机,由于m很小,才10 ,我们可以使用二 ...
javaScript滚动新闻
<!DOCTYPE HTML> <html> <head> <meta http-equiv="Content-Type" content ...
Hibernate常用Annotation标签说明
@ javax.persistence.Entity 实体类定义,该标签表示当前类是一个Hibernate的数据库实体,对应着数据库中的某个表位置:用于类级别参数:无样例:@Entity 注意: ...
Oracle历史记录
请问如何查询ORACLE的历史操作记录!!!!!------解决方案-------------------- 有一个专门存储操作的数据库表..select t.SQL_TEXT, t.FIRST_LO ...
android windows 上JNI编程
昨天学习windows上的JNI编程,JNI说白了就是java和c语言的一个互相沟通的桥梁.java能够调用JNI来完毕调用C语言实现的方法. JNI的全称是(Java native interfac ...

linux内核数据包转发流程（三）网卡帧接收分析

linux内核数据包转发流程（三）网卡帧接收分析的更多相关文章

随机推荐

热门专题