IP 层收发报文简要剖析3--ip输入报文分片重组
在 ip_local_deliver 中,如果检测到收到的是分片报文,则需要先对其进行重组:只有所有分片都到齐并重新组合之后,才能把完整报文提交给上层协议。每一个正在重组的数据报用一个 ipq 结构实例来表示。
/* Reassembly state for one IPv4 datagram whose fragments are being collected. */
struct ipq {
	struct inet_frag_queue q;	/* generic fragment queue this ipq embeds */
	u32 user;			/* origin of the fragments (defrag user id) */
	__be32 saddr;			/* source address */
	__be32 daddr;			/* destination address */
	__be16 id;			/* IP identification field */
	u8 protocol;			/* upper-layer protocol number */
	/*
	 * The four fields above are copied from the IP header and identify
	 * which datagram a fragment belongs to.
	 */
	u8 ecn; /* RFC3168 support */
	u16 max_df_size; /* largest frag with DF set seen */
	int iif;
	int vif; /* L3 master device index */
	unsigned int rid;		/* fragment counter (per the original notes) - TODO confirm against ip_frag_too_far() */
	struct inet_peer *peer;		/* information about the sender */
	/*
	 * NOTE(review): the original notes state that rid and peer together
	 * guard against DoS attacks; the checking code is not shown here.
	 */
};
分片管理结构(inet_frags,如全局的 ip4_frags)以及每个网络命名空间的分片管理结构(netns_frags,如 net->ipv4.frags)
struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];//哈希队列 struct work_struct frags_work;//工作队列
unsigned int next_bucket;
unsigned long last_rebuild_jiffies;
bool rebuild; /* The first call to hashfn is responsible to initialize
* rnd. This is best done with net_get_random_once.
*
* rnd_seqlock is used to let hash insertion detect
* when it needs to re-lookup the hash chain to use.
*/
u32 rnd;//随机数
seqlock_t rnd_seqlock;//
int qsize;//队列长度 unsigned int (*hashfn)(const struct inet_frag_queue *);
bool (*match)(const struct inet_frag_queue *q,
const void *arg);//分段队列匹配函数
void (*constructor)(struct inet_frag_queue *q,
const void *arg);
void (*destructor)(struct inet_frag_queue *);
void (*frag_expire)(unsigned long data);//队列过期处理函数
struct kmem_cache *frags_cachep;
const char *frags_cache_name;
};
/* Fragment accounting and tunables; reached as net->ipv4.frags for IPv4. */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 *  mem.count must not share cacheline with other writers
	 */
	struct percpu_counter   mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int			timeout;	/* reassembly timeout */
	int			high_thresh;	/* upper memory limit */
	int			low_thresh;	/* lower memory limit */
	int			max_dist;
};
/**
* struct inet_frag_queue - fragment queue
*
* @lock: spinlock protecting the queue
* @timer: queue expiration timer
* @list: hash bucket list
* @refcnt: reference count of the queue
* @fragments: received fragments head
* @fragments_tail: received fragments tail
* @stamp: timestamp of the last received fragment
* @len: total length of the original datagram
* @meat: length of received fragments so far
* @flags: fragment queue flags
* @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
*/
struct inet_frag_queue {//inet分段队列头
spinlock_t lock;smp环境下 需要
struct timer_list timer;队列定时器,组装非常耗时,不能无休止的等待分片的到达
struct hlist_node list;哈希节点,链入inet分段管理结构的哈希队列
atomic_t refcnt;计数器
struct sk_buff *fragments;分段数据包队列
struct sk_buff *fragments_tail;
ktime_t stamp;时间戳
int len;数据包结束位置offset+len
int meat;与原数据长度的差距,如果和原数据包长度一样代表接收完成
__u8 flags;
u16 max_size;
struct netns_frags *net;指向网络空寂分段管理结构
struct hlist_node list_evictor;
};
1.1、 IP 分片重组模块的初始化
/*
 * Boot-time setup of the IPv4 fragment reassembly machinery: registers the
 * sysctl/pernet hooks, fills in the ip4_frags callbacks and initialises the
 * shared inet_frags state. Panics if that initialisation fails.
 */
void __init ipfrag_init(void)
{
	ip4_frags_ctl_register();
	/* register the per-netns IPv4 fragment operations */
	register_pernet_subsys(&ip4_frags_ops);
	ip4_frags.hashfn = ip4_hashfn;			/* hash function */
	ip4_frags.constructor = ip4_frag_init;		/* queue constructor */
	ip4_frags.destructor = ip4_frag_free;		/* queue destructor */
	ip4_frags.qsize = sizeof(struct ipq);		/* size of one queue object */
	ip4_frags.match = ip4_frag_match;		/* queue comparison hook */
	ip4_frags.frag_expire = ip_expire;		/* queue timeout handler */
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
}

/*
 * Initialise an inet_frags instance: the work item, every hash bucket
 * (lock and list head), the seqlock and the slab cache for queue objects.
 *
 * Returns 0 on success, -ENOMEM if the slab cache cannot be created.
 */
int inet_frags_init(struct inet_frags *f)
{
	int i;

	/* work item whose handler is inet_frag_worker */
	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		/* each bucket needs its lock and its list head set up */
		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}

	seqlock_init(&f->rnd_seqlock);
	f->last_rebuild_jiffies = 0;
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);
/*
 * Deliver an IP packet to the local host. Fragments go through ip_defrag()
 * first; whenever ip_defrag() returns non-zero this function returns 0
 * without continuing. Otherwise the packet is passed through the
 * NF_INET_LOCAL_IN netfilter hook towards ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	/* Reassemble IP fragments. */
	if (ip_is_fragment(ip_hdr(skb)) &&
	    ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* LOCAL_IN hook point */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       net, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
1.2、 ip分片报文重组的处理
/*
 * Process an incoming IP datagram fragment.
 *
 * Looks up (or creates) the reassembly queue for the fragment and queues
 * the skb on it under the queue lock.
 *
 * Returns the result of ip_frag_queue(), or -ENOMEM (after freeing the skb
 * and bumping REASMFAILS) when no queue could be found or created.
 */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
	int vif = l3mdev_master_ifindex_rcu(dev);
	struct ipq *qp;

	/* count the reassembly request */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header */
	qp = ip_find(net, ip_hdr(skb), user, vif);
	if (qp) {
		int ret;

		spin_lock(&qp->q.lock);

		/* add the fragment to the datagram being reassembled */
		ret = ip_frag_queue(qp, skb);

		/* the lock must be dropped before the reference is released */
		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	/* no queue available: account the failure and drop the fragment */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
EXPORT_SYMBOL(ip_defrag);
1.2.2 ip_find 根据ip首部以及user标志 在ipq散列表中查找对应的ipq。
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
enum ip_defrag_users {
IP_DEFRAG_LOCAL_DELIVER,
IP_DEFRAG_CALL_RA_CHAIN,
IP_DEFRAG_CONNTRACK_IN,
__IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
IP_DEFRAG_CONNTRACK_OUT,
__IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
IP_DEFRAG_CONNTRACK_BRIDGE_IN,
__IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
IP_DEFRAG_VS_IN,
IP_DEFRAG_VS_OUT,
IP_DEFRAG_VS_FWD,
IP_DEFRAG_AF_PACKET,
IP_DEFRAG_MACVLAN,
};
*/
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
u32 user, int vif)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
unsigned int hash;
/* 记录ip头和输入信息 */
arg.iph = iph;
arg.user = user;
arg.vif = vif;
/* 通过id,源地址,目的地址,协议计算hash */
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
/* 根据hash值查找或创建队列 */
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL;
}
return container_of(q, struct ipq, q);
} struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key,
unsigned int hash)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
int depth = 0;
/* 分片内存已经超过了低限 */
if (frag_mem_limit(nf) > nf->low_thresh)
/* 进行节点回收 */
inet_frag_schedule_worker(f); //工作队列回调函数为inet_frag_worker hash &= (INETFRAGS_HASHSZ - 1);
hb = &f->hash[hash]; /* 找到hash桶 */ spin_lock(&hb->chain_lock);
hlist_for_each_entry(q, &hb->chain, list) { /* 遍历链表 */
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt); /* 增加引用计数 */
spin_unlock(&hb->chain_lock);
return q;
}
depth++;/* 记录查找深度 */
}
spin_unlock(&hb->chain_lock);
/* 未找到 */
/* 桶节点的链表深度不超过限定 */
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);/* 创建节点返回 */ if (inet_frag_may_rebuild(f)) {
/* 如果已经超过了重建间隔时间,则重建 */
if (!f->rebuild)
f->rebuild = true;
inet_frag_schedule_worker(f);
} return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
如果查找不到则会创建一个ipq 并将其插入链表中
/*
 * Allocate a fresh fragment queue and insert it into the hash table.
 * Returns NULL when the allocation fails, otherwise whatever
 * inet_frag_intern() returns.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						struct inet_frags *f,
						void *arg)
{
	/* allocate and construct the queue head */
	struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);

	return fresh ? inet_frag_intern(nf, fresh, f, arg) : NULL;
}
/*
 * Allocate and initialise one fragment queue object.
 *
 * Returns NULL when fragment memory is above high_thresh (after scheduling
 * the worker) or when the slab allocation fails. On success the queue has
 * its constructor run, its timer and lock initialised and a refcount of 1.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	/* over the high watermark: refuse and let the worker reclaim memory */
	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->net = nf;		/* remember the owning netns_frags */
	f->constructor(q, arg);	/* ip4_frag_init for IPv4, see ipfrag_init() */
	add_frag_mem_limit(nf, f->qsize);	/* account the queue object itself */

	/* the timer is only initialised here; inet_frag_intern() arms it */
	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);

	return q;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);//获取分段队列指针
struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
frags); struct net *net = container_of(ipv4, struct net, ipv4); const struct ip4_create_arg *arg = a;//ipv4的分段信息指针 qp->protocol = arg->iph->protocol;//IP层头部协议
qp->id = arg->iph->id;//ip层id
qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
qp->vif = arg->vif;
qp->user = arg->user;
//记录对方信息
qp->peer = q->net->max_dist ?
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
NULL;
} static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in,
struct inet_frags *f,
void *arg)
{
struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp; #ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could have been created on other cpu before
* we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt);//链入inet分段管理结构的hash队列
hlist_add_head(&qp->list, &hb->chain); spin_unlock(&hb->chain_lock); return qp;
}
1.2.3 将分片数据包加入重组队列
/*
 * Add new segment to existing queue.
 *
 * Validates the fragment, trims it so that it does not overlap its
 * neighbours, links it into qp's offset-ordered fragment list and, once the
 * first and last fragments are present and all bytes have arrived, calls
 * ip_frag_reasm().
 *
 * Returns the result of ip_frag_reasm() when reassembly is attempted,
 * -EINPROGRESS when more fragments are needed, or a negative errno after
 * freeing skb.
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	unsigned int fragsize;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;
	u8 ecn;

	/* the queue is already complete: just drop this fragment */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto err;

	/*
	 * Unless the skb carries IPSKB_FRAG_COMPLETE, ask ip_frag_too_far()
	 * whether the queue has fallen too far behind and, if so, try to
	 * reinitialise it with ip_frag_reinit(). Per the original notes this
	 * is the DoS check; those helpers are not shown here.
	 */
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		/*
		 * Per the original notes: unhashes the ipq, stops its timer,
		 * drops a reference and marks it complete so it can be freed.
		 */
		ipq_kill(qp);
		goto err;
	}

	/* extract the flag bits, fragment offset and header length */
	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);

	/* Determine the position of this fragment. */
	end = offset + skb->len - skb_network_offset(skb) - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		/* checks passed: mark LAST_IN and store the full data length */
		qp->q.flags |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		/* non-final fragments are truncated to a multiple of 8 bytes */
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.flags & INET_FRAG_LAST_IN)
				goto err;
			qp->q.len = end;	/* furthest end seen so far */
		}
	}
	/* a fragment carrying no data is rejected */
	if (end == offset)
		goto err;

	err = -ENOMEM;
	/* strip the IP header */
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
		goto err;

	/* trim the skb to the payload length, end - offset */
	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 *
	 * Fragments may arrive in any order; the list is kept sorted by
	 * fragment offset.
	 */
	prev = qp->q.fragments_tail;
	if (!prev || FRAG_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

found:
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			/* overlap: drop the first i bytes of this fragment */
			offset += i;
			err = -EINVAL;
			if (end <= offset)
				goto err;
			err = -ENOMEM;
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	err = -ENOMEM;

	/*
	 * Now resolve overlap with the following fragments. One that is
	 * only partly covered has its head trimmed and ends the loop; one
	 * that is fully covered is unlinked and freed, and the check moves
	 * on to the fragment after it.
	 */
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			qp->q.meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* Old fragment is completely overridden with
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
				qp->q.fragments = next;

			qp->q.meat -= free_it->len;
			sub_frag_mem_limit(qp->q.net, free_it->truesize);
			kfree_skb(free_it);
		}
	}

	FRAG_CB(skb)->offset = offset;	/* final offset of this fragment */

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
	qp->q.stamp = skb->tstamp;	/* refresh the timestamp */
	qp->q.meat += skb->len;		/* total bytes received so far */
	qp->ecn |= ecn;
	/* account this skb against the fragment memory limit */
	add_frag_mem_limit(qp->q.net, skb->truesize);
	if (offset == 0)		/* this is the first fragment */
		qp->q.flags |= INET_FRAG_FIRST_IN;

	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;

	/* first and last fragments seen and every byte present: reassemble */
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
	return -EINPROGRESS;

err:
	kfree_skb(skb);
	return err;
}
ip_frag_reasm 重组报文;
* Build a new IP datagram from all its fragments. */
/*
*用于组装已到齐的所有分片,当原始
* 数据包的所有分片都已到齐时,会调用此函
* 数组装分片。
*/
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
int ihlen;
int err;
u8 ecn;
/*
* 要开始组装了,因此调用ipq_kill()将此ipq结点从
* ipq散列表删除,并删除定时器。
*/
ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
}
/* Make the one we just received the head. */
if (prev) {
head = prev->next;
fp = skb_clone(head, GFP_ATOMIC);
if (!fp)
goto out_nomem; fp->next = head->next;
if (!fp->next)
qp->q.fragments_tail = fp;
prev->next = fp; skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next; consume_skb(qp->q.fragments);
qp->q.fragments = head;
} WARN_ON(!head);
WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram.
计算原始报文的长度 超过64 KB*/
ihlen = ip_hdrlen(head);
len = ihlen + qp->q.len; err = -E2BIG;
if (len > 65535)
goto out_oversize; /* Head of list must not be cloned.
* 在组装分片时,所有的分片都会组装到第一个分片
* 上,因此第一个分片是不能克隆的,如果是克隆的,
* 则需为分片组装重新分配一个SKB。
*/
if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem; /* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
/*
* 分片队列的第一个SKB不能既带有数据,又带有分片,即其
* frag_list上不能有分片skb,如果有则重新分配一个SKB。最终的
* 效果是,head自身不包括数据,其frag_list上链接着所有分片的
* SKB。这也是SKB的一种表现形式,不一定是一个连续的数据块,
* 但最终会调用skb_linearize()将这些数据都复制到一个连续的数据
* 块中。
*/
if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
clone->next = head->next;
head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
head->data_len -= clone->len;
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
}
/*
* 把所有分片组装起来即将分片链接到第一个
* SKB的frag_list上,同时还需要遍历所有分片,
* 重新计算IP数据包长度以及校验和等。
*/
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head)); for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
}
/*
* 重置首部长度、片偏移、标志位和总长度。
*/
sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head);
iph->tot_len = htons(len);
iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a
* call to ip_fragment to avoid forwarding a DF-skb of size s while
* original sender only sent fragments of size f (where f < s).
*
* We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
* frag seen to avoid sending tiny DF-fragments in case skb was built
* from one very small df-fragment and one large non-df frag.
*/
if (qp->max_df_size == qp->q.max_size) {
IPCB(head)->flags |= IPSKB_FRAG_PMTU;
iph->frag_off = htons(IP_DF);
} else {
iph->frag_off = 0;
} ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
/*
* 既然各分片都已处理完,释放ipq的分片队列。
*/
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
return 0; out_nomem:
net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err;
}
1.4.4 ipq散列表的重建(rehash)
/*
 * Re-seed the hash and move every queue to the bucket its new hash value
 * selects. Does nothing when inet_frag_may_rebuild() says a rebuild is not
 * allowed. Runs under the rnd_seqlock write lock.
 */
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
	int i;

	write_seqlock_bh(&f->rnd_seqlock);

	/* only bail out when a rebuild is not permitted right now */
	if (!inet_frag_may_rebuild(f))
		goto out;

	/* pick a new random seed for the hash */
	get_random_bytes(&f->rnd, sizeof(u32));

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		spin_lock(&hb->chain_lock);

		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			/* the queue no longer belongs in this bucket */
			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];

				/* This is the only place where we take
				 * another chain_lock while already holding
				 * one.  As this will not run concurrently,
				 * we cannot deadlock on hb_dest lock below, if its
				 * already locked it will be released soon since
				 * other caller cannot be waiting for hb lock
				 * that we've taken above.
				 */
				spin_lock_nested(&hb_dest->chain_lock,
						 SINGLE_DEPTH_NESTING);
				hlist_add_head(&q->list, &hb_dest->chain);
				spin_unlock(&hb_dest->chain_lock);
			}
		}
		spin_unlock(&hb->chain_lock);
	}

	/* clear the rebuild request and record when it was done */
	f->rebuild = false;
	f->last_rebuild_jiffies = jiffies;
out:
	write_sequnlock_bh(&f->rnd_seqlock);
}
1.4.5 超时IP分片的清除
定时器到期时,会清除在规定时间内没有完成重组的 ipq 及其所有的分片
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 *
 * arg is the inet_frag_queue pointer cast to unsigned long. The ICMP
 * "Fragment Reassembly Timeout" message is only attempted when the queue is
 * not being evicted and its first fragment has been received.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp;
	struct net *net;

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
	net = container_of(qp->q.net, struct net, ipv4.frags);

	spin_lock(&qp->q.lock);

	/* already complete: nothing to do except drop our reference */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto out;

	ipq_kill(qp);	/* per the original notes: unhash the queue */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);

	/* only for queues that are not being evicted */
	if (!inet_frag_evicting(&qp->q)) {
		struct sk_buff *head = qp->q.fragments;
		const struct iphdr *iph;
		int err;

		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);

		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
			goto out;

		rcu_read_lock();
		head->dev = dev_get_by_index_rcu(net, qp->iif);
		if (!head->dev)
			goto out_rcu_unlock;

		/* skb has no dst, perform route lookup again */
		iph = ip_hdr(head);
		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
					   iph->tos, head->dev);
		if (err)
			goto out_rcu_unlock;

		/* Only an end host needs to send an ICMP
		 * "Fragment Reassembly Timeout" message, per RFC792.
		 */
		if (frag_expire_skip_icmp(qp->user) &&
		    (skb_rtable(head)->rt_type != RTN_LOCAL))
			goto out_rcu_unlock;

		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
		rcu_read_unlock();
	}
out:
	spin_unlock(&qp->q.lock);
	ipq_put(qp);
}
1.4.6 节点回收工作队列
为了控制 IP 分片重组所占用的内存,设置了两个阈值 low_thresh 和 high_thresh;当前 ipq 散列表所占用的内存记录在 mem 变量中。这些变量保存在如下结构中(netns_frags)
/*
 * Fragment memory accounting and tunables. mem is compared against
 * low_thresh in inet_frag_find() and high_thresh in inet_frag_alloc().
 */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 * mem.count must not share cacheline with other writers
	 */
	struct percpu_counter mem ____cacheline_aligned_in_smp; /* sysctls */
	int timeout;		/* added to jiffies when a queue's timer is armed */
	int high_thresh;	/* above this, new queue allocation is refused */
	int low_thresh;		/* above this, the eviction worker is scheduled */
	int max_dist;		/* non-zero enables the inet_peer lookup in ip4_frag_init() */
};
当 mem 大于 high_thresh 时,需要对散列表进行清理,直到 mem 值降低到 low_thresh。这两个值可以通过 proc 修改
/*
 * Evict every eligible queue of one hash bucket.
 *
 * Eligible queues whose timer could be deleted are collected on a private
 * list while the bucket lock is held; their expiry handler is then run
 * after the lock has been dropped. Returns the number of queues collected.
 */
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *victim;
	struct hlist_node *tmp;
	unsigned int nr_evicted = 0;
	HLIST_HEAD(expired);

	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(victim, tmp, &hb->chain, list) {
		/* skip queues that need not go, or whose timer cannot be deleted */
		if (!inet_fragq_should_evict(victim) ||
		    !del_timer(&victim->timer))
			continue;

		hlist_add_head(&victim->list_evictor, &expired);
		nr_evicted++;
	}

	spin_unlock(&hb->chain_lock);

	/* run the expiry handler on everything collected above */
	hlist_for_each_entry_safe(victim, tmp, &expired, list_evictor)
		f->frag_expire((unsigned long) victim);

	return nr_evicted;
}

/*
 * Work handler: evicts from up to INETFRAGS_EVICT_BUCKETS buckets, resuming
 * at f->next_bucket, stops early once more than INETFRAGS_EVICT_MAX queues
 * were evicted, and finally performs a pending hash rebuild if allowed.
 */
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int remaining = INETFRAGS_EVICT_BUCKETS;
	unsigned int bucket, total = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	local_bh_disable();

	/* continue from where the previous run stopped */
	bucket = ACCESS_ONCE(f->next_bucket);
	while (remaining) {
		total += inet_evict_bucket(f, &f->hash[bucket]);
		bucket = (bucket + 1) & (INETFRAGS_HASHSZ - 1);
		/* enough evicted for one run */
		if (total > INETFRAGS_EVICT_MAX)
			break;
		--remaining;
	}

	/* remember which bucket the next run starts with */
	f->next_bucket = bucket;

	local_bh_enable();

	if (f->rebuild && inet_frag_may_rebuild(f))
		inet_frag_secret_rebuild(f);
}
IP 层收发报文简要剖析3--ip输入报文分片重组的更多相关文章
- IP 层收发报文简要剖析6--ip报文输出3 ip_push_pending_frames
L4层的协议会把数据通过ip_append_data或ip_append_page把数据线放在缓冲区,然后再显示调用ip_push_pending_frames传送数据. 把数据放在缓冲区有两个优点, ...
- IP 层收发报文简要剖析5--ip报文发送2
udp 发送ip段报文接口ip_append_data ip_append_data 函数主要用来udp 套接字以及raw套接字发送报文的接口.在tcp中发送ack 以及rest段的ip_send_u ...
- IP 层收发报文简要剖析2--ip报文的输入ip_local_deliver
ip报文根据路由结果:如果发往本地则调用ip_local_deliver处理报文:如果是转发出去,则调用ip_forward 处理报文. 一.ip报文转发到本地: /* * Deliver IP Pa ...
- IP 层收发报文简要剖析1-ip报文的输入
ip层数据包处理场景如下: 网络层处理数据包文时需要和路由表以及邻居系统打交道.输入数据时,提供输入接口给链路层调用,并调用传输层的输入接口将数据输入到传输层. 在输出数据时,提供输出接口给传输层,并 ...
- IP 层收发报文简要剖析4--ip 报文发送
无论是从本地输出的数据还是转发的数据报文,经过路由后都要输出到网络设备,而输出到网络设备的接口就是dst_output(output)函数 路由的时候,dst_output函数设置为ip_output ...
- IP 层收发报文简要剖析6--ip_forward 报文转发
//在函数ip_route_input_slow->ip_mkroute_input注册, /* * IP数据包的转发是由ip_forward()处理,该函数在ip_rcv_finish() * ...
- Linux 网卡驱动学习(六)(应用层、tcp 层、ip 层、设备层和驱动层作用解析)
本文将介绍网络连接建立的过程.收发包流程,以及当中应用层.tcp层.ip层.设备层和驱动层各层发挥的作用. 1.应用层 对于使用socket进行网络连接的server端程序.我们会先调用socket函 ...
- 老斜两宗事-七层代理模式还是IP层VPN
1.七层代理模式还是IP层VPN 非常多人会问,我究竟是使用代理模式呢,还是使用VPN模式,假设我想数据在中间不安全的链路上实现加密保护的话.这个问题有一个背景.那就是,你想保护你的数据,能够使用VP ...
- Linux内核IP层的报文处理流程(一)
本文主要讲解了Linux内核IP层的整体架构和对从网卡接受的报文处理流程,使用的内核的版本是2.6.32.27 为了方便理解,本文采用整体流程图加伪代码的方式对Linxu内核中IP整体实现架构和对网卡 ...
随机推荐
- 网页添加 Live2D 看板娘
我是先参考别人的[点击跳转]博客来做的.不过我发现网上很多人都没有把一些细节写出来,用了别人那里下载的文件后里面的一些跳转链接就跳到他们的页面了.所以我这里写一写如何修改这些跳转链接吧. 1. ...
- 【树形DP】BZOJ 1131 Sta
题目内容 给出一个\(N\)个点的树,找出一个点来,以这个点为根的树时,所有点的深度之和最大 输入格式 给出一个数字\(N\),代表有\(N\)个点.\(N \le 1000000\).下面\(N-1 ...
- eShopOnContainers 知多少[12]:Envoy gateways
1. 引言 在最新的eShopOnContainers 3.0 中Ocelot 网关被Envoy Proxy 替换.下面就来简要带大家了解下Envoy,并尝试梳理下为什么要使用Envoy替代Ocelo ...
- 手撸ORM浅谈ORM框架之Add篇
快速传送 手撸ORM浅谈ORM框架之基础篇 手撸ORM浅谈ORM框架之Add篇 手撸ORM浅谈ORM框架之Update篇 手撸ORM浅谈ORM框架之Delete篇 手撸ORM浅谈ORM框架之Query ...
- sql优化整理(一)
sql的编写语法是这样的: SELECT DISTINCT <select_list> FROM <left_table> <join_type> JOIN < ...
- airtest本地连接和远程连接
一.本地连接 # -*- coding: utf-8 -*-# from poco.drivers.android.uiautomation import AndroidUiautomationPoc ...
- python抓取动态验证码,具体第几帧数的位置静态图片
一.代码+注解 import os from PIL import Image import requests import io def save_img(): headers = { 'User- ...
- Python ( 学习基础篇 第二部 )
目录 运算符 算数运算符 比较运算符 赋值运算符 位运算符 逻辑运算符 成员运算符 身份运算符 Python 中运算符的优先级 运算符总结基础语法 判断类型 isinstence 代码块 流程控制 w ...
- Redis学习笔记(六)——数据结构之Set
一.介绍 Redis的Set是string类型的无序集合.集合成员是唯一的,这就意味着集合中不能出现重复的数据. Redis中集合是通过哈希表实现的,所以添加.删除.查找的复杂度都是O(1). 集合中 ...
- 【Deeplearning】(转)深度学习知识网络
转自深度学习知识框架,小象牛逼! 图片来自小象学院公开课,下面直接解释几条线 神经网络 线性回归 (+ 非线性激励) → 神经网络 有线性映射关系的数据,找到映射关系,非常简单,只能描述简单的映射关系 ...