ICMP 实现
以下代码取自 Linux 内核 2.6 系列(原文中的具体小版本号已缺失)。 [数据结构]
struct icmp_control {
void (*handler)(struct sk_buff *skb); //icmp处理函数,根据icmp的类型字段
short error; /* This ICMP is classed as an error message */
};
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+]; //每个icmp类型有一个项 [/数据结构]
[初始化]
文件net/ipv4/af_inet.c中,函数
/*
 * Excerpt of inet_init() showing only the ICMP-related steps; the rest of
 * the function is elided in this article.
 */
static int __init inet_init(void)
{
	/* ... (unrelated initialisation omitted) ... */

	/*
	 * Register icmp_protocol for IPPROTO_ICMP; a negative return is
	 * treated as failure and only logged.
	 */
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");

	/* ... */

	icmp_init(&inet_family_ops);	/* ICMP protocol initialisation */

	/* ... */
}
icmp初始化函数
/* One ICMP control socket per CPU. */
static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;

/*
 * icmp_init - create and configure the per-CPU ICMP control sockets.
 * @ops: protocol family operations (not used in the code shown here).
 *
 * Panics if a socket cannot be created.
 */
void __init icmp_init(struct net_proto_family *ops)
{
	struct inet_sock *inet;
	int i;

	for_each_possible_cpu(i) {
		int err;

		/* Create a raw ICMP kernel socket for this CPU. */
		err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
				       &per_cpu(__icmp_socket, i));
		if (err < 0)
			panic("Failed to create the ICMP control socket.\n");

		/* Allocations made on behalf of this socket use GFP_ATOMIC. */
		per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;

		/*
		 * Enough space for 2 * 64K ICMP packets, including sk_buff
		 * struct overhead.
		 */
		per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
			(2 * ((64 * 1024) + sizeof(struct sk_buff)));

		/* View the sock as an inet_sock to reach the IP-level fields. */
		inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
		inet->uc_ttl = -1;
		inet->pmtudisc = IP_PMTUDISC_DONT;

		/*
		 * Unhash it so that IP input processing does not even see it,
		 * we do not wish this socket to see incoming packets.
		 */
		per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
	}
}
[/初始化]
[协议处理实现]
注册的协议处理函数,当ip向上递交数据包时,如果发现是icmp协议就会调用这个函数。
/*
 * Protocol descriptor for ICMP. inet_init() registers it for IPPROTO_ICMP;
 * its receive handler is icmp_rcv().
 */
static struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
};
处理进入的icmp包
int icmp_rcv(struct sk_buff *skb)
{
struct icmphdr *icmph;
struct rtable *rt = (struct rtable *)skb->dst; //路由缓存 ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); switch (skb->ip_summed) { //skb的ip校验和标志
case CHECKSUM_COMPLETE:
if (!csum_fold(skb->csum)) //没有伪头部的校验和检测
break; /* fall through */
case CHECKSUM_NONE:
skb->csum = ;
if (__skb_checksum_complete(skb)) //全部内容的校验和检测
goto error;
}
if (!pskb_pull(skb, sizeof(struct icmphdr))) //是否有icmp头空间,如果有移动data指针到icmp头后面
goto error; icmph = icmp_hdr(skb); //获取icmp头
ICMPMSGIN_INC_STATS_BH(icmph->type); /*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently discarded.
*/
if (icmph->type > NR_ICMP_TYPES)
goto error;
//icmp是发送到本地的多播或广播地址
if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
/* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently ignored (we let user decide with a sysctl). * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded if to broadcast/multicast.*/
if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && sysctl_icmp_echo_ignore_broadcasts) {
goto error;
}
//除了回显和时间截,地址掩码请求和应答,其他到广播和多播的icmp包全部丢弃
if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) {
goto error;
}
}
icmp_pointers[icmph->type].handler(skb); //根据icmp类型调用相应的处理函数
drop:
kfree_skb(skb); //处理完了释放skb
return ;
error:
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
goto drop;
}
类型处理函数在内核中被静态的初始化.
/*
 * Statically initialised dispatch table indexed by ICMP type.
 * Types without a named constant use their numeric value as designator.
 * .error = 1 marks the type as an error message; entries that omit it are
 * zero-initialised (not an error message).
 */
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
	[ICMP_ECHOREPLY] = {
		.handler = icmp_discard,	/* no-op handler */
	},
	[1] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[2] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[ICMP_DEST_UNREACH] = {
		.handler = icmp_unreach,
		.error = 1,
	},
	[ICMP_SOURCE_QUENCH] = {
		.handler = icmp_unreach,
		.error = 1,
	},
	[ICMP_REDIRECT] = {
		.handler = icmp_redirect,
		.error = 1,
	},
	[6] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[7] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[ICMP_ECHO] = {
		.handler = icmp_echo,
	},
	[9] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[10] = {
		.handler = icmp_discard,
		.error = 1,
	},
	[ICMP_TIME_EXCEEDED] = {
		.handler = icmp_unreach,
		.error = 1,
	},
	[ICMP_PARAMETERPROB] = {
		.handler = icmp_unreach,
		.error = 1,
	},
	[ICMP_TIMESTAMP] = {
		.handler = icmp_timestamp,
	},
	[ICMP_TIMESTAMPREPLY] = {
		.handler = icmp_discard,
	},
	[ICMP_INFO_REQUEST] = {
		.handler = icmp_discard,
	},
	[ICMP_INFO_REPLY] = {
		.handler = icmp_discard,
	},
	[ICMP_ADDRESS] = {
		.handler = icmp_address,
	},
	[ICMP_ADDRESSREPLY] = {
		.handler = icmp_address_reply,
	},
};
我们一个一个看。
icmp接收到差错报文的处理。由 icmp_unreach 处理的类型包括 ICMP_DEST_UNREACH、ICMP_SOURCE_QUENCH、ICMP_TIME_EXCEEDED 和 ICMP_PARAMETERPROB(见上面的 icmp_pointers 表)。
/*
 * icmp_unreach - handle received ICMP error messages.
 * @skb: packet with skb->data positioned after the ICMP header, i.e. at the
 *       embedded IP header of the offending datagram.
 *
 * Installed in icmp_pointers[] for ICMP_DEST_UNREACH, ICMP_SOURCE_QUENCH,
 * ICMP_TIME_EXCEEDED and ICMP_PARAMETERPROB. Computes an "info" value
 * and passes the error to matching raw sockets and to the err_handler of
 * the embedded packet's protocol.
 */
static void icmp_unreach(struct sk_buff *skb)
{
	struct iphdr *iph;
	struct icmphdr *icmph;
	int hash, protocol;
	struct net_protocol *ipprot;
	struct sock *raw_sk;
	u32 info = 0;

	/* Does the payload contain at least a basic embedded IP header? */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto out_err;

	icmph = icmp_hdr(skb);			/* ICMP header */
	iph = (struct iphdr *)skb->data;	/* embedded IP header */

	if (iph->ihl < 5)	/* Mangled header, drop. */
		goto out_err;

	if (icmph->type == ICMP_DEST_UNREACH) {
		switch (icmph->code & 15) {
		case ICMP_NET_UNREACH:
		case ICMP_HOST_UNREACH:
		case ICMP_PROT_UNREACH:
		case ICMP_PORT_UNREACH:
			break;
		case ICMP_FRAG_NEEDED:
			if (ipv4_config.no_pmtu_disc) {
				LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: fragmentation needed and DF set.\n", NIPQUAD(iph->daddr));
			} else {
				/*
				 * Hand the advertised MTU to the routing
				 * code; a zero result means there is
				 * nothing to report.
				 */
				info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu));
				if (!info)
					goto out;
			}
			/* Must not fall through into the source-route log below. */
			break;
		case ICMP_SR_FAILED:
			LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source Route Failed.\n", NIPQUAD(iph->daddr));
			break;
		default:
			break;
		}
		if (icmph->code > NR_ICMP_UNREACH)	/* invalid unreachable code */
			goto out;
	} else if (icmph->type == ICMP_PARAMETERPROB)
		/* The top byte of the 32-bit field is passed on as info. */
		info = ntohl(icmph->un.gateway) >> 24;

	/*
	 * Some routers send bogus error responses to broadcast frames;
	 * warn (rate limited) and ignore them unless the sysctl says to
	 * ignore silently.
	 */
	if (!sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(iph->daddr) == RTN_BROADCAST) {
		if (net_ratelimit())
			printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP type %u, code %u "
			       "error to a broadcast: %u.%u.%u.%u on %s\n", NIPQUAD(ip_hdr(skb)->saddr),
			       icmph->type, icmph->code, NIPQUAD(iph->daddr), skb->dev->name);
		goto out;
	}

	/*
	 * Checkin full IP header plus 8 bytes of protocol to avoid
	 * additional coding at protocol handlers.
	 */
	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
		goto out;

	/* pskb_may_pull() may have moved the data; reload the pointer. */
	iph = (struct iphdr *)skb->data;
	protocol = iph->protocol;

	/*
	 * Deliver the ICMP error to matching raw sockets.
	 * (The article's author asks why this is done.)
	 */
	hash = protocol & (MAX_INET_PROTOS - 1);
	read_lock(&raw_v4_lock);
	if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
		while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, iph->saddr, skb->dev->ifindex)) != NULL) {
			raw_err(raw_sk, skb, info);
			raw_sk = sk_next(raw_sk);
			iph = (struct iphdr *)skb->data;
		}
	}
	read_unlock(&raw_v4_lock);

	/* Let the embedded packet's protocol handle the error, if it has an err_handler. */
	rcu_read_lock();
	ipprot = rcu_dereference(inet_protos[hash]);
	if (ipprot && ipprot->err_handler)
		ipprot->err_handler(skb, info);
	rcu_read_unlock();
out:
	return;
out_err:
	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
	goto out;
}
icmp重定向处理
static void icmp_redirect(struct sk_buff *skb)
{
struct iphdr *iph; if (skb->len < sizeof(struct iphdr)) 长度检测
goto out_err;
/* Get the copied header of the packet that caused the redirect */
if (!pskb_may_pull(skb, sizeof(struct iphdr))) //ip头长度检测
goto out; iph = (struct iphdr *)skb->data; //取出ip头 switch (icmp_hdr(skb)->code & ) { //编码
case ICMP_REDIR_NET: //网络重定向
case ICMP_REDIR_NETTOS:
/* As per RFC recommendations now handle it as a host redirect.*/
case ICMP_REDIR_HOST: //主机重定向
case ICMP_REDIR_HOSTTOS:
//在路由告诉缓存中,更新相同缓存项的rt_gateway字段
ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, icmp_hdr(skb)->un.gateway, iph->saddr, skb->dev);
break;
}
out:
return;
out_err:
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
goto out;
}
icmp回显请求
/*
 * icmp_echo - answer an ICMP echo request.
 * @skb: received echo request.
 *
 * Unless sysctl_icmp_echo_ignore_all is set, builds reply parameters from
 * the request (same header with type changed to ICMP_ECHOREPLY, whole
 * payload echoed back) and sends them through icmp_reply().
 */
static void icmp_echo(struct sk_buff *skb)
{
	if (!sysctl_icmp_echo_ignore_all) {
		struct icmp_bxm icmp_param;

		/* Start from a copy of the request's ICMP header. */
		icmp_param.data.icmph = *icmp_hdr(skb);
		icmp_param.data.icmph.type = ICMP_ECHOREPLY;
		icmp_param.skb = skb;
		icmp_param.offset = 0;			/* copy payload from the start */
		icmp_param.data_len = skb->len;
		icmp_param.head_len = sizeof(struct icmphdr);
		icmp_reply(&icmp_param, skb);
	}
}
/*
 * icmp_reply - send an ICMP reply built from @icmp_param in answer to @skb.
 * @icmp_param: reply description (ICMP header, payload source, lengths).
 * @skb:        the request being answered.
 *
 * Echoes the request's IP options, routes back to the sender and, if the
 * rate limiter allows it, pushes the reply out on the ICMP socket.
 */
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
	struct sock *sk = icmp_socket->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ipcm_cookie ipc;
	struct rtable *rt = (struct rtable *)skb->dst;	/* cached route */
	__be32 daddr;

	/* Build the reply's IP options from those in the request. */
	if (ip_options_echo(&icmp_param->replyopts, skb))
		return;

	/* Take the ICMP transmit lock; give up if that fails. */
	if (icmp_xmit_lock())
		return;

	/* The checksum is filled in later by icmp_push_reply(). */
	icmp_param->data.icmph.checksum = 0;

	inet->tos = ip_hdr(skb)->tos;
	daddr = ipc.addr = rt->rt_src;		/* reply goes to the request's source */
	ipc.opt = NULL;
	if (icmp_param->replyopts.optlen) {	/* there are IP options to send */
		ipc.opt = &icmp_param->replyopts;
		/* With source routing, the first hop is the route's faddr. */
		if (ipc.opt->srr)
			daddr = icmp_param->replyopts.faddr;
	}
	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    .proto = IPPROTO_ICMP };

		security_skb_classify_flow(skb, &fl);
		/* Route lookup; if it fails nothing is sent. */
		if (ip_route_output_key(&rt, &fl))
			goto out_unlock;
	}
	/* Send only if the rate limiter allows this type/code. */
	if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code))
		icmp_push_reply(icmp_param, &ipc, rt);
	/* Drop the route obtained from ip_route_output_key(). */
	ip_rt_put(rt);
out_unlock:
	icmp_xmit_unlock();
}
判断应答是否发送
/*
 * icmpv4_xrlim_allow - decide whether an ICMP message may be sent now.
 * @rt:   route the message will use.
 * @type: ICMP type of the outgoing message.
 * @code: ICMP code of the outgoing message.
 *
 * Returns 1 if the message may be sent, 0 if it is rate limited.
 */
static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
{
	struct dst_entry *dst = &rt->u.dst;
	int rc = 1;

	/*
	 * Out-of-range types are not limited.
	 * NOTE (article author): considers this a bug and suggests rc = 0
	 * here; the code is left as in the kernel source.
	 */
	if (type > NR_ICMP_TYPES)
		goto out;

	/* Don't limit PMTU discovery. */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
		goto out;

	/* No rate limit on loopback */
	if (dst->dev && (dst->dev->flags & IFF_LOOPBACK))
		goto out;

	/* Limit if icmp type is enabled in ratemask. */
	if ((1 << type) & sysctl_icmp_ratemask)
		rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
out:
	return rc;
}
#define XRLIM_BURST_FACTOR 6
int xrlim_allow(struct dst_entry *dst, int timeout)
{
unsigned long now;
int rc = ; //不发送 now = jiffies;
dst->rate_tokens += now - dst->rate_last; //累加过去的时间
dst->rate_last = now; //最后使用时间 if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) //累加时间超过指定的范围
dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; //设为最大值 if (dst->rate_tokens >= timeout) { //超过用户配置的时间限制
dst->rate_tokens -= timeout; //递减配置的时间限制
rc = ; //发送
}
return rc;
}
发送icmp应答函数
static void icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt)
{
struct sk_buff *skb;
//分配skb拷贝接收的skb数据到新分配的skb内存中,新skb被链入到icmp_socket->sk->sk_write_queue中.
if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < )
ip_flush_pending_frames(icmp_socket->sk); //拷贝失败
else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { //提取分配的skb
struct icmphdr *icmph = icmp_hdr(skb);
__wsum csum = ;
struct sk_buff *skb1;
//计算校验和
skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
csum = csum_add(csum, skb1->csum);
}
csum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)icmph, icmp_param->head_len, csum);
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
ip_push_pending_frames(icmp_socket->sk); //发送队列中的skb
}
}
/*
 * icmp_glue_bits - getfrag callback used by ip_append_data() for ICMP.
 * @from:   the struct icmp_bxm describing the reply.
 * @to:     destination buffer.
 * @offset: offset within the reply payload.
 * @len:    number of bytes to copy.
 * @odd:    offset passed to csum_block_add() when folding in the checksum.
 * @skb:    skb being filled; its csum accumulates the copied data.
 *
 * Always returns 0.
 */
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
	__wsum csum;

	/* Copy the data out of the original skb, checksumming as we go. */
	csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len, 0);
	/* Accumulate into this skb's running checksum. */
	skb->csum = csum_block_add(skb->csum, csum, odd);
	/* For error-class ICMP types, attach conntrack info from the original skb. */
	if (icmp_pointers[icmp_param->data.icmph.type].error)
		nf_ct_attach(skb, icmp_param->skb);
	return 0;
}
拷贝数据到ip数据负载部分,如果需要将所有碎片链入到sk->sk_write_queue队列中
/*
 * ip_append_data - copy data into the IP payload area of pending skbs.
 * @sk:          sending socket; skbs are queued on sk->sk_write_queue.
 * @getfrag:     callback that copies (and checksums) @len bytes at @offset
 *               from @from into @to.
 * @from:        opaque source handed to @getfrag.
 * @length:      bytes to append, including the transport header.
 * @transhdrlen: transport header length; non-zero only for the first call.
 * @ipc:         IP control info (options, address).
 * @rt:          route to use.
 * @flags:       MSG_* flags.
 *
 * Fragments are queued on sk->sk_write_queue as needed.
 * Returns 0 on success or a negative errno.
 */
int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/* Write queue empty: set up the cork state. */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				/*
				 * Room for struct ip_options plus 40 bytes of
				 * option data (the IPv4 maximum).
				 */
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			/* Save the caller's IP options in the cork. */
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		/*
		 * With IP_PMTUDISC_PROBE use the device MTU, otherwise the
		 * path MTU; this becomes the fragment size.
		 */
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			/* Extra header space is requested by the route. */
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		/* Queue not empty: continue with the saved cork state. */
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);	/* link-layer header room */

	/* IP header length of each fragment, including options. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Largest fragment whose payload is a multiple of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* Total would exceed the maximum IP datagram size (65535 minus the IP header). */
	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen && length + fragheaderlen <= mtu && rt->u.dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	inet->cork.length += length;

	/* Larger than the MTU, UDP, and the device advertises NETIF_F_UFO. */
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && (rt->u.dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)	/* queue is empty */
		goto alloc_new_skb;

	while (length > 0) {
		/*
		 * Check if the remaining data fits into current packet.
		 * NOTE (article author): suggests mtu here should be
		 * maxfraglen, which would make fraggap unnecessary.
		 */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu, we know we need
			 * more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;	/* full length of this fragment */

			if ((flags & MSG_MORE) && !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be the
			 * last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				/* First fragment (carries the transport header). */
				skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/* Fill in the control structures */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);	/* leave room for the link-layer header */

			/* Find where to start putting bytes. */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			/* Transport header follows the IP header (fragheaderlen may include options). */
			skb->transport_header = (skb->network_header + fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				/*
				 * Move the bytes beyond maxfraglen from the
				 * previous skb into this one and fix up both
				 * checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum, skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);	/* shrink the previous skb */
			}

			/* datalen covers transport header + data; copy only the data part. */
			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;	/* includes the transport header length */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/* Put the packet on the pending queue. */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			/* No scatter-gather: append to the skb's linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into page fragments. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				/* Space remains in the socket's current page. */
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				/* Start a fresh page. */
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the accounting for the part that was not appended. */
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
ip_append_data函数失败时就会调用这个函数释放队列中的所有skb
/*
 * ip_flush_pending_frames - throw away everything queued for sending.
 * @sk: socket whose sk_write_queue is emptied.
 *
 * Frees each pending skb (taken from the tail of the queue) and then
 * releases the socket's cork state.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *pending;

	for (;;) {
		pending = __skb_dequeue_tail(&sk->sk_write_queue);
		if (pending == NULL)
			break;
		kfree_skb(pending);
	}

	ip_cork_release(inet_sk(sk));
}
icmp_push_reply-> 取出队列中的skb,然后添加完整的ip头然后发送出去
int ip_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk);
struct ip_options *opt = NULL;
struct rtable *rt = inet->cork.rt;
struct iphdr *iph;
__be16 df = ;
__u8 ttl;
int err = ; if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) //取出一个skb
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list); //指向分片连表头 /* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb)); //移动data指针到ip头位置 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { //循环出队所有skb
__skb_pull(tmp_skb, skb_network_header_len(skb)); //移动data到传输层头位置
*tail_skb = tmp_skb; //当执行第一次时等于是(skb_shinfo(skb)->frag_list) = tmp_skb
tail_skb = &(tmp_skb->next); //指向了tmp_skb的next
//累加这个包的长度
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
__sock_put(tmp_skb->sk); //递减sock的引用计数
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
//到这就是把所有在sk->sk_write_queue中的skb(所有分片)组合到第一个skb的skb_shinfo(skb)->frag_list连表中了。 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow to fragment the frame generated here.
* No matter, what transforms how transforms change size of the packet, it will come out. */
if (inet->pmtudisc < IP_PMTUDISC_DO)
skb->local_df = ; //不分片
/* DF bit is set when we want to see DF on outgoing frames.
* If local_df is set too, we still allow to fragment this frame locally. */
if (inet->pmtudisc >= IP_PMTUDISC_DO || (skb->len <= dst_mtu(&rt->u.dst) && ip_dont_fragment(sk, &rt->u.dst)))
df = htons(IP_DF); //设置不分片标志
if (inet->cork.flags & IPCORK_OPT) //有ip选项
opt = inet->cork.opt; if (rt->rt_type == RTN_MULTICAST) //多播ttl
ttl = inet->mc_ttl;
else
ttl = ip_select_ttl(inet, &rt->u.dst); //单播,需要计算 iph = (struct iphdr *)skb->data; //在第一个skb中添加ip头
iph->version = ;
iph->ihl = ;
if (opt) {
iph->ihl += opt->optlen>>;
ip_options_build(skb, opt, inet->cork.addr, rt, );
}
iph->tos = inet->tos;
iph->tot_len = htons(skb->len);
iph->frag_off = df;
ip_select_ident(iph, &rt->u.dst, sk); //选择一个ip标识
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
ip_send_check(iph); //校验和 skb->priority = sk->sk_priority;
skb->dst = dst_clone(&rt->u.dst); if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(((struct icmphdr *)skb_transport_header(skb))->type); //更新一些统计信息 //发送这个skb到netfilter的LOCAL_OUT hook
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
if (err) {
if (err > )
err = inet->recverr ? net_xmit_errno(err) : ; if (err)
goto error;
}
out:
ip_cork_release(inet);
return err;
error:
IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
goto out;
}
到这里需要简单说明一下:我们跟踪的其实是icmp回显请求相关的流程,其中的ip分片应该基本不会发生,
但这些函数是ip层通用的发送函数,所以有些地方看起来十分复杂。
icmp时间戳请求处理
/*
 * icmp_timestamp - answer an ICMP timestamp request.
 * @skb: received request, skb->data positioned after the ICMP header.
 *
 * times[0] is copied from the first 4 bytes of the request, times[1] and
 * times[2] are both set to the current time in milliseconds since
 * midnight UT. The reply carries the ICMP header plus these 12 bytes.
 */
static void icmp_timestamp(struct sk_buff *skb)
{
	struct timeval tv;
	struct icmp_bxm icmp_param;

	/* Too short to contain the 4 bytes copied below. */
	if (skb->len < 4)
		goto out_err;

	/* Fill in the current time as ms since midnight UT: */
	do_gettimeofday(&tv);
	icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
	icmp_param.data.times[2] = icmp_param.data.times[1];

	/* Copy the first 4 bytes of the request into times[0]. */
	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
		BUG();

	icmp_param.data.icmph = *icmp_hdr(skb);
	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
	icmp_param.data.icmph.code = 0;
	icmp_param.skb = skb;
	icmp_param.offset = 0;
	icmp_param.data_len = 0;	/* nothing is copied from the request skb */
	icmp_param.head_len = sizeof(struct icmphdr) + 12;	/* header + three 32-bit timestamps */
	icmp_reply(&icmp_param, skb);
out:
	return;
out_err:
	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
	goto out;
}
地址掩码请求,linux没有实现它,参考内核中这函数的注释
/*
 * Handler installed for ICMP_ADDRESS (address mask request) in
 * icmp_pointers[]. It does nothing: the only statement is a rate-limited
 * debug printk that is compiled out with #if 0.
 */
static void icmp_address(struct sk_buff *skb)
{
#if 0
if (net_ratelimit())
printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
#endif
}
地址掩码应答处理
static void icmp_address_reply(struct sk_buff *skb)
{
struct rtable *rt = (struct rtable *)skb->dst; //路由缓存
struct net_device *dev = skb->dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
//长度不对或没有标志重定向源地址
if (skb->len < || !(rt->rt_flags & RTCF_DIRECTSRC))
goto out;
in_dev = in_dev_get(dev);
if (!in_dev)
goto out;
rcu_read_lock();
//设备有地址,打开调试项,设备允许转发
if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) {
__be32 _mask, *mp;
//取出掩码
mp = skb_header_pointer(skb, , sizeof(_mask), &_mask);
BUG_ON(mp == NULL);
for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
//循环所有地址,如果掩码匹配且路由地址也匹配
if (*mp == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa))
break;
}
if (!ifa && net_ratelimit()) { //都不匹配
printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from %s/%u.%u.%u.%u\n",
NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
}
}
rcu_read_unlock();
in_dev_put(in_dev);
out:;
}
[/协议处理实现]
ICMP 实现的更多相关文章
- [协议]ICMP协议剖析
1.ICMP简介 ICMP全名为(INTERNET CONTROL MESSAGE PROTOCOL)网络控制消息协议. ICMP的协议号为1. ICMP报文就像是IP报文的小弟,总顶着IP报文的名头 ...
- 简单了解ICMP协议
ping命令是什么协议? 维基百科: ping是一种电脑网络工具,用来测试数据包能否通过IP协议到达特定主机.ping的运作原理是向目标主机传出一个ICMP echo@要求数据包,并等待接受echo回 ...
- ICMP的应用--Traceroute
Traceroute是用来侦测主机到目的主机之间所经路由情况的重要工具,也是最便利的工具.前面说到,尽管ping工具也可以进行侦测,但是,因为ip头的限制,ping不能完全的记录下所经过的路由器.所以 ...
- ICMP Protocol
[ICMP Protocol] 参考: 1.ICMP Types and Codes:http://www.nthelp.com/icmp.html 2.RFC 792 - Internet Cont ...
- TCP协议学习记录 (一) ICMP时间戳请求
程序只实现了获取时间戳,至于将时间戳转换成具体日期和时间,暂时没有好的办法. #define TIME_STAMP_REQUEST 13 struct iphdr { unsigned ; //包头长 ...
- 002.ICMP--拼接ICMP包,实现简单Ping程序(原始套接字)
一.大致流程: 将ICMP头和时间数据设置好后,通过创建好的原始套接字socket发出去.目的主机计算效验和后会将数据原样返回,用当前时间和返回的数据结算时间差,计算出rtt. 二.数据结构: ICM ...
- linux原始套接字(2)-icmp请求与接收
一.概述 上一篇arp请求使用的是链路层的原始套接字.icmp封装在ip数据报里面,所以icmp请 ...
- 网络错误定位案例 ICMP host *** unreachable - admin prohibited
1. 环境 一台物理服务器 9.115.251.86,上面创建两个虚机,每个虚机两个网卡: vm1:eth0 - 9.*.*.232 eth1:10.0.0.14 vm2: eth0 - 9.8.*. ...
- GO语言练习:网络编程 ICMP 示例
1.代码 2.编译及运行 1.Go语言网络编程:ICMP示例代码 icmptest.go package main import ( "fmt" "net" & ...
- 一个ICMP单元
unit ICMPUtils; interface {$IFDEF VER80} { This source file is *NOT* compatible with Delphi 1 becaus ...
随机推荐
- Win7/Win8.1预订升级Win10失败临时解决方案
很多Win7/Win8.1用户在今天凌晨通过微软官方推送的方式升级Win10,但这一过程中遇到了“安装失败”等问题,导致升级无法进行.鉴于这种情况,很多用户选择进入Windows10预下载安装文件夹打 ...
- 关于cnpm的一点小bug
在实际工作中,一个项目完成后,在上线前,常常需要把代码进行压缩,一般是用gulp或者 webpack 进行压缩.(小妹是用gulp) gulp是运行在node 环境下的. 所以首先,下载并安装了nod ...
- C++11中新特性之:initializer_list详解
C++11提供的新类型,定义在<initializer_list>头文件中. template< class T > class initializer_list; 先说它的用 ...
- JFreeChart入学教程
JFreeChart入学教程 2011-08-08 14:55:19| 分类: 技术篇 |举报 |字号 订阅 JFreeChart 是一组功能强大.灵活易用的Java绘图API,使用它可以生成多 ...
- java中动态反射
java中动态反射能达到的效果和python的语法糖很像,能够截获方法的实现,在真实方法调用之前和之后进行修改,甚至能够用自己的实现进行特别的替代,也可以用其实现面向切片的部分功能.动态代理可以方便实 ...
- c#反射机制判断同一个类的两个实例的值是否完全一样
; i < properties1.Length; i++) { string s = properties1[i].DeclaringTyp ...
- vi 编辑器的使用
1) vi的自动对齐功能 我从window的网页上拷贝了一段代码到vi中,结果是不对齐的.见下图 此时为了对齐,我的做法是: ESC-v 进入视图模式,然后全选 再然后直接按 = 号. 然 ...
- mysql5.7版本无法启动服务问题
cmd情况下进入mysql的bin目录后 输入命令:mysqld --initialize-insecure d:\mysql\bin
- UI控件切圆角
1. xib下设置View圆角 这个很简单, 只需要重写 - (void)drawRect:(CGRect)rect 方法就行了 1 2 3 4 5 6 - (void)drawRect:(CGRe ...
- Pentaho Data Integration Step: BD Procedure Call
官网连接:http://wiki.pentaho.com/display/EAI/Call+DB+Procedure 描述 调用数据库存储过程步骤允许用户执行一个数据库存储过程,并且得到结果.存储过程 ...