Socket层实现系列 — bind()的实现（二）

本文主要内容：bind()的TCP层实现、端口的冲突处理，以及不同内核版本的实现差异。

内核版本：3.6

Author：zhangskd @ csdn blog

TCP层实现

SOCK_STREAM套接口的TCP层操作函数集实例为tcp_prot，其中端口绑定函数为inet_csk_get_port()。

/* Excerpt of the TCP protocol operations table; members not relevant to bind() are elided with "...". */
struct proto tcp_prot = {

    .name = "TCP",

    .owner = THIS_MODULE,

    ...

    .get_port = inet_csk_get_port, /* port-binding hook used by bind() at the TCP layer */

    ...

};

和较早的内核版本不同，现在系统自动选择端口时，也可以复用端口了。

/* Obtain a reference to a local port for the given sock,

 * if snum is zero it means select any available local port.

 */

int inet_csk_get_port(struct sock *sk, unsigned short snum)

{

    struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; /* 指向tcp_hashinfo */

    struct inet_bind_hashbucket *head;

    struct hlist_node *node;

    struct inet_bind_bucket *tb;

    int ret, attempts = 5;

    struct net *net = sock_net(sk);

    int smallest_size = -1, smallest_rover;

    local_bh_disable(); /* 禁止下半部，防止和进程冲突 */

    /* 如果snum为0，系统自动为sock选择一个端口号 */

    if (! snum) {

        int remaining, rover, low, high;

again:

        inet_get_local_port_range(&low, &high); /* 获取端口号的取值范围 */

        remaining = (high - low) + 1; /* 取值范围内端口号的个数 */

        smallest_rover = rover = net_random() % remaining + low; /* 随机选取范围内的一个端口 */

        smallest_size = -1;

        do {

            if (inet_is_reserved_local_port(rover)) /* 查看端口是否属于保留的 */

                goto next_nolock; /* rover加1，继续 */

            /* 根据端口号，确定所在的哈希桶 */

            head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)];

            spin_lock( &head->lock); /* 锁住哈希桶 */

            inet_bind_bucket_for_each(tb, node, &head->chain) /* 从头遍历哈希桶 */

                /* 如果端口被使用了 */

                if (eq(ib_net(tb), net) && tb->port == rover) { 

                    if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN &&

                        tb->num_owners < smallest_size || smallest_size == -1)) {

                        smallest_size = tb->num_owners; /* 记下这个端口使用者的个数 */

                        smallest_rover = rover; /* 记下这个端口 */

                        /* 如果系统绑定的端口已经很多了，那么就判断端口是否有绑定冲突*/

                        if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&

                            ! inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {

                            snum = smallest_rover; /* 没有冲突，使用此端口 */

                            goto tb_found;

                        }

                    }

                    /* 检查是否有端口绑定冲突，该端口是否能重用 */

                    if (! inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {

                        snum = rover;

                        goto tb_found;

                    }

                    goto next; /* 此端口不可重用，看下一个 */

                }

            break; /* 找到了没被用的端口，退出 */

next:

            spin_unlock(&head->lock);

next_nolock:

            if (++rover > high)

                rover = low;

        } while(--remaining > 0);

        /* Exhausted local port range during search? It is not possible for us to be holding

         * one of the bind hash locks if this test triggers, because if 'remaining' drops to zero,

         * we broke out of the do/while loop at the top level, not from the 'break' statement.

         */

        ret = 1;

        if (remaining <= 0) { /* 完全遍历 */

            if (smallest_size != -1) {

                snum = smallest_rover;

                goto have_snum;

            }

            goto fail;

        }

        /* OK, here is the one we will use.

         * HEAD is non-NULL and we hold it's mutex.

         */

        snum = rover; /* 自动选择的可用端口 */

    } else { /* 如果有指定要绑定的端口 */

have_snum:

        head = &hashinfo->bhash[inet_bhashfn(net, snum, hashinfo->bhash_size)];

        spin_lock(&head->lock);

        inet_bind_bucket_for_each(tb, node, &head->chain)

            if (net_eq(ib_net(tb), net) && tb->port == snum)

                goto tb_found; /* 发现端口在用 */

    }

    tb = NULL;

    goto tb_not_found;

tb_found:

    /* 端口上有绑定sock时 */

    if (! hlist_empty(&tb->owners)) {

        /* 这是强制的绑定啊，不管端口是否会绑定冲突！*/

        if (sk->sk_reuse == SK_FORCE_REUSE)

            goto success;

        if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN &&

            smallest_size == -1) { /* 指定端口的情况 */

            goto success;

        } else {

            ret = 1;

            if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { /* 端口绑定冲突 */

                /* 自动分配的端口绑定冲突了，再次尝试，最多重试5次。

                 * 我觉得以下if不必要，因为自动选择时goto tb_found之前都有检测过了。

                 */

                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && smallest_size != -1

                    && --attempts >= 0) {

                    spin_unlock(&head->lock);

                    goto again;

                }

                goto fail_unlock; /* 失败 */

            }

        }

    }

tb_not_found:

    ret = 1;

    /* 申请和初始化一个inet_bind_bucket结构 */

    if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, net, head, snum)) == NULL)

        goto fail_unlock;

    if (hlist_empty(&tb->owners)) {

        if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)

            tb->fastreuse = 1;

        else

            tb->fastreuse = 0;

    } else if (tb->fastreuse && (! sk->sk_reuse || sk->sk_state == TCP_LISTEN))

        tb->fastreuse = 0;

success:

    /* 赋值icsk中的inet_bind_bucket */

    if (! inet_csk(sk)->icsk_bind_hash)

        inet_bind_hash(sk, tb, snum);

    WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);

    ret = 0;

fail_unlock:

    spin_unlock(&head->lock);

fail:

    local_bh_enable();

    return ret;

}

端口区间

我们可以指定系统自动分配端口号时，端口的区间：

/proc/sys/net/ipv4/ip_local_port_range，默认为：32768 61000

也可以指定要保留的端口区间：

/proc/sys/net/ipv4/ip_local_reserved_ports，默认为空

/* Sequence lock type: a sequence counter paired with a spinlock. */
typedef struct {

    unsigned sequence; /* sequence counter */

    spinlock_t lock; /* spinlock member */

}seqlock_t;

/* A local port interval together with the seqlock that guards it. */
struct local_ports {

    seqlock_t lock; /* seqlock; per the original note: suited to read-mostly data, writers take priority */

    int range[2]; /* range[0] is read as the low bound and range[1] as the high bound by inet_get_local_port_range() */

};

/* This struct holds the first and last local port number.

 * It is the port interval used when the kernel assigns a port automatically.

 */

struct local_ports sysctl_local_ports __read_mostly = {

    .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),

    .range = {32768, 61000}, /* initial low/high bounds */

};

当snum为0时，由系统自动分配端口号时，获取端口号的取值范围。

/* Copy the current automatic-assignment port bounds into *low and *high.
 * The pair is read under the seqlock and re-read for as long as
 * read_seqretry() asks for a retry.
 */
void inet_get_local_port_range(int *low, int *high)
{
    unsigned int seq;

    for (;;) {
        seq = read_seqbegin(&sysctl_local_ports.lock);

        *low = sysctl_local_ports.range[0];
        *high = sysctl_local_ports.range[1];

        if (!read_seqretry(&sysctl_local_ports.lock, seq))
            break; /* no retry requested: keep these values */
    }
}

端口绑定冲突

面向连接的、传输层的协议族相关的操作函数集：

/*

 * Pointers to address related TCP functions

 * (i.e. things that depend on the address family)

 */

struct inet_connection_sock_af_ops {

    ...

    /* Bind-conflict check; callers in this file treat a nonzero result as "conflict". */
    int (*bind_conflict) (const struct sock *sk, const struct inet_bind_bucket *tb, bool relax);

    ...

};

/* Excerpt of the IPv4 instance of the address-family operations; other members are elided with "...". */
const struct inet_connection_sock_af_ops ipv4_specific = {

    ...

    .bind_conflict = inet_csk_bind_conflict, /* decides whether binding to a port conflicts */

    ...

};

如果不冲突，则可以绑定该端口，返回0；如果冲突，则不可绑定该端口，返回1。

/*
 * Decide whether binding @sk to the port described by @tb conflicts with a
 * socket already bound to that port.
 *
 * @sk:    socket being bound
 * @tb:    bind bucket of the port; its owners list is walked
 * @relax: when true, two sockets that both have sk_reuse set are allowed to
 *         share the port as long as the already-bound one is not in
 *         TCP_LISTEN; when false even that combination counts as a conflict
 *
 * Returns nonzero if a conflicting owner was found, 0 otherwise.
 */
int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax)
{
    struct sock *sk2;
    struct hlist_node *node;
    int reuse = sk->sk_reuse;

    /* Unlike other sk lookup places we do not check for sk_net here, since all the socks
     * listed in tb->owners list belong to the same net - the one this bucket belongs to.
     */
    sk_for_each_bound(sk2, node, &tb->owners) {
        __be32 sk2_rcv_saddr;

        /* The socket itself and owners for which inet_v6_ipv6only() is true never conflict. */
        if (sk == sk2 || inet_v6_ipv6only(sk2))
            continue;

        /* Devices only clash if either side is unbound (0) or both are the same. */
        if (sk->sk_bound_dev_if && sk2->sk_bound_dev_if &&
            sk->sk_bound_dev_if != sk2->sk_bound_dev_if)
            continue;

        /* Sharing is tolerated only in relaxed mode, when both sockets have
         * sk_reuse set and the existing owner is not in TCP_LISTEN.
         */
        if (relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN)
            continue;

        /* Addresses clash if either side is 0 or both are equal. */
        sk2_rcv_saddr = sk_rcv_saddr(sk2);
        if (! sk2_rcv_saddr || ! sk_rcv_saddr(sk) || sk2_rcv_saddr == sk_rcv_saddr(sk))
            break; /* conflict */
    }

    /* node is non-NULL only when the walk stopped early on a conflict. */
    return node != NULL;
}

Q: 什么情况下会出现冲突呢？

A: 同时符合以下条件才会冲突：

1. 绑定的设备相同（任一方未绑定设备，即sk_bound_dev_if为0时，也视为相同）

2. 绑定的IP地址相同（任一方绑定的IP为0，即通配地址时，也视为相同）

3. 以下条件有一个成立：

3.1 要绑定的socket不允许重用

3.2 已绑定的socket不允许重用

3.3 已绑定的socket处于监听状态

3.4 relax参数为false

这样看来能够重用端口号的情况包括：

1. 绑定的设备不同

2. 绑定的IP地址不同

3. 要绑定的socket允许重用，且已绑定的socket允许重用，且已绑定的socket不处于监听状态，relax参数为true。

在这种情况下，有可能两个socket的(addr, port)、绑定的设备完全一样，所以增加了relax参数，可用于禁止。

我们看到系统自动选择端口时，relax为false，是不允许这种情况的。

inet_bind_bucket

inet_bind_bucket用来保存使用中的端口，以及绑定在该端口上的sock链表。

（1）创建

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 *
 * Returns the new bucket, already linked into head->chain, or NULL when
 * the allocation fails.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
                             struct inet_bind_hashbucket *head, const unsigned short snum)
{
    struct inet_bind_bucket *bucket;

    bucket = kmem_cache_alloc(cachep, GFP_ATOMIC);
    if (bucket == NULL)
        return NULL;

    write_pnet(&bucket->ib_net, hold_net(net)); /* record the owning network namespace */
    bucket->port = snum;                        /* port this bucket stands for */
    bucket->fastreuse = 0;
    bucket->num_owners = 0;
    INIT_HLIST_HEAD(&bucket->owners);           /* start with an empty owners list */
    hlist_add_head(&bucket->node, &head->chain); /* link the bucket into its hash chain */

    return bucket;
}

（2）销毁

/* Unlink and free a bind bucket, but only when no socket is bound to it any more. */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
    /* Still has owners: leave the bucket alone. */
    if (!hlist_empty(&tb->owners))
        return;

    __hlist_del(&tb->node);
    release_net(ib_net(tb));
    kmem_cache_free(cachep, tb);
}

（3）赋值

更新赋值icsk的inet_bind_bucket类型成员icsk_bind_hash。

/* Attach sk to bind bucket tb for port snum and update the bookkeeping counters. */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum)
{
    struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;

    /* One more bound socket in this hash table overall. */
    atomic_inc(&table->bsockets);

    /* Remember the bound port on the socket. */
    inet_sk(sk)->inet_num = snum;

    /* Link the socket into the bucket's owners list and count it. */
    sk_add_bind_node(sk, &tb->owners);
    ++tb->num_owners;

    /* Point the socket back at its bind bucket. */
    inet_csk(sk)->icsk_bind_hash = tb;
}

（4）查找

根据端口号，确定所在的哈希桶索引。

static inline int inet_bhashfn(struct net *net, const __16 lport, const int bhash_size)

{

    return (lport + net_hash_mix(net)) & (bhash_size - 1);

}

问题

笔者看的是3.6的版本，一开始对系统自动选择端口的逻辑感到不解。

2.6.18版本

2.6.18版本的系统自动选择端口：随机选取一个未使用端口，不允许端口复用。

由于snum为0时不进行端口复用，所以实现起来很简单。

3.2版本

查了下3.2版本，系统自动选择端口的思路是这样的：

1. 随机选取一个端口。

2. 检查其是否被使用了。

2.1 没有被使用，那么就是这个端口了，退出：）

2.2 被使用了，就检查它是否能够重用。

2.2.1 如果可以重用，则记下它的使用者个数small_size和端口号small_rover。

下次如果发现使用者更少的可重用端口，则更新small_size和small_rover。

2.2.2 检查此时已绑定的端口总数，是不是超过了自动绑定区间的端口数。

2.2.2.1 超过了，说明端口基本上都被使用了，不再继续寻找没被使用过的，直接重用此时的small_rover。

退出：）

2.2.2.2 没超过，那就继续寻找没被使用过的端口。如果又发现可重用的端口时，顺便更新下small_size

和small_rover。

3. 端口++，重复1和2。

简而言之，系统自动选择端口时：尽量选择没被使用过的端口。如果实在没有，就选择使用者个数最少的可重用端口。

3.6版本

在3.3版本中加入了commit 2b05ad33。

增加了几行代码：

if (! inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {

snum = rover;

goto tb_found;

}

作者的描述如下：

tcp: bind() fix autoselection to share ports

The current code checks for conflicts when the application requests a specific port.

If there is no conflict, then the request is granted.

On the other hand, the port autoselection done by the kernel fails when all ports are bound

even when there is a port whith no conflict available.

The fix changes port autoselection to check if there is a conflict and use it if not.

作者的意思是，在系统自动选择端口时，判断可重用端口的主要条件为：

tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN

而其实不符合此条件、但通过bind_conflict()检查的端口也是可以重用的，所以作者的出发点是正确的。

但是，简单的把上面那几行代码就这样加进去，系统自动选择端口的思路就变为：

1. 随机选取一个端口。

2. 检查其是否被使用了。

2.1 没有被使用，那么就是这个端口了，退出：）

2.2 被使用了，检查重用是否有冲突。

2.2.1 没有冲突，就重用这个端口，退出！

2.2.2 有冲突，继续遍历。

3. 端口++，重复1和2。

也就是说，系统自动选择端口时：不优先选择没被使用过的端口。只要没有冲突，直接重用端口。

这样一来，上面small_size和small_rover的代码就成了空摆设了，跟原意不一样了。我认为这个改动并不合适，

不知道当时是怎么通过的：）