http://blog.chinaunix.net/uid-20788636-id-4408261.html

Preface:

This series of articles on the Linux kernel socket code is based on the Linux-3.14.5 kernel. The comments and explanations in the text also draw on classic analysis articles found online; my thanks to their authors for their contributions!
     Please credit the source when reposting: http://blog.chinaunix.net/uid-20788636-id-4408261.html

A socket is created from user space by calling the socket() system call; on success it returns a file descriptor (fd). The corresponding kernel entry point is SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) in net/socket.c. Let us walk through the kernel implementation.
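For orientation, a minimal user-space program that ends up in this system call could look like the sketch below (purely illustrative; SOCK_NONBLOCK or SOCK_CLOEXEC may additionally be OR-ed into the type argument, which is exactly what the flag handling at the top of the syscall deals with):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>

    int main(void)
    {
        /* family = AF_INET, type = SOCK_STREAM, protocol = 0 (let the kernel pick TCP) */
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0) {
            perror("socket");
            return 1;
        }
        /* fd is the descriptor that sock_map_fd() installs at the end of the syscall */
        close(fd);
        return 0;
    }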

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency. The checks below validate the flag bits. */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    /* create the socket itself -- see the analysis of sock_create() below */
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;

    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;

out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;

out_release:
    sock_release(sock);
    return retval;
}
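The descriptor handed back to user space comes from sock_map_fd(), which the original post does not reproduce. As a rough sketch of the 3.14-era helper (treat the exact body as an approximation): it reserves an unused fd, wraps the socket in a struct file backed by sockfs, and installs that file into the process's descriptor table:

    static int sock_map_fd(struct socket *sock, int flags)
    {
        struct file *newfile;
        int fd = get_unused_fd_flags(flags);    /* reserve a descriptor number */
        if (unlikely(fd < 0))
            return fd;

        newfile = sock_alloc_file(sock, flags, NULL);   /* attach the socket to a struct file */
        if (likely(!IS_ERR(newfile))) {
            fd_install(fd, newfile);            /* publish fd -> file in the fd table */
            return fd;
        }

        put_unused_fd(fd);
        return PTR_ERR(newfile);
    }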

1.1  The sock_create function

sock_create(family, type, protocol, &sock) is a thin wrapper; all it does is call

    __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);

i.e. it passes along the current process's network namespace and kern = 0.
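Sketched from the 3.14-era net/socket.c, the wrapper in full:

    int sock_create(int family, int type, int protocol, struct socket **res)
    {
        /* use the caller's network namespace; kern = 0 marks a user-space socket */
        return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
    }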

__sock_create() itself is defined as follows:

int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     * Check protocol is in range. family is an address family such as
     * AF_INET or AF_INET6; the kernel currently defines:
     *   #define NPROTO  AF_MAX
     *   #define AF_MAX  41      // For now..
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX)   /* type is the socket type, e.g. SOCK_STREAM */
        return -EINVAL;

    /* Compatibility.

       This uglymoron is moved from INET layer to here to avoid
       deadlock in module load.
     */
    if (family == PF_INET && type == SOCK_PACKET) {     /* legacy combination: remap the family */
        static int warned;      /* static, so implicitly initialised to 0 */
        if (!warned) {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                   current->comm);
        }
        family = PF_PACKET;     /* rewrite the family to PF_PACKET */
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     *  Allocate the socket and allow the family to set things up. if
     *  the protocol is 0, the family is instructed to select an appropriate
     *  default. sock_alloc() does the allocation -- see the analysis below.
     */
    sock = sock_alloc();
    if (!sock) {
        net_warn_ratelimited("socket: no more sockets\n");
        return -ENFILE;         /* Not exactly a match, but its the
                                   closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (rcu_access_pointer(net_families[family]) == NULL)
        request_module("net-pf-%d", family);
#endif

    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    /* The entry registered for PF_INET is:
     *    static const struct net_proto_family inet_family_ops = {
     *        .family = PF_INET,
     *        .create = inet_create,
     *        .owner  = THIS_MODULE,
     *    };
     * so ->create() dispatches per registered family; for AF_INET this calls inet_create(). */
    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);

    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;

    *res = sock;
    return 0;

out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    rcu_read_unlock();
    goto out_sock_release;
}
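Where does net_families[PF_INET] come from? During boot, inet_init() calls sock_register(&inet_family_ops). Sketched from the 3.14-era net/socket.c (details slightly abbreviated), sock_register() simply publishes the ops pointer into the net_families[] array under RCU:

    int sock_register(const struct net_proto_family *ops)
    {
        int err;

        if (ops->family >= NPROTO)
            return -ENOBUFS;

        spin_lock(&net_family_lock);
        if (rcu_dereference_protected(net_families[ops->family],
                                      lockdep_is_held(&net_family_lock)))
            err = -EEXIST;              /* family already registered */
        else {
            rcu_assign_pointer(net_families[ops->family], ops);
            err = 0;
        }
        spin_unlock(&net_family_lock);

        pr_info("NET: Registered protocol family %d\n", ops->family);
        return err;
    }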

1.1.1   The sock_alloc function

sock_alloc() allocates a struct socket. The allocation goes through the inode layer: what is actually carved out is a struct socket_alloc (a socket and an inode packed together), and the pointer returned is the address of the embedded socket.

static struct socket *sock_alloc(void)
{
    struct inode *inode;
    struct socket *sock;

    /* new_inode_pseudo() asks the sockfs superblock for a new inode. What
     * actually gets allocated is a struct socket_alloc, and the address of
     * its embedded inode is returned:
     *    struct socket_alloc {
     *        struct socket socket;
     *        struct inode  vfs_inode;
     *    };
     */
    inode = new_inode_pseudo(sock_mnt->mnt_sb);   /* where sock_mnt is initialised is covered in (1) below */
    if (!inode)
        return NULL;

    sock = SOCKET_I(inode);   /* recover the enclosing socket_alloc's socket from the inode */
    kmemcheck_annotate_bitfield(sock, type);

    /* initialise the inode fields */
    inode->i_ino = get_next_ino();
    inode->i_mode = S_IFSOCK | S_IRWXUGO;
    inode->i_uid = current_fsuid();   /* owner uid; compared later, e.g. by the bind() path */
    inode->i_gid = current_fsgid();   /* owner gid */
    inode->i_op = &sockfs_inode_ops;

    this_cpu_add(sockets_in_use, 1);
    return sock;
}
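SOCKET_I() is what turns the returned inode back into its enclosing socket: it is just a container_of() over struct socket_alloc. Sketched from the 3.14-era headers (the inverse helper SOCK_INODE() works the same way):

    static inline struct socket *SOCKET_I(struct inode *inode)
    {
        /* walk from the embedded vfs_inode back to the enclosing socket_alloc */
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
    }

    static inline struct inode *SOCK_INODE(struct socket *socket)
    {
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
    }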

(1) How sock_mnt->mnt_sb is allocated and assigned:

The sockfs file-system type is registered (and mounted) in sock_init():

static struct file_system_type sock_fs_type = {
    .name    = "sockfs",
    .mount   = sockfs_mount,
    .kill_sb = kill_anon_super,
};

static int __init sock_init(void)
{
    int err;

    /*
     *  Initialize the network sysctl infrastructure.
     */
    err = net_sysctl_init();
    if (err)
        goto out;

    /*
     *  Initialize skbuff SLAB cache
     */
    skb_init();

    /*
     *  Initialize the protocols module.
     */
    init_inodecache();

    /* register the sockfs file-system type */
    err = register_filesystem(&sock_fs_type);
    if (err)
        goto out_fs;

    /* mount it; sock_mnt is the resulting vfsmount */
    sock_mnt = kern_mount(&sock_fs_type);
    if (IS_ERR(sock_mnt)) {
        err = PTR_ERR(sock_mnt);
        goto out_mount;
    }

    /* The real protocol initialization is performed in later initcalls.
     */

#ifdef CONFIG_NETFILTER
    err = netfilter_init();
    if (err)
        goto out;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
    skb_timestamping_init();
#endif

out:
    return err;

out_mount:
    unregister_filesystem(&sock_fs_type);
out_fs:
    goto out;
}
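kern_mount() invokes the .mount callback, sockfs_mount(), which builds an anonymous pseudo-filesystem superblock and wires in sockfs_ops, the super_operations whose .alloc_inode is sock_alloc_inode() (used further below). A sketch of the 3.14-era callback:

    static struct dentry *sockfs_mount(struct file_system_type *fs_type,
                                       int flags, const char *dev_name, void *data)
    {
        /* the new superblock's s_op becomes &sockfs_ops, which is how
         * sb->s_op->alloc_inode later resolves to sock_alloc_inode() */
        return mount_pseudo(fs_type, "socket:", &sockfs_ops,
                            &sockfs_dentry_operations, SOCKFS_MAGIC);
    }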

(2) new_inode_pseudo() creates an inode, initialising its i_state field and its i_sb_list list head; the actual allocation is done by alloc_inode():

struct inode *new_inode_pseudo(struct super_block *sb)
{
    struct inode *inode = alloc_inode(sb);

    if (inode) {
        spin_lock(&inode->i_lock);
        inode->i_state = 0;
        spin_unlock(&inode->i_lock);
        INIT_LIST_HEAD(&inode->i_sb_list);
    }
    return inode;
}

alloc_inode() allocates the inode:

static struct inode *alloc_inode(struct super_block *sb)
{
    struct inode *inode;

    /* If the filesystem's superblock provides its own alloc_inode hook, use
     * it; otherwise fall back to the generic inode cache. For sockets the
     * hook is sock_alloc_inode() in net/socket.c:
     *    static const struct super_operations sockfs_ops = {
     *        .alloc_inode   = sock_alloc_inode,
     *        .destroy_inode = sock_destroy_inode,
     *        .statfs        = simple_statfs,
     *    };
     */
    if (sb->s_op->alloc_inode)
        inode = sb->s_op->alloc_inode(sb);
    else
        inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

    if (!inode)
        return NULL;

    /* common inode initialisation */
    if (unlikely(inode_init_always(sb, inode))) {
        if (inode->i_sb->s_op->destroy_inode)
            inode->i_sb->s_op->destroy_inode(inode);
        else
            kmem_cache_free(inode_cachep, inode);
        return NULL;
    }

    return inode;
}

(3) The sock_alloc_inode function, in net/socket.c:

static struct inode *sock_alloc_inode(struct super_block *sb)
{
    struct socket_alloc *ei;
    struct socket_wq *wq;

    /* allocate a whole struct socket_alloc from the dedicated slab cache;
     * how that cache came to be socket_alloc-sized is explained below */
    ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
    if (!ei)
        return NULL;
    wq = kmalloc(sizeof(*wq), GFP_KERNEL);
    if (!wq) {
        kmem_cache_free(sock_inode_cachep, ei);
        return NULL;
    }
    init_waitqueue_head(&wq->wait);
    wq->fasync_list = NULL;
    RCU_INIT_POINTER(ei->socket.wq, wq);

    ei->socket.state = SS_UNCONNECTED;
    ei->socket.flags = 0;
    ei->socket.ops = NULL;
    ei->socket.sk = NULL;
    ei->socket.file = NULL;

    return &ei->vfs_inode;   /* return the embedded struct inode vfs_inode */
}

Note: sock_alloc_inode() calls ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL), and the object it gets back has the size of a struct socket_alloc. Where does that size come from? init_inodecache() in net/socket.c (called from sock_init() above) creates the slab cache with object size sizeof(struct socket_alloc); sock_alloc_inode() then hands back the embedded struct inode vfs_inode of each object.

static int init_inodecache(void)
{
    sock_inode_cachep = kmem_cache_create("sock_inode_cache",
                                          sizeof(struct socket_alloc),
                                          0,
                                          (SLAB_HWCACHE_ALIGN |
                                           SLAB_RECLAIM_ACCOUNT |
                                           SLAB_MEM_SPREAD),
                                          init_once);
    if (sock_inode_cachep == NULL)
        return -ENOMEM;
    return 0;
}
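The init_once constructor passed to kmem_cache_create() is tiny; as a sketch of the 3.14-era net/socket.c, it just runs the generic inode constructor on the embedded vfs_inode of each slab object:

    static void init_once(void *foo)
    {
        struct socket_alloc *ei = (struct socket_alloc *)foo;

        inode_init_once(&ei->vfs_inode);
    }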

1.1.2   The inet_create function

In __sock_create(), pf->create() resolves to inet_create() for AF_INET sockets; the function lives in net/ipv4/af_inet.c.

static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    /* walk inetsw[type] and find the matching socket interface (inet_protosw) */
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    /* nothing matched: try to load the protocol module and retry */
    if (unlikely(err)) {
        if (try_loading_module < 2) {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                               PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                               PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern &&
        !ns_capable(net->user_ns, CAP_NET_RAW))
        goto out_rcu_unlock;

    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    /* sk_alloc() nominally returns a struct sock, but for TCP the object it
     * allocates is actually sizeof(struct tcp_sock); that is why casts such
     * as inet_sk(sk) are legal. How the size is chosen is discussed below. */
    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = SK_CAN_REUSE;

    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (net->ipv4.sysctl_ip_no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    /* initialise the generic fields of sk -- see (1) below */
    sock_init_data(sock, sk);

    sk->sk_destruct    = inet_sock_destruct;
    sk->sk_protocol    = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl   = -1;
    inet->mc_loop  = 1;
    inet->mc_ttl   = 1;
    inet->mc_all   = 1;
    inet->mc_index = 0;
    inet->mc_list  = NULL;
    inet->rcv_tos  = 0;

    sk_refcnt_debug_inc(sk);

    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);   /* for TCP this is tcp_v4_init_sock() -- see (2) below */
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}
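The inetsw[] lists that the lookup loop walks are populated by inet_init() from a static inetsw_array[]. For orientation, the TCP entry looks roughly like the sketch below (fields trimmed); note that tcp_prot.obj_size is sizeof(struct tcp_sock), which is exactly why the sk_alloc() call above ends up allocating a tcp_sock-sized object:

    static struct inet_protosw inetsw_array[] =
    {
        {
            .type     = SOCK_STREAM,
            .protocol = IPPROTO_TCP,
            .prot     = &tcp_prot,
            .ops      = &inet_stream_ops,
            .no_check = 0,
            .flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
        },
        /* ... UDP, ICMP (ping) and raw entries follow ... */
    };

    struct proto tcp_prot = {
        .name     = "TCP",
        .init     = tcp_v4_init_sock,            /* invoked as sk->sk_prot->init(sk) above */
        .obj_size = sizeof(struct tcp_sock),     /* size used by sk_alloc() */
        /* ... many more callbacks ... */
    };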

(1) Analysis of the sock_init_data function

void sock_init_data(struct socket *sock, struct sock *sk)
{
    skb_queue_head_init(&sk->sk_receive_queue);
    skb_queue_head_init(&sk->sk_write_queue);
    skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
    skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

    sk->sk_send_head = NULL;

    /* initialise the sk timer */
    init_timer(&sk->sk_timer);

    sk->sk_allocation = GFP_KERNEL;
    sk->sk_rcvbuf     = sysctl_rmem_default;
    sk->sk_sndbuf     = sysctl_wmem_default;
    sk->sk_state      = TCP_CLOSE;   /* start in TCP_CLOSE; later system calls check this state */
    sk_set_socket(sk, sock);         /* sk->sk_socket = sock: link sk back to the socket */

    sock_set_flag(sk, SOCK_ZAPPED);  /* mark the socket as not yet bound; the flag is mainly consulted by a few address families */

    if (sock) {
        sk->sk_type = sock->type;
        sk->sk_wq   = sock->wq;
        sock->sk    = sk;            /* struct socket's sk now points at this sock */
    } else
        sk->sk_wq   = NULL;

    spin_lock_init(&sk->sk_dst_lock);
    rwlock_init(&sk->sk_callback_lock);
    lockdep_set_class_and_name(&sk->sk_callback_lock,
            af_callback_keys + sk->sk_family,
            af_family_clock_key_strings[sk->sk_family]);

    sk->sk_state_change = sock_def_wakeup;
    sk->sk_data_ready   = sock_def_readable;
    sk->sk_write_space  = sock_def_write_space;
    sk->sk_error_report = sock_def_error_report;
    sk->sk_destruct     = sock_def_destruct;

    sk->sk_frag.page   = NULL;
    sk->sk_frag.offset = 0;
    sk->sk_peek_off    = -1;

    sk->sk_peer_pid       = NULL;
    sk->sk_peer_cred      = NULL;
    sk->sk_write_pending  = 0;
    sk->sk_rcvlowat       = 1;
    sk->sk_rcvtimeo       = MAX_SCHEDULE_TIMEOUT;
    sk->sk_sndtimeo       = MAX_SCHEDULE_TIMEOUT;

    sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
    sk->sk_napi_id = 0;
    sk->sk_ll_usec = sysctl_net_busy_read;
#endif

    sk->sk_max_pacing_rate = ~0U;
    sk->sk_pacing_rate = ~0U;
    /*
     * Before updating sk_refcnt, we must commit prior changes to memory
     * (Documentation/RCU/rculist_nulls.txt for details)
     */
    smp_wmb();
    atomic_set(&sk->sk_refcnt, 1);   /* start the reference count of sk at 1 */
    atomic_set(&sk->sk_drops, 0);
}

(2) The tcp_v4_init_sock function

static int tcp_v4_init_sock(struct sock *sk)
{
    /* as discussed above, the underlying object is tcp_sock-sized, so this is just a cast */
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_init_sock(sk);   /* initialise the TCP-specific fields */

    icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
    tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

    return 0;
}
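For completeness, the accessor helpers really are plain casts, which only works because struct tcp_sock starts with a struct inet_connection_sock, which in turn starts with a struct inet_sock and then a struct sock. Sketched from the 3.14-era headers:

    static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
    {
        /* legal because sk actually points at a larger, tcp_sock-sized object */
        return (struct inet_connection_sock *)sk;
    }

    static inline struct tcp_sock *tcp_sk(const struct sock *sk)
    {
        return (struct tcp_sock *)sk;
    }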
