socket创建
<h2>概述</h2>
<p>1、申请 socket 对象
2、根据 family 调用其协议对应的 create 函数。对于 AF_INET 而言对应的是 inet_create。详见:net/ipv4/af_inet.c:inet_init()
3、以 TCP 为例,分配 sock 对象,并初始化,包括很多 ops 的赋值
4、设置 sk->sk_data_ready = sock_def_readable,收包时会调用</p>
<h2>分析</h2>
<p>应用层创建 socket</p>
<pre><code class="language-c">int main()
{
/*
Prototype: int socket(int domain, int type, int protocol)
Creates a new socket.
First argument:
domain: the protocol family. Common families are AF_INET, AF_INET6 and AF_LOCAL. This argument determines the socket's address type: AF_INET is for IPv4 addresses, AF_INET6 is for IPv6 addresses, and AF_LOCAL is for communication between processes on the local machine.
Second argument:
type: the socket type. There are several; the two main ones are SOCK_STREAM (byte stream) and SOCK_DGRAM (datagram). Which one is used is chosen by this argument at creation time. A stream socket is based on TCP and is an ordered, reliable, full-duplex byte-stream channel. A datagram socket is based on UDP; no connection has to be established or maintained, and data may be lost or arrive out of order. This article focuses on stream sockets because that is what ceph uses.
Third argument:
protocol: the specific protocol. The original note lists IPPROTO_TCP, IPPROTO_UDP, IPPROTO_STCP and IPPROTO_TICP; the last two look like misspellings (presumably IPPROTO_SCTP and a TIPC constant -- TODO confirm). This argument is usually set to 0, which selects the socket's default protocol.
*/
int sk = socket(AF_INET, SOCK_STREAM, 0);
// ...
}</code></pre>
<p>内核对应处理的代码:</p>
<pre><code class="language-c">// file: net/socket.c
/*
 * socket() system call entry point.
 *
 * Splits the flag bits out of type, rejects unknown flags, creates the
 * socket through sock_create() and then maps it to a file descriptor
 * through sock_map_fd().
 *
 * Returns the value produced by sock_map_fd() on success, or a negative
 * errno: -EINVAL for flag bits other than SOCK_CLOEXEC / SOCK_NONBLOCK,
 * otherwise whatever sock_create() or sock_map_fd() reported.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;
struct socket *sock;
int flags;
/* Check the SOCK_* constants for consistency. */
BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_CLOEXEC &amp; SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_NONBLOCK &amp; SOCK_TYPE_MASK);
/* Everything outside SOCK_TYPE_MASK is treated as a flag. */
flags = type &amp; ~SOCK_TYPE_MASK;
if (flags &amp; ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
/* Keep only the socket type bits. */
type &amp;= SOCK_TYPE_MASK;
/* Where the two constants differ numerically, translate SOCK_NONBLOCK into O_NONBLOCK. */
if (SOCK_NONBLOCK != O_NONBLOCK &amp;&amp; (flags &amp; SOCK_NONBLOCK))
flags = (flags &amp; ~SOCK_NONBLOCK) | O_NONBLOCK;
retval = sock_create(family, type, protocol, &amp;sock); // continued in sock_create() below
if (retval &lt; 0)
goto out;
retval = sock_map_fd(sock, flags &amp; (O_CLOEXEC | O_NONBLOCK)); // map the socket to an fd
if (retval &lt; 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
/* Mapping to an fd failed: drop the socket created above. */
sock_release(sock);
return retval;
}
/*
 * Thin wrapper around __sock_create(): uses the network namespace of the
 * current task (current-&gt;nsproxy-&gt;net_ns) and passes 0 for kern.
 * Returns whatever __sock_create() returns; the new socket is stored in *res.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current-&gt;nsproxy-&gt;net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
/*
 * Allocate a struct socket and let the protocol family finish setting it up.
 *
 * net:      network namespace handed to the family's create function
 * family:   protocol family; must be in [0, NPROTO)
 * type:     socket type; must be in [0, SOCK_MAX)
 * protocol: forwarded to the security hooks and to pf-&gt;create
 * res:      on success receives the new socket
 * kern:     forwarded to the security hooks and to pf-&gt;create
 *           (sock_create() passes 0; presumably non-zero marks a
 *           kernel-internal socket -- TODO confirm)
 *
 * Returns 0 on success or a negative errno:
 *   -EAFNOSUPPORT  family out of range, no family registered, or a module
 *                  reference could not be taken
 *   -EINVAL        type out of range
 *   -ENFILE        sock_alloc() failed
 *   otherwise the error from the security hooks or from pf-&gt;create
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family &lt; 0 || family &gt;= NPROTO)
return -EAFNOSUPPORT;
if (type &lt; 0 || type &gt;= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET &amp;&amp; type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO &quot;%s uses obsolete (PF_INET,SOCK_PACKET)\n&quot;,
current-&gt;comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc(); // the socket object
if (!sock) {
net_warn_ratelimited(&quot;socket: no more sockets\n&quot;);
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock-&gt;type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module(&quot;net-pf-%d&quot;, family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); // fetch the protocol family's operations table; for AF_INET this is inet_family_ops, whose main member is .create = inet_create. See net/ipv4/af_inet.c:inet_init()
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the -&gt;create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf-&gt;owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf-&gt;create(net, sock, protocol, kern); // call the create function from the family's operations table
if (err &lt; 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock-&gt;ops-&gt;owner))
goto out_module_busy;
/*
* Now that we're done with the -&gt;create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf-&gt;owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
/* Error unwinding: each label falls through to the ones below it. */
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock-&gt;ops = NULL;
module_put(pf-&gt;owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
/* Reached while still inside the RCU read-side section: leave it first. */
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);</code></pre>
<p>对于 AF_INET 而言对应的是 <code>inet_family_ops</code>,对应的 create 是 <code>inet_create</code>:</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c
/* Protocol family operations table for PF_INET; its create hook is inet_create. */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
/*
 * One entry per (socket type, protocol) pair. Each entry names the
 * struct proto (.prot) and the proto_ops table (.ops) to use for that pair.
 * The trailing numeric comments give the value of the constant on that line.
 */
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM, // 1
.protocol = IPPROTO_TCP, // 6
.prot = &amp;tcp_prot, // includes tcp_v4_connect, tcp_recvmsg, tcp_sendmsg, etc.
.ops = &amp;inet_stream_ops, // includes inet_sendmsg, inet_recvmsg, etc.
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM, // 2
.protocol = IPPROTO_UDP, // 17
.prot = &amp;udp_prot,
.ops = &amp;inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM, // 2
.protocol = IPPROTO_ICMP, // 1
.prot = &amp;ping_prot,
.ops = &amp;inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW, // 3
.protocol = IPPROTO_IP, /* wild card */ // 0
.prot = &amp;raw_prot,
.ops = &amp;inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
/*
 * Create function of the PF_INET family (referenced by inet_family_ops.create).
 *
 * Looks up the inet_protosw entry in inetsw[sock-&gt;type] that matches
 * protocol, installs its ops on the struct socket, allocates a struct sock
 * using its prot, initializes the sock and inet_sock fields, and finally
 * calls sk-&gt;sk_prot-&gt;init if one is set.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL           protocol outside [0, IPPROTO_MAX)
 *   -ESOCKTNOSUPPORT  the list for this socket type had no entries
 *   -EPROTONOSUPPORT  entries exist but none matched the protocol
 *   -EPERM            SOCK_RAW requested with kern == 0 and without CAP_NET_RAW
 *   -ENOBUFS          sk_alloc() failed
 *   otherwise the error returned by sk-&gt;sk_prot-&gt;init
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
if (unlikely(!inet_ehash_secret))
if (sock-&gt;type != SOCK_RAW &amp;&amp; sock-&gt;type != SOCK_DGRAM)
build_ehash_secret();
if (protocol &lt; 0 || protocol &gt;= IPPROTO_MAX)
return -EINVAL;
sock-&gt;state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
list_for_each_entry_rcu(answer, &amp;inetsw[sock-&gt;type], list) { // for TCP, sock-&gt;type = SOCK_STREAM. af_inet.c:inet_init() registers the entries of inetsw_array into inetsw.
err = 0;
/* Check the non-wild match. */
if (protocol == answer-&gt;protocol) { // same protocol
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer-&gt;protocol;
break;
}
if (IPPROTO_IP == answer-&gt;protocol)
break;
}
err = -EPROTONOSUPPORT;
}
/* No match: try loading a module (at most two attempts), then search again. */
if (unlikely(err)) {
if (try_loading_module &lt; 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module(&quot;net-pf-%d-proto-%d-type-%d&quot;,
PF_INET, protocol, sock-&gt;type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module(&quot;net-pf-%d-proto-%d&quot;,
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (sock-&gt;type == SOCK_RAW &amp;&amp; !kern &amp;&amp;
!ns_capable(net-&gt;user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock-&gt;ops = answer-&gt;ops; // for TCP this is inet_stream_ops, shown below. Note: this is set on the socket object
answer_prot = answer-&gt;prot; // for TCP this is tcp_prot, shown below
answer_no_check = answer-&gt;no_check;
answer_flags = answer-&gt;flags;
rcu_read_unlock();
WARN_ON(answer_prot-&gt;slab == NULL);
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); // allocate the sock object and assign tcp_prot to sk-&gt;sk_prot. Note: this is the sock object
if (sk == NULL)
goto out;
err = 0;
sk-&gt;sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE &amp; answer_flags)
sk-&gt;sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet-&gt;is_icsk = (INET_PROTOSW_ICSK &amp; answer_flags) != 0;
inet-&gt;nodefrag = 0;
if (SOCK_RAW == sock-&gt;type) {
inet-&gt;inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet-&gt;hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet-&gt;pmtudisc = IP_PMTUDISC_DONT;
else
inet-&gt;pmtudisc = IP_PMTUDISC_WANT;
inet-&gt;inet_id = 0;
sock_init_data(sock, sk); // a lot of initialization, see sock_init_data() in net/core/sock.c
/* Set after sock_init_data(), which assigns sock_def_destruct to sk_destruct. */
sk-&gt;sk_destruct = inet_sock_destruct;
sk-&gt;sk_protocol = protocol;
sk-&gt;sk_backlog_rcv = sk-&gt;sk_prot-&gt;backlog_rcv;
inet-&gt;uc_ttl = -1;
inet-&gt;mc_loop = 1;
inet-&gt;mc_ttl = 1;
inet-&gt;mc_all = 1;
inet-&gt;mc_index = 0;
inet-&gt;mc_list = NULL;
inet-&gt;rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet-&gt;inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet-&gt;inet_sport = htons(inet-&gt;inet_num);
/* Add to protocol hash chains. */
sk-&gt;sk_prot-&gt;hash(sk);
}
if (sk-&gt;sk_prot-&gt;init) { // note: for TCP sk-&gt;sk_prot = tcp_prot, so this calls tcp_prot.init, i.e. tcp_v4_init_sock
err = sk-&gt;sk_prot-&gt;init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
/*
 * proto_ops table referenced by the SOCK_STREAM / IPPROTO_TCP entry of
 * inetsw_array; inet_create() stores it in sock-&gt;ops.
 */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll, // used in the epoll scenario
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
/*
 * struct proto referenced by the SOCK_STREAM / IPPROTO_TCP entry of
 * inetsw_array; inet_create() passes it to sk_alloc() and later calls its
 * init hook through sk-&gt;sk_prot.
 */
struct proto tcp_prot = {
.name = &quot;TCP&quot;,
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock, // this function initializes icsk-&gt;icsk_af_ops (its hooks are called when sending packets)
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &amp;tcp_sockets_allocated,
.orphan_count = &amp;tcp_orphan_count,
.memory_allocated = &amp;tcp_memory_allocated,
.memory_pressure = &amp;tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &amp;tcp_timewait_sock_ops,
.rsk_prot = &amp;tcp_request_sock_ops,
.h.hashinfo = &amp;tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);</code></pre>
<p>继续:</p>
<pre><code class="language-c">// file: net/core/sock.c
/*
 * Fill in the generic fields of a struct sock and, when sock is non-NULL,
 * link the struct socket and the struct sock to each other.
 *
 * Sets up the receive/write/error queues, the timer, the buffer sizes from
 * sysctl_rmem_default / sysctl_wmem_default, the initial state TCP_CLOSE,
 * the default callbacks (sock_def_*), the timeouts, and finally sets the
 * reference count to 1.
 *
 * sock: owning socket, may be NULL (then sk-&gt;sk_wq is set to NULL)
 * sk:   the sock to initialize
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
skb_queue_head_init(&amp;sk-&gt;sk_receive_queue);
skb_queue_head_init(&amp;sk-&gt;sk_write_queue);
skb_queue_head_init(&amp;sk-&gt;sk_error_queue);
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&amp;sk-&gt;sk_async_wait_queue);
#endif
sk-&gt;sk_send_head = NULL;
init_timer(&amp;sk-&gt;sk_timer);
sk-&gt;sk_allocation = GFP_KERNEL;
sk-&gt;sk_rcvbuf = sysctl_rmem_default;
sk-&gt;sk_sndbuf = sysctl_wmem_default;
sk-&gt;sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
/* Cross-link socket and sock when a socket was supplied. */
if (sock) {
sk-&gt;sk_type = sock-&gt;type;
sk-&gt;sk_wq = sock-&gt;wq;
sock-&gt;sk = sk;
} else
sk-&gt;sk_wq = NULL;
spin_lock_init(&amp;sk-&gt;sk_dst_lock);
rwlock_init(&amp;sk-&gt;sk_callback_lock);
lockdep_set_class_and_name(&amp;sk-&gt;sk_callback_lock,
af_callback_keys + sk-&gt;sk_family,
af_family_clock_key_strings[sk-&gt;sk_family]);
/* Default callbacks. inet_create() overrides sk_destruct after this call. */
sk-&gt;sk_state_change = sock_def_wakeup;
sk-&gt;sk_data_ready = sock_def_readable; // called when the kernel receives data
sk-&gt;sk_write_space = sock_def_write_space;
sk-&gt;sk_error_report = sock_def_error_report;
sk-&gt;sk_destruct = sock_def_destruct;
sk-&gt;sk_frag.page = NULL;
sk-&gt;sk_frag.offset = 0;
sk-&gt;sk_peek_off = -1;
sk-&gt;sk_peer_pid = NULL;
sk-&gt;sk_peer_cred = NULL;
sk-&gt;sk_write_pending = 0;
sk-&gt;sk_rcvlowat = 1;
sk-&gt;sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk-&gt;sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk-&gt;sk_stamp = ktime_set(-1L, 0);
sk-&gt;sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
atomic_set(&amp;sk-&gt;sk_refcnt, 1);
atomic_set(&amp;sk-&gt;sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
</code></pre>