listen监听
<h2>概述</h2>
<p>1、listen 主要的工作就是申请和初始化接收队列,包括全连接队列和半连接队列。其中半连接队列是一个哈希表,叫队列还不够准确。另外,会将 socket->sk 加入到 tcp_hashinfo->listening_hash,以便 tcp_v4_rcv 查询;</p>
<p>2、当 SYN 到达时,会从 tcp_hashinfo->listening_hash 查找到 listen 的 sk,然后检查队列长度。检查通过后,则创建 request_sock 对象进行初始化,然后则发送 SYN+ACK,并将 request_sock 加入到半连接队列中。注意:这里并没有创建新的 sk 对象,所以接下来收 ACK 包时,查找到的还是 TCP_LISTEN 状态的公共的 sk;</p>
<p>3、如继续收到 ACK 包,则创建子 sk,将 request_sock 从半连接队列中删除,并加入到全连接队列中(并将子 sk 设置到 request_sock->sk);设置连接状态为 ESTABLISHED。</p>
<h2>分析 listen</h2>
<p>通常的用法:</p>
<pre><code class="language-c">int main()
{
int fd = socket(AF_INET, SOCK_STREAM, 0);
bind(fd, ...);
listen(fd, 128);
accept(fd, ...);
}</code></pre>
<p>创建 socket 的系统调用见单独的分析,listen 系统调用分析:</p>
<pre><code class="language-c">// file: net/socket.c
/*
* Perform a listen. Basically, we allow the protocol to do anything
* necessary for a listen, and if that works, we mark the socket as
* ready for listening.
*/
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
struct socket *sock;
int err, fput_needed;
int somaxconn;
sock = sockfd_lookup_light(fd, &amp;err, &amp;fput_needed); // 根据 fd 查找 socket 对象
if (sock) {
somaxconn = sock_net(sock-&gt;sk)-&gt;core.sysctl_somaxconn;
if ((unsigned int)backlog &gt; somaxconn)
backlog = somaxconn; // 全连接队列最大长度为 net.core.somaxconn
err = security_socket_listen(sock, backlog);
if (!err)
err = sock-&gt;ops-&gt;listen(sock, backlog); // socket 上的 opt 是 inet_listen
fput_light(sock-&gt;file, fput_needed);
}
return err;
}
// file: net/ipv4/af_inet.c
/*
* Move a socket into listening state.
*/
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock-&gt;sk;
unsigned char old_state;
int err;
lock_sock(sk);
err = -EINVAL;
if (sock-&gt;state != SS_UNCONNECTED || sock-&gt;type != SOCK_STREAM)
goto out;
old_state = sk-&gt;sk_state;
if (!((1 &lt;&lt; old_state) &amp; (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
/* Check special setups for testing purpose to enable TFO w/o
* requiring TCP_FASTOPEN sockopt.
* Note that only TCP sockets (SOCK_STREAM) will reach here.
* Also fastopenq may already been allocated because this
* socket was in TCP_LISTEN state previously but was
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen &amp; TFO_SERVER_ENABLE) != 0 &amp;&amp;
inet_csk(sk)-&gt;icsk_accept_queue.fastopenq == NULL) {
if ((sysctl_tcp_fastopen &amp; TFO_SERVER_WO_SOCKOPT1) != 0)
err = fastopen_init_queue(sk, backlog);
else if ((sysctl_tcp_fastopen &amp;
TFO_SERVER_WO_SOCKOPT2) != 0)
err = fastopen_init_queue(sk,
((uint)sysctl_tcp_fastopen) &gt;&gt; 16);
else
err = 0;
if (err)
goto out;
}
err = inet_csk_listen_start(sk, backlog); // 继续
if (err)
goto out;
}
sk-&gt;sk_max_ack_backlog = backlog; // 全连接队列大小
err = 0;
out:
release_sock(sk);
return err;
}
EXPORT_SYMBOL(inet_listen);
// file: net/ipv4/inet_connection_sock.c
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk); // 类似 libuv 的层次关系,可以直接转换,见下图
struct inet_connection_sock *icsk = inet_csk(sk); // 同上
int rc = reqsk_queue_alloc(&amp;icsk-&gt;icsk_accept_queue, nr_table_entries); // icsk_accept_queue 是接收队列,这里是初始化接收队列
if (rc != 0)
return rc;
sk-&gt;sk_max_ack_backlog = 0;
sk-&gt;sk_ack_backlog = 0;
inet_csk_delack_init(sk);
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
sk-&gt;sk_state = TCP_LISTEN; // listen 的 sk 状态
if (!sk-&gt;sk_prot-&gt;get_port(sk, inet-&gt;inet_num)) { // 即 inet_csk_get_port(),它在 bind 时也会调用。需要注意:它会重新走请求端口的逻辑,并可能修改 tb-&gt;fastreuse 等信息。问:不知道传入 0 端口会怎么样?答:实测可以跳过 bind 直接 listen,只是会监听一个随机端口
inet-&gt;inet_sport = htons(inet-&gt;inet_num);
sk_dst_reset(sk);
sk-&gt;sk_prot-&gt;hash(sk); // 即是 inet_hash,会将 sk 加入到 tcp_hashinfo-&gt;listening_hash 中。后面当 SYN 包到来的时候,会在此 hash 表中查找
return 0;
}
sk-&gt;sk_state = TCP_CLOSE;
__reqsk_queue_destroy(&amp;icsk-&gt;icsk_accept_queue);
return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
</code></pre>
<p>几个 sock 之间的关系:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=8d369e7a9187abd9c231985126753f67&amp;file=file.png" alt="" /></p>
<p>接收队列结构:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=8824a79bff71a27161c515afee1324f6&amp;file=file.png" alt="" /></p>
<pre><code class="language-c">// file: net/core/request_sock.c
int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock); // 半连接队列
struct listen_sock *lopt;
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); // nr 等于 min(backlog, somaxconn, tcp_max_syn_backlog) + 1 并向上取整为 2^n,但限制最小为 16(即8+1向上取整)
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size &gt; PAGE_SIZE)
lopt = vzalloc(lopt_size);
else
lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
return -ENOMEM;
for (lopt-&gt;max_qlen_log = 3;
(1 &lt;&lt; lopt-&gt;max_qlen_log) &lt; nr_table_entries;
lopt-&gt;max_qlen_log++); // 将 max_qlen_log 设置为 nr_table_entries 的幂次数,即 2^max_qlen_log == nr_table_entries。容量取 2 的幂,这样哈希取模可以用位运算(按位与掩码)代替除法,从而提升性能。
get_random_bytes(&amp;lopt-&gt;hash_rnd, sizeof(lopt-&gt;hash_rnd));
rwlock_init(&amp;queue-&gt;syn_wait_lock);
queue-&gt;rskq_accept_head = NULL; // 全连接队列队头初始化
lopt-&gt;nr_table_entries = nr_table_entries; // 半连接队列长度,注意和 max_qlen_log 的关系
write_lock_bh(&amp;queue-&gt;syn_wait_lock);
queue-&gt;listen_opt = lopt; // 即是 icsk_accept_queue-&gt;listen_opt = lopt
write_unlock_bh(&amp;queue-&gt;syn_wait_lock);
return 0;
}</code></pre>
<h2>接收 SYN 包</h2>
<p>当 SYN 包到达设备,在 TCP 层进入 <code>tcp_v4_rcv()</code>,查找对应的 sock 对象(具体而言,是在 tcp_hashinfo->listening_hash 中查找到 sk),然后继续进入 <code>tcp_v4_do_rcv()</code>。</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
if (sk-&gt;sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk-&gt;sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
if (dst) {
if (inet_sk(sk)-&gt;rx_dst_ifindex != skb-&gt;skb_iif ||
dst-&gt;ops-&gt;check(dst, 0) == NULL) {
dst_release(dst);
sk-&gt;sk_rx_dst = NULL;
}
}
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb-&gt;len)) {
rsk = sk;
goto reset;
}
return 0;
}
if (skb-&gt;len &lt; tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
if (sk-&gt;sk_state == TCP_LISTEN) { // 监听状态
struct sock *nsk = tcp_v4_hnd_req(sk, skb); // 检查半连接队列和 ehash(已建立连接哈希表)
if (!nsk)
goto discard;
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb-&gt;len)) { // 继续
rsk = sk;
goto reset;
}
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &amp;prev, th-&gt;source,
iph-&gt;saddr, iph-&gt;daddr); // 查找半连接队列
if (req)
return tcp_check_req(sk, skb, req, prev, false);
nsk = inet_lookup_established(sock_net(sk), &amp;tcp_hashinfo, iph-&gt;saddr,
th-&gt;source, iph-&gt;daddr, th-&gt;dest, inet_iif(skb)); // 在 tcp_hashinfo-&gt;ehash 中搜索,包括普通的和 TW 状态的
if (nsk) {
if (nsk-&gt;sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th-&gt;syn)
sk = cookie_v4_check(sk, skb, &amp;(IPCB(skb)-&gt;opt));
#endif
return sk;
}
</code></pre>
<p>半连接队列结构如下图:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=c5abf4c88e9495f58360c46c8ed623b4&amp;file=file.png" alt="" /></p>
<pre><code class="language-c">// file: net/ipv4/inet_connection_sock.c
struct request_sock *inet_csk_search_req(const struct sock *sk,
struct request_sock ***prevp,
const __be16 rport, const __be32 raddr,
const __be32 laddr)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk-&gt;icsk_accept_queue.listen_opt; // 半连接队列
struct request_sock *req, **prev;
for (prev = &amp;lopt-&gt;syn_table[inet_synq_hash(raddr, rport, lopt-&gt;hash_rnd,
lopt-&gt;nr_table_entries)]; // 遍历哈希槽
(req = *prev) != NULL;
prev = &amp;req-&gt;dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq-&gt;rmt_port == rport &amp;&amp;
ireq-&gt;rmt_addr == raddr &amp;&amp;
ireq-&gt;loc_addr == laddr &amp;&amp;
AF_INET_FAMILY(req-&gt;rsk_ops-&gt;family)) {
WARN_ON(req-&gt;sk);
*prevp = prev;
break;
}
}
return req;
}
EXPORT_SYMBOL_GPL(inet_csk_search_req);
</code></pre>
<p>继续:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_input.c
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock *req;
int queued = 0;
tp-&gt;rx_opt.saw_tstamp = 0;
switch (sk-&gt;sk_state) {
case TCP_LISTEN:
if (th-&gt;ack)
return 1;
if (th-&gt;rst)
goto discard;
if (th-&gt;syn) {
if (th-&gt;fin)
goto discard;
if (icsk-&gt;icsk_af_ops-&gt;conn_request(sk, skb) &lt; 0) // 即 tcp_v4_conn_request。TODO:初始化的地方未追溯
return 1;
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
goto discard;
// ... 其它状态省略
}
/* tcp_data could move socket to TIME-WAIT */
if (sk-&gt;sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
if (!queued) {
discard:
__kfree_skb(skb);
}
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
</code></pre>
<p>conn_request 实际是 tcp_v4_conn_request,这是一个非常重要的函数。TODO:未追溯初始化的地方。</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_options_received tmp_opt;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = NULL;
__be32 saddr = ip_hdr(skb)-&gt;saddr;
__be32 daddr = ip_hdr(skb)-&gt;daddr;
__u32 isn = TCP_SKB_CB(skb)-&gt;when;
bool want_cookie = false;
struct flowi4 fl4;
struct tcp_fastopen_cookie foc = { .len = -1 };
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sk_buff *skb_synack;
int do_fastopen;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)-&gt;rt_flags &amp; (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
if (inet_csk_reqsk_queue_is_full(sk) &amp;&amp; !isn) { // 半连接队列是否满了
want_cookie = tcp_syn_flood_action(sk, skb, &quot;TCP&quot;);
if (!want_cookie)
goto drop; // 如果未开启 tcp_syncookies 参数,则直接丢弃
}
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
if (sk_acceptq_is_full(sk) &amp;&amp; inet_csk_reqsk_queue_young(sk) &gt; 1) { // 全连接队列是否满了。摘自网络:young_ack是半连接队列里保存着的一个计数器,记录的是刚有SYN到达,没有被SYN_ACK重传定时器重传过SYN_ACK,同时也没有完成过三次握手的sock数量。
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
req = inet_reqsk_alloc(&amp;tcp_request_sock_ops); // 新建 request_sock 对象。因为当前的 sk 是公共的,必须要有新的对象来记录状态
if (!req)
goto drop;
#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)-&gt;af_specific = &amp;tcp_request_sock_ipv4_ops;
#endif
tcp_clear_options(&amp;tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp-&gt;rx_opt.user_mss;
tcp_parse_options(skb, &amp;tmp_opt, 0, want_cookie ? NULL : &amp;foc);
if (want_cookie &amp;&amp; !tmp_opt.saw_tstamp)
tcp_clear_options(&amp;tmp_opt);
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &amp;tmp_opt, skb);
ireq = inet_rsk(req); // 直接转换成 inet_request_sock 对象
ireq-&gt;loc_addr = daddr;
ireq-&gt;rmt_addr = saddr;
ireq-&gt;no_srccheck = inet_sk(sk)-&gt;transparent;
ireq-&gt;opt = tcp_v4_save_options(skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
TCP_ECN_create_request(req, skb, sock_net(sk));
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &amp;req-&gt;mss);
req-&gt;cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
* accepting new connection request.
*
* If &quot;isn&quot; is not zero, this request hit alive
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &amp;&amp;
tcp_death_row.sysctl_tw_recycle &amp;&amp;
(dst = inet_csk_route_req(sk, &amp;fl4, req)) != NULL &amp;&amp;
fl4.daddr == saddr) {
if (!tcp_peer_is_proven(req, dst, true)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &amp;&amp;
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) &lt;
(sysctl_max_syn_backlog &gt;&gt; 2)) &amp;&amp;
!tcp_peer_is_proven(req, dst, false)) { // 即半连接请求数量大于 3/4 的 sysctl_max_syn_backlog
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
LIMIT_NETDEBUG(KERN_DEBUG pr_fmt(&quot;drop open request from %pI4/%u\n&quot;),
&amp;saddr, ntohs(tcp_hdr(skb)-&gt;source));
goto drop_and_release;
}
isn = tcp_v4_init_sequence(skb);
}
tcp_rsk(req)-&gt;snt_isn = isn;
if (dst == NULL) {
dst = inet_csk_route_req(sk, &amp;fl4, req);
if (dst == NULL)
goto drop_and_free;
}
do_fastopen = tcp_fastopen_check(sk, skb, req, &amp;foc, &amp;valid_foc);
/* We don't call tcp_v4_send_synack() directly because we need
* to make sure a child socket can be created successfully before
* sending back synack!
*
* XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
* (or better yet, call tcp_send_synack() in the child context
* directly, but will have to fix bunch of other code first)
* after syn_recv_sock() except one will need to first fix the
* latter to remove its dependency on the current implementation
* of tcp_v4_send_synack()-&gt;tcp_select_initial_window().
*/
skb_synack = tcp_make_synack(sk, dst, req,
fastopen_cookie_present(&amp;valid_foc) ? &amp;valid_foc : NULL); // 构建 syn+ack 包
if (skb_synack) {
__tcp_v4_send_check(skb_synack, ireq-&gt;loc_addr, ireq-&gt;rmt_addr);
skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
} else
goto drop_and_free;
if (likely(!do_fastopen)) {
int err;
err = ip_build_and_send_pkt(skb_synack, sk, ireq-&gt;loc_addr,
ireq-&gt;rmt_addr, ireq-&gt;opt); // 发送 syn+ack 包
err = net_xmit_eval(err);
if (err || want_cookie)
goto drop_and_free;
tcp_rsk(req)-&gt;snt_synack = tcp_time_stamp;
tcp_rsk(req)-&gt;listener = NULL;
/* Add the request_sock to the SYN table */
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); // 添加到半连接队列,并启动定时器
if (fastopen_cookie_present(&amp;foc) &amp;&amp; foc.len != 0)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
goto drop_and_free;
return 0;
drop_and_release:
dst_release(dst);
drop_and_free:
reqsk_free(req);
drop:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
</code></pre>
<h2>接收 ACK 包</h2>
<p>当客户端回复了 ACK 包时,在 tcp_hashinfo 中还是查找到公共的 TCP_LISTEN 的 sk:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
// ...
if (sk-&gt;sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb); // 查找半连接队列,创建子 sk,从半连接队列删除,添加到全连接队列。如果一切顺利,这里返回的就是子 sk
if (!nsk)
goto discard;
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) { // 继续
rsk = nsk;
goto reset;
}
return 0;
}
}
// ...
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &amp;prev, th-&gt;source,
iph-&gt;saddr, iph-&gt;daddr); // 在 sk 的半连接队列中查找,找到的是 request_sock 对象,而不是 sk
if (req)
return tcp_check_req(sk, skb, req, prev, false); // 继续
nsk = inet_lookup_established(sock_net(sk), &amp;tcp_hashinfo, iph-&gt;saddr,
th-&gt;source, iph-&gt;daddr, th-&gt;dest, inet_iif(skb));
if (nsk) {
if (nsk-&gt;sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th-&gt;syn)
sk = cookie_v4_check(sk, skb, &amp;(IPCB(skb)-&gt;opt));
#endif
return sk;
}
/*
* Process an incoming packet for SYN_RECV sockets represented as a
* request_sock. Normally sk is the listener socket but for TFO it
* points to the child socket.
*
* XXX (TFO) - The current impl contains a special check for ack
* validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
*
* We don't need to initialize tmp_opt.sack_ok as we don't use the results
*/
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev,
bool fastopen)
{
struct tcp_options_received tmp_opt;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) &amp; (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false;
BUG_ON(fastopen == (sk-&gt;sk_state == TCP_LISTEN));
tmp_opt.saw_tstamp = 0;
if (th-&gt;doff &gt; (sizeof(struct tcphdr)&gt;&gt;2)) {
tcp_parse_options(skb, &amp;tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req-&gt;ts_recent;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
*/
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)&lt;&lt;req-&gt;num_timeout);
paws_reject = tcp_paws_reject(&amp;tmp_opt, th-&gt;rst);
}
}
/* Check for pure retransmitted SYN. */
if (TCP_SKB_CB(skb)-&gt;seq == tcp_rsk(req)-&gt;rcv_isn &amp;&amp;
flg == TCP_FLAG_SYN &amp;&amp;
!paws_reject) {
/*
* RFC793 draws (Incorrectly! It was fixed in RFC1122)
* this case on figure 6 and figure 8, but formal
* protocol description says NOTHING.
* To be more exact, it says that we should send ACK,
* because this segment (at least, if it has no data)
* is out of window.
*
* CONCLUSION: RFC793 (even with RFC1122) DOES NOT
* describe SYN-RECV state. All the description
* is wrong, we cannot believe to it and should
* rely only on common sense and implementation
* experience.
*
* Enforce &quot;SYN-ACK&quot; according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*
* Note that even if there is new data in the SYN packet
* they will be thrown away too.
*
* Reset timer after retransmitting SYNACK, similar to
* the idea of fast retransmit in recovery.
*/
if (!inet_rtx_syn_ack(sk, req))
req-&gt;expires = min(TCP_TIMEOUT_INIT &lt;&lt; req-&gt;num_timeout,
TCP_RTO_MAX) + jiffies;
return NULL;
}
/* Further reproduces section &quot;SEGMENT ARRIVES&quot;
for state SYN-RECEIVED of RFC793.
It is broken, however, it does not work only
when SYNs are crossed.
You would think that SYN crossing is impossible here, since
we should have a SYN_SENT socket (from connect()) on our end,
but this is not true if the crossed SYNs were sent to both
ends by a malicious third party. We must defend against this,
and to do that we first verify the ACK (as per RFC793, page
36) and reset if it is invalid. Is this a true full defense?
To convince ourselves, let us consider a way in which the ACK
test can still pass in this 'malicious crossed SYNs' case.
Malicious sender sends identical SYNs (and thus identical sequence
numbers) to both A and B:
A: gets SYN, seq=7
B: gets SYN, seq=7
By our good fortune, both A and B select the same initial
send sequence number of seven :-)
A: sends SYN|ACK, seq=7, ack_seq=8
B: sends SYN|ACK, seq=7, ack_seq=8
So we are now A eating this SYN|ACK, ACK test passes. So
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
If icsk-&gt;icsk_accept_queue.rskq_defer_accept, we silently drop this
bare ACK. Otherwise, we create an established connection. Both
ends (listening sockets) accept the new incoming connection and try
to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
But generally, we should (RFC lies!) to accept ACK
from SYNACK both here and in tcp_rcv_state_process().
tcp_rcv_state_process() does not, hence, we do not too.
Note that the case is absolutely generic:
we cannot optimize anything here without
violating protocol. All the checks must be made
before attempt to create socket.
*/
/* RFC793 page 36: &quot;If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
* sent (the segment carries an unacceptable ACK) ...
* a reset is sent.&quot;
*
* Invalid ACK: reset will be sent by listening socket.
* Note that the ACK validity check for a Fast Open socket is done
* elsewhere and is checked directly against the child socket rather
* than req because user data may have been sent out.
*/
if ((flg &amp; TCP_FLAG_ACK) &amp;&amp; !fastopen &amp;&amp;
(TCP_SKB_CB(skb)-&gt;ack_seq !=
tcp_rsk(req)-&gt;snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
* is essentially ACK extension and too early or too late values
* should cause reset in unsynchronized states.
*/
/* RFC793: &quot;first check sequence number&quot;. */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)-&gt;seq, TCP_SKB_CB(skb)-&gt;end_seq,
tcp_rsk(req)-&gt;rcv_nxt, tcp_rsk(req)-&gt;rcv_nxt + req-&gt;rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg &amp; TCP_FLAG_RST))
req-&gt;rsk_ops-&gt;send_ack(sk, skb, req);
if (paws_reject)
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
return NULL;
}
/* In sequence, PAWS is OK. */
if (tmp_opt.saw_tstamp &amp;&amp; !after(TCP_SKB_CB(skb)-&gt;seq, tcp_rsk(req)-&gt;rcv_nxt))
req-&gt;ts_recent = tmp_opt.rcv_tsval;
if (TCP_SKB_CB(skb)-&gt;seq == tcp_rsk(req)-&gt;rcv_isn) {
/* Truncate SYN, it is out of window starting
at tcp_rsk(req)-&gt;rcv_isn + 1. */
flg &amp;= ~TCP_FLAG_SYN;
}
/* RFC793: &quot;second check the RST bit&quot; and
* &quot;fourth, check the SYN bit&quot;
*/
if (flg &amp; (TCP_FLAG_RST|TCP_FLAG_SYN)) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
/* ACK sequence verified above, just make sure ACK is
* set. If ACK not set, just silently drop the packet.
*
* XXX (TFO) - if we ever allow &quot;data after SYN&quot;, the
* following check needs to be removed.
*/
if (!(flg &amp; TCP_FLAG_ACK))
return NULL;
/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
if (tmp_opt.saw_tstamp &amp;&amp; tmp_opt.rcv_tsecr)
tcp_rsk(req)-&gt;snt_synack = tmp_opt.rcv_tsecr;
else if (req-&gt;num_retrans) /* don't take RTT sample if retrans &amp;&amp; ~TS */
tcp_rsk(req)-&gt;snt_synack = 0;
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
if (fastopen)
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
if (req-&gt;num_timeout &lt; inet_csk(sk)-&gt;icsk_accept_queue.rskq_defer_accept &amp;&amp;
TCP_SKB_CB(skb)-&gt;end_seq == tcp_rsk(req)-&gt;rcv_isn + 1) {
inet_rsk(req)-&gt;acked = 1;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
return NULL;
}
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
child = inet_csk(sk)-&gt;icsk_af_ops-&gt;syn_recv_sock(sk, skb, req, NULL); // 请看备注。这里是创建子 sock。TCP 场景即是 tcp_v4_syn_recv_sock
if (child == NULL)
goto listen_overflow;
inet_csk_reqsk_queue_unlink(sk, req, prev); // 难怪上面要保存 prev,原来是为了快速从 list 中删除
inet_csk_reqsk_queue_removed(sk, req); // 从半连接队列中删除
inet_csk_reqsk_queue_add(sk, req, child); // 添加到全连接队列。注意这里会设置 req-&gt;sk = child
return child;
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
inet_rsk(req)-&gt;acked = 1;
return NULL;
}
embryonic_reset:
if (!(flg &amp; TCP_FLAG_RST)) {
/* Received a bad SYN pkt - for TFO We try not to reset
* the local connection unless it's really necessary to
* avoid becoming vulnerable to outside attack aiming at
* resetting legit local connections.
*/
req-&gt;rsk_ops-&gt;send_reset(sk, skb);
} else if (fastopen) { /* received a valid RST pkt */
reqsk_fastopen_remove(sk, req, true);
tcp_reset(sk);
}
if (!fastopen) {
inet_csk_reqsk_queue_drop(sk, req, prev);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
}
return NULL;
}
EXPORT_SYMBOL(tcp_check_req);</code></pre>
<h3>创建子 socket</h3>
<p>书接上回,创建子 socket 调用的地方:</p>
<pre><code class="language-c">child = inet_csk(sk)-&gt;icsk_af_ops-&gt;syn_recv_sock(sk, skb, req, NULL); // TCP 场景即是 tcp_v4_syn_recv_sock</code></pre>
<p>继续:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif
struct ip_options_rcu *inet_opt;
if (sk_acceptq_is_full(sk)) // 全连接队列是否满了
goto exit_overflow;
newsk = tcp_create_openreq_child(sk, req, skb); // 创建 sk 并初始化
if (!newsk)
goto exit_nonewsk;
newsk-&gt;sk_gso_type = SKB_GSO_TCPV4;
inet_sk_rx_dst_set(newsk, skb);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
newinet-&gt;inet_daddr = ireq-&gt;rmt_addr;
newinet-&gt;inet_rcv_saddr = ireq-&gt;loc_addr;
newinet-&gt;inet_saddr = ireq-&gt;loc_addr;
inet_opt = ireq-&gt;opt;
rcu_assign_pointer(newinet-&gt;inet_opt, inet_opt);
ireq-&gt;opt = NULL;
newinet-&gt;mc_index = inet_iif(skb);
newinet-&gt;mc_ttl = ip_hdr(skb)-&gt;ttl;
newinet-&gt;rcv_tos = ip_hdr(skb)-&gt;tos;
inet_csk(newsk)-&gt;icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)-&gt;icsk_ext_hdr_len = inet_opt-&gt;opt.optlen;
newinet-&gt;inet_id = newtp-&gt;write_seq ^ jiffies;
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
goto put_and_exit;
} else {
/* syncookie case : see end of cookie_v4_check() */
}
sk_setup_caps(newsk, dst); // 设置能力
tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp-&gt;advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)-&gt;rx_opt.user_mss &amp;&amp;
tcp_sk(sk)-&gt;rx_opt.user_mss &lt; newtp-&gt;advmss)
newtp-&gt;advmss = tcp_sk(sk)-&gt;rx_opt.user_mss;
tcp_initialize_rcv_mss(newsk);
tcp_synack_rtt_meas(newsk, req);
newtp-&gt;total_retrans = req-&gt;num_retrans;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&amp;newinet-&gt;inet_daddr,
AF_INET);
if (key != NULL) {
/*
* We're using one, so create a matching key
* on the newsk structure. If we fail to get
* memory, then we end up not copying the key
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&amp;newinet-&gt;inet_daddr,
AF_INET, key-&gt;key, key-&gt;keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
if (__inet_inherit_port(sk, newsk) &lt; 0)
goto put_and_exit;
__inet_hash_nolisten(newsk, NULL); // 将 newsk 加入到 ehash 中,以便后面通过 skb 查找 sk
return newsk;
exit_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
put_and_exit:
inet_csk_prepare_forced_close(newsk);
tcp_done(newsk);
goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
</code></pre>
<h3>继续 tcp_child_process</h3>
<pre><code class="language-c">// file: net/ipv4/minisocks.c
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket.
*
* For the vast majority of cases child-&gt;sk_state will be TCP_SYN_RECV
* when entering. But other states are possible due to a race condition
* where after __inet_lookup_established() fails but before the listener
* locked is obtained, other packets cause the same connection to
* be created.
*/
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
{
int ret = 0;
int state = child-&gt;sk_state;
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
skb-&gt;len); // 经典的状态处理,此时状态为 TCP_SYN_RECV
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV &amp;&amp; child-&gt;sk_state != state)
parent-&gt;sk_data_ready(parent, 0); // parent 就是公共的监听的 sk
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
__sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
sock_put(child);
return ret;
}
EXPORT_SYMBOL(tcp_child_process);
// file: net/ipv4/tcp_input.c
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock *req;
int queued = 0;
tp-&gt;rx_opt.saw_tstamp = 0;
// ...
/* step 5: check the ACK field */
if (true) {
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT) &gt; 0; // 处理 ACK
switch (sk-&gt;sk_state) {
case TCP_SYN_RECV:
if (acceptable) {
/* Once we leave TCP_SYN_RECV, we no longer
* need req so release it.
*/
if (req) {
tcp_synack_rtt_meas(sk, req);
tp-&gt;total_retrans = req-&gt;num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
/* Make sure socket is routed, for
* correct metrics.
*/
icsk-&gt;icsk_af_ops-&gt;rebuild_header(sk);
tcp_init_congestion_control(sk);
tcp_mtup_init(sk);
tcp_init_buffer_space(sk);
tp-&gt;copied_seq = tp-&gt;rcv_nxt;
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED); // 更新状态(此处为子连接)
sk-&gt;sk_state_change(sk);
/* Note, that this wakeup is only for marginal
* crossed SYN case. Passively open sockets
* are not waked up, because sk-&gt;sk_sleep ==
* NULL and sk-&gt;sk_socket == NULL.
*/
if (sk-&gt;sk_socket)
sk_wake_async(sk,
SOCK_WAKE_IO, POLL_OUT);
tp-&gt;snd_una = TCP_SKB_CB(skb)-&gt;ack_seq;
tp-&gt;snd_wnd = ntohs(th-&gt;window) &lt;&lt;
tp-&gt;rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)-&gt;seq);
if (tp-&gt;rx_opt.tstamp_ok)
tp-&gt;advmss -= TCPOLEN_TSTAMP_ALIGNED;
if (req) {
/* Re-arm the timer because data may
* have been sent out. This is similar
* to the regular data transmission case
* when new data has just been ack'ed.
*
* (TFO) - we could try to be more
* aggressive and retranmitting any data
* sooner based on when they were sent
* out.
*/
tcp_rearm_rto(sk);
} else
tcp_init_metrics(sk);
tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
tp-&gt;lsndtime = tcp_time_stamp;
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
} else {
return 1;
}
break;
}</code></pre>