TCP Packet Reception
<h2>Overview</h2>
<p>Call tree:</p>
<pre><code class="language-c">tcp_v4_rcv        // look up the sk from the 5-tuple etc.; try to put the skb on the prequeue; if a user process owns the socket, put it on the backlog
    tcp_v4_do_rcv // handle the segment according to the state of the sk</code></pre>
<p>For packets selected for local delivery, the tail end of IP processing looks up the next handler in <code>inet_protos[protocol]</code>; in other words, the packet is handed up to the transport layer according to its protocol number.
For TCP over IPv4 that handler is <code>tcp_v4_rcv</code>.</p>
<p>Reference: <a href="https://blog.csdn.net/xiaoyu_750516366/article/details/85539495">https://blog.csdn.net/xiaoyu_750516366/article/details/85539495</a></p>
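<p>As background, this dispatch is wired up at boot: TCP registers a <code>struct net_protocol</code> whose <code>handler</code> is <code>tcp_v4_rcv</code>, and <code>ip_local_deliver_finish()</code> later finds it through <code>inet_protos[IPPROTO_TCP]</code>. A sketch, abridged from <code>net/ipv4/af_inet.c</code> of the same kernel generation (the exact field set varies a little between versions):</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c (abridged)
static const struct net_protocol tcp_protocol = {
    .early_demux = tcp_v4_early_demux,
    .handler     = tcp_v4_rcv,   // called from ip_local_deliver_finish() via inet_protos[]
    .err_handler = tcp_v4_err,   // invoked for ICMP errors that quote a TCP segment
    .no_policy   = 1,
    .netns_ok    = 1,
};

static int __init inet_init(void)
{
    ...
    if (inet_add_protocol(&amp;tcp_protocol, IPPROTO_TCP) &lt; 0)
        pr_crit(&quot;%s: Cannot add TCP protocol\n&quot;, __func__);
    ...
}</code></pre>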
<h2>Analysis</h2>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/*
* From tcp_input.c
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
int ret;
struct net *net = dev_net(skb-&gt;dev);
if (skb-&gt;pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = tcp_hdr(skb);
if (th-&gt;doff &lt; sizeof(struct tcphdr) / 4)
goto bad_packet;
if (!pskb_may_pull(skb, th-&gt;doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th-&gt;doff==0 is eliminated.
* So, we defer the checks. */
if (!skb_csum_unnecessary(skb) &amp;&amp; tcp_v4_checksum_init(skb))
goto csum_error;
th = tcp_hdr(skb);
iph = ip_hdr(skb);
TCP_SKB_CB(skb)-&gt;seq = ntohl(th-&gt;seq);
TCP_SKB_CB(skb)-&gt;end_seq = (TCP_SKB_CB(skb)-&gt;seq + th-&gt;syn + th-&gt;fin +
skb-&gt;len - th-&gt;doff * 4);
TCP_SKB_CB(skb)-&gt;ack_seq = ntohl(th-&gt;ack_seq);
TCP_SKB_CB(skb)-&gt;when = 0;
TCP_SKB_CB(skb)-&gt;ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)-&gt;sacked = 0;
sk = __inet_lookup_skb(&amp;tcp_hashinfo, skb, th-&gt;source, th-&gt;dest); // look up the sk from the 5-tuple etc. Note that tcp_hashinfo holds several hash tables; they are worth a closer look.
if (!sk)
goto no_tcp_socket;
process:
if (sk-&gt;sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (unlikely(iph-&gt;ttl &lt; inet_sk(sk)-&gt;min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
th = (const struct tcphdr *)skb-&gt;data;
iph = ip_hdr(skb);
skb-&gt;dev = NULL;
bh_lock_sock_nested(sk);
ret = 0;
// (Comment borrowed from the web): whether a user process currently owns this socket decides which path the segment takes.
// In code terms: between lock_sock and release_sock in tcp_recvmsg only the else branch below can be taken; after release_sock the if branch is taken again.
if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp-&gt;ucopy.dma_chan &amp;&amp; tp-&gt;ucopy.pinned_list)
tp-&gt;ucopy.dma_chan = net_dma_find_channel();
if (tp-&gt;ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
#endif
{
if (!tcp_prequeue(sk, skb)) // try to put the skb on the prequeue: if a reader is blocked waiting for data, let that process finish the protocol work, so the softirq can return right away.
ret = tcp_v4_do_rcv(sk, skb); // otherwise keep processing here (continued below)
}
} else if (unlikely(sk_add_backlog(sk, skb,
sk-&gt;sk_rcvbuf + sk-&gt;sk_sndbuf))) { // a user process owns the socket, so queue the skb on the backlog: the softirq cannot wait and must finish as quickly as possible.
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
bh_unlock_sock(sk);
sock_put(sk);
return ret;
no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
if (skb-&gt;len &lt; (th-&gt;doff &lt;&lt; 2) || tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
if (skb-&gt;len &lt; (th-&gt;doff &lt;&lt; 2)) {
inet_twsk_put(inet_twsk(sk));
goto bad_packet;
}
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb-&gt;dev),
&amp;tcp_hashinfo,
iph-&gt;saddr, th-&gt;source,
iph-&gt;daddr, th-&gt;dest,
inet_iif(skb));
if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &amp;tcp_death_row);
inet_twsk_put(inet_twsk(sk));
sk = sk2;
goto process;
}
/* Fall through to ACK */
}
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
goto no_tcp_socket;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}</code></pre>
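<p>To sum up the softirq side: a segment handed to <code>tcp_v4_rcv</code> ends up in one of three places. It is either processed immediately by <code>tcp_v4_do_rcv</code> (and, for in-order data, queued on <code>sk_receive_queue</code>), parked on the prequeue for the sleeping reader to process, or appended to the backlog when a user process currently owns the socket (to be replayed in <code>release_sock</code>). The prequeue decision looks roughly like this (a sketch abridged from the same file; statistics and the delayed-ACK timer adjustment are omitted):</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c (abridged sketch)
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    // Refuse (the caller then runs tcp_v4_do_rcv itself) if low-latency mode is on
    // or if no reader has registered itself in tp-&gt;ucopy (tcp_recvmsg sets
    // tp-&gt;ucopy.task before it goes to sleep).
    if (sysctl_tcp_low_latency || !tp-&gt;ucopy.task)
        return false;
    ...
    __skb_queue_tail(&amp;tp-&gt;ucopy.prequeue, skb);
    tp-&gt;ucopy.memory += skb-&gt;truesize;
    if (tp-&gt;ucopy.memory &gt; sk-&gt;sk_rcvbuf) {
        // Prequeue overflow: drain it right here in softirq context.
        struct sk_buff *skb1;
        while ((skb1 = __skb_dequeue(&amp;tp-&gt;ucopy.prequeue)) != NULL)
            sk_backlog_rcv(sk, skb1);    // -&gt; tcp_v4_do_rcv
        tp-&gt;ucopy.memory = 0;
    } else if (skb_queue_len(&amp;tp-&gt;ucopy.prequeue) == 1) {
        // First segment on the prequeue: wake the blocked reader.
        wake_up_interruptible_sync_poll(sk_sleep(sk),
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        ...
    }
    return true;
}</code></pre>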
<p>Back in <code>tcp_v4_rcv</code>, <code>__inet_lookup_skb</code> is an important helper: it finds the sk that the skb belongs to, i.e. the matching connection. Let's look at it first:</p>
<pre><code class="language-c">// file: include/net/inet_hashtables.h
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb,
const __be16 sport,
const __be16 dport)
{
struct sock *sk = skb_steal_sock(skb);
const struct iphdr *iph = ip_hdr(skb);
if (sk)
return sk;
else
return __inet_lookup(dev_net(skb_dst(skb)-&gt;dev), hashinfo,
iph-&gt;saddr, sport,
iph-&gt;daddr, dport, inet_iif(skb));
}
static inline struct sock *__inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
{
u16 hnum = ntohs(dport);
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif); // first search the established hash table; more precisely, ESTABLISHED sockets first, then TIME_WAIT sockets. Five values take part in the hash: the netns plus the 4-tuple.
return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
daddr, hnum, dif); // then search the listener hash table; only the netns and hnum take part in the hash, but the bound IP and bound interface are checked while walking the chain.
}
</code></pre>
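<p>For orientation, all of the tables that <code>__inet_lookup</code> walks live in <code>tcp_hashinfo</code>, a <code>struct inet_hashinfo</code>. An abridged view, assuming the layout in <code>include/net/inet_hashtables.h</code> of this kernel generation:</p>
<pre><code class="language-c">// file: include/net/inet_hashtables.h (abridged)
struct inet_hashinfo {
    // ESTABLISHED and TIME_WAIT sockets, hashed over (netns, saddr, sport, daddr, dport)
    struct inet_ehash_bucket      *ehash;
    spinlock_t                    *ehash_locks;
    unsigned int                  ehash_mask;
    unsigned int                  ehash_locks_mask;

    // bound local ports, used by bind()/inet_csk_get_port()
    struct inet_bind_hashbucket   *bhash;
    unsigned int                  bhash_size;

    // LISTEN sockets, hashed over (netns, local port) only
    struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE];
    ...
};

// file: net/ipv4/tcp_ipv4.c
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);</code></pre>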
<p>Next, <code>tcp_v4_do_rcv</code>:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
if (sk-&gt;sk_state == TCP_ESTABLISHED) { /* Fast path */ // established connection
struct dst_entry *dst = sk-&gt;sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
if (dst) {
if (inet_sk(sk)-&gt;rx_dst_ifindex != skb-&gt;skb_iif ||
dst-&gt;ops-&gt;check(dst, 0) == NULL) {
dst_release(dst);
sk-&gt;sk_rx_dst = NULL;
}
}
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb-&gt;len)) { // continued below
rsk = sk;
goto reset;
}
return 0;
}
if (skb-&gt;len &lt; tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
if (sk-&gt;sk_state == TCP_LISTEN) { // listening socket
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb-&gt;len)) {
rsk = sk;
goto reset;
}
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
</code></pre>
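<p>The comment at the top of <code>tcp_v4_do_rcv</code> (the socket's spinlock must be held) points at its second caller: segments that were pushed onto the backlog while a process owned the socket are replayed through <code>sk_backlog_rcv</code> (which for TCP resolves to <code>tcp_v4_do_rcv</code> via <code>tcp_prot.backlog_rcv</code>) when that process calls <code>release_sock</code>. Roughly, as a sketch abridged from <code>net/core/sock.c</code>; rcvbuf accounting and rescheduling details are omitted:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged sketch), called from release_sock() when the backlog is non-empty
static void __release_sock(struct sock *sk)
{
    struct sk_buff *skb = sk-&gt;sk_backlog.head;

    do {
        // Take the current backlog private, then drop the BH lock so softirqs
        // can keep appending to a fresh backlog while we work.
        sk-&gt;sk_backlog.head = sk-&gt;sk_backlog.tail = NULL;
        bh_unlock_sock(sk);

        do {
            struct sk_buff *next = skb-&gt;next;

            skb-&gt;next = NULL;
            sk_backlog_rcv(sk, skb);   // -&gt; tcp_v4_do_rcv(sk, skb)
            ...
            skb = next;
        } while (skb != NULL);

        bh_lock_sock(sk);
    } while ((skb = sk-&gt;sk_backlog.head) != NULL);
    ...
}</code></pre>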
<p>Continue with <code>tcp_rcv_established</code>:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_input.c
/*
* TCP receive function for the ESTABLISHED state.
*
* It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
* (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(sk-&gt;sk_rx_dst == NULL))
inet_csk(sk)-&gt;icsk_af_ops-&gt;sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
* &quot;30 instruction TCP receive&quot; Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp-&gt;rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 &lt;&lt; 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp-&gt;tcp_header_len &gt;&gt; 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) &amp; TCP_HP_BITS) == tp-&gt;pred_flags &amp;&amp;
TCP_SKB_CB(skb)-&gt;seq == tp-&gt;rcv_nxt &amp;&amp;
!after(TCP_SKB_CB(skb)-&gt;ack_seq, tp-&gt;snd_nxt)) {
int tcp_header_len = tp-&gt;tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th-&gt;doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp-&gt;rx_opt.rcv_tsval - tp-&gt;rx_opt.ts_recent) &lt; 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
if (len &lt;= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
} else { /* Header too small */
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
int copied_early = 0;
bool fragstolen = false;
if (tp-&gt;copied_seq == tp-&gt;rcv_nxt &amp;&amp;
len - tcp_header_len &lt;= tp-&gt;ucopy.len) {
#ifdef CONFIG_NET_DMA
if (tp-&gt;ucopy.task == current &amp;&amp;
sock_owned_by_user(sk) &amp;&amp;
tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
copied_early = 1;
eaten = 1;
}
#endif
if (tp-&gt;ucopy.task == current &amp;&amp;
sock_owned_by_user(sk) &amp;&amp; !copied_early) {
__set_current_state(TASK_RUNNING);
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
eaten = 1;
}
if (eaten) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) +
TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
}
if (copied_early)
tcp_cleanup_rbuf(sk, skb-&gt;len);
}
if (!eaten) {
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
if ((int)skb-&gt;truesize &gt; sk-&gt;sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
&amp;fragstolen); // queue the skb on the receive queue (sk_receive_queue)
}
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)-&gt;ack_seq != tp-&gt;snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
if (!copied_early || tp-&gt;rcv_nxt != tp-&gt;rcv_wup)
__tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
if (copied_early)
__skb_queue_tail(&amp;sk-&gt;sk_async_wait_queue, skb);
else
#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk-&gt;sk_data_ready(sk, 0); // i.e. sock_def_readable, mentioned earlier; it is installed in sock_init_data. Its main job: wake up processes blocked on this socket.
return 0;
}
}
slow_path:
if (len &lt; (th-&gt;doff &lt;&lt; 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
if (!th-&gt;ack &amp;&amp; !th-&gt;rst)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return 0;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) &lt; 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return 0;
csum_error:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL(tcp_rcv_established);
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&amp;sk-&gt;sk_receive_queue);
__skb_pull(skb, hdrlen);
eaten = (tail &amp;&amp;
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; // try to coalesce with the skb at the tail of the queue
tcp_sk(sk)-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;end_seq;
if (!eaten) {
__skb_queue_tail(&amp;sk-&gt;sk_receive_queue, skb); // append the skb to the tail of the receive queue
skb_set_owner_r(skb, sk);
}
return eaten;
}</code></pre>
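<p>Once the data is queued (or copied straight into the reader's iovec), the reader is notified through <code>sk-&gt;sk_data_ready</code>. That pointer is filled in when the socket is created; a reminder, abridged from <code>sock_init_data()</code>:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged)
void sock_init_data(struct socket *sock, struct sock *sk)
{
    ...
    sk-&gt;sk_state_change = sock_def_wakeup;
    sk-&gt;sk_data_ready   = sock_def_readable;   // invoked above as sk-&gt;sk_data_ready(sk, 0)
    sk-&gt;sk_write_space  = sock_def_write_space;
    sk-&gt;sk_error_report = sock_def_error_report;
    ...
}</code></pre>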
<p>Continue with <code>sock_def_readable</code>:</p>
<pre><code class="language-c">// file: net/core/sock.c
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk-&gt;sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&amp;wq-&gt;wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
// file: include/net/sock.h
static inline bool wq_has_sleeper(struct socket_wq *wq)
{
/* We need to be sure we are in sync with the
* add_wait_queue modifications to the wait queue.
*
* This memory barrier is paired in the sock_poll_wait.
*/
smp_mb();
return wq &amp;&amp; waitqueue_active(&amp;wq-&gt;wait);
}
// file: include/linux/wait.h
static inline int waitqueue_active(wait_queue_head_t *q)
{
return !list_empty(&amp;q-&gt;task_list);
}</code></pre>
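<p>The sleeper that <code>wq_has_sleeper</code> looks for is put on the queue by the reading process itself: when <code>tcp_recvmsg</code> runs out of data it sleeps in <code>sk_wait_data</code>, which hangs a wait entry on <code>sk_sleep(sk)</code>, i.e. <code>&amp;sk-&gt;sk_wq-&gt;wait</code>. A sketch, assuming the code in <code>net/core/sock.c</code> of this kernel generation:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged)
int sk_wait_data(struct sock *sk, long *timeo)
{
    int rc;
    DEFINE_WAIT(wait);   // wait.private = current, wait.func = autoremove_wake_function

    prepare_to_wait(sk_sleep(sk), &amp;wait, TASK_INTERRUPTIBLE);   // add the entry to sk-&gt;sk_wq-&gt;wait
    set_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
    // sk_wait_event releases the socket lock, sleeps until the condition holds
    // (or the timeout expires / a signal arrives), then takes the lock again.
    rc = sk_wait_event(sk, timeo, !skb_queue_empty(&amp;sk-&gt;sk_receive_queue));
    clear_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
    finish_wait(sk_sleep(sk), &amp;wait);
    return rc;
}</code></pre>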
<p>The wakeup:</p>
<pre><code class="language-c">// file: include/linux/wait.h
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) // note the nr_exclusive argument of 1 here
// file: kernel/sched/core.c
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
* be migrated to another CPU - ie. the two threads are 'synchronized'
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = WF_SYNC;
if (unlikely(!q))
return;
if (unlikely(!nr_exclusive))
wake_flags = 0;
spin_lock_irqsave(&amp;q-&gt;lock, flags);
__wake_up_common(q, mode, nr_exclusive, wake_flags, key); // nr_exclusive is how many exclusive waiters to wake; 0 means no limit. The caller above passed 1.
spin_unlock_irqrestore(&amp;q-&gt;lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &amp;q-&gt;task_list, task_list) {
unsigned flags = curr-&gt;flags;
// Note: this only picks up each wait entry and runs its callback; the entry is not removed from task_list here.
if (curr-&gt;func(curr, mode, wake_flags, key) &amp;&amp;
(flags &amp; WQ_FLAG_EXCLUSIVE) &amp;&amp; !--nr_exclusive) // call the entry's func: autoremove_wake_function for a plain blocking socket, ep_poll_callback for eventpoll
break;
}
}
// file: include/linux/wait.h
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
void *private;
wait_queue_func_t func; // in essence it all comes down to this callback
struct list_head task_list;
};</code></pre>
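<p>Where does <code>func</code> come from? The <code>DEFINE_WAIT</code> used by <code>sk_wait_data</code> above fills it in with <code>autoremove_wake_function</code> (epoll instead installs <code>ep_poll_callback</code> on the same queue). As defined in <code>include/linux/wait.h</code> of this kernel generation:</p>
<pre><code class="language-c">// file: include/linux/wait.h
#define DEFINE_WAIT_FUNC(name, function)                        \
    wait_queue_t name = {                                       \
        .private   = current,                                   \
        .func      = function,                                  \
        .task_list = LIST_HEAD_INIT((name).task_list),          \
    }

#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)</code></pre>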
<p>For a plain blocking socket the callback is <code>autoremove_wake_function</code>:</p>
<pre><code class="language-c">// file: kernel/wait.c
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
if (ret)
list_del_init(&amp;wait-&gt;task_list); // the wakeup succeeded, so remove the entry from task_list
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr-&gt;private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
</code></pre>
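<p>A final note on <code>nr_exclusive</code>: a waiter only counts against that limit if its entry carries <code>WQ_FLAG_EXCLUSIVE</code>. <code>prepare_to_wait</code>, used by <code>sk_wait_data</code> above, leaves the flag clear, so ordinary blocked readers are all woken; a blocking <code>accept()</code>, by contrast, sleeps in <code>inet_csk_wait_for_connect</code> via <code>prepare_to_wait_exclusive</code>, so with <code>nr_exclusive == 1</code> normally only one of several accepting processes is woken per new connection, avoiding a thundering herd. A sketch, abridged from <code>kernel/wait.c</code>:</p>
<pre><code class="language-c">// file: kernel/wait.c (abridged)
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
    unsigned long flags;

    wait-&gt;flags |= WQ_FLAG_EXCLUSIVE;   // this entry consumes one exclusive-wakeup slot
    spin_lock_irqsave(&amp;q-&gt;lock, flags);
    if (list_empty(&amp;wait-&gt;task_list))
        __add_wait_queue_tail(q, wait); // exclusive waiters are queued at the tail
    set_current_state(state);
    spin_unlock_irqrestore(&amp;q-&gt;lock, flags);
}</code></pre>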