# TCP Packet Reception

## Overview

Call tree:

```c
tcp_v4_rcv        // look up the sk from the 5-tuple etc.; try the prequeue; if the socket is owned by a user process, queue the skb on the backlog
  tcp_v4_do_rcv   // dispatch on the sk's current state
```

For packets that are locally delivered, the tail end of IP processing looks up the next handler via `inet_protos[protocol]`; in other words, the packet is handed up the stack according to its protocol. For TCP over IPv4 that handler is `tcp_v4_rcv`.

> Reference: https://blog.csdn.net/xiaoyu_750516366/article/details/85539495

## Analysis

```c
// file: net/ipv4/tcp_ipv4.c
/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;
    int ret;
    struct net *net = dev_net(skb->dev);

    if (skb->pkt_type != PACKET_HOST)
        goto discard_it;

    /* Count it even if it's bad */
    TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

    if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
        goto discard_it;

    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        goto bad_packet;
    if (!pskb_may_pull(skb, th->doff * 4))
        goto discard_it;

    /* An explanation is required here, I think.
     * Packet length and doff are validated by header prediction,
     * provided case of th->doff==0 is eliminated.
     * So, we defer the checks. */
    if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
        goto csum_error;

    th = tcp_hdr(skb);
    iph = ip_hdr(skb);
    TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                skb->len - th->doff * 4);
    TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    TCP_SKB_CB(skb)->when = 0;
    TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
    TCP_SKB_CB(skb)->sacked = 0;

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); // look up the sk from the 5-tuple etc.; note that tcp_hashinfo contains several hash tables, worth studying in detail
    if (!sk)
        goto no_tcp_socket;

process:
    if (sk->sk_state == TCP_TIME_WAIT)
        goto do_time_wait;

    if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
        NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
        goto discard_and_relse;
    }

    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
        goto discard_and_relse;
    nf_reset(skb);

    if (tcp_filter(sk, skb))
        goto discard_and_relse;
    th = (const struct tcphdr *)skb->data;
    iph = ip_hdr(skb);

    skb->dev = NULL;

    bh_lock_sock_nested(sk);
    ret = 0;
    // (note borrowed from the web): whether a process currently owns this socket decides which branch is taken.
    // In code terms: while tcp_recvmsg() holds lock_sock() only the else branch can run; after release_sock() the if branch runs.
    if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
        struct tcp_sock *tp = tcp_sk(sk);
        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
            tp->ucopy.dma_chan = net_dma_find_channel();
        if (tp->ucopy.dma_chan)
            ret = tcp_v4_do_rcv(sk, skb);
        else
#endif
        {
            if (!tcp_prequeue(sk, skb)) // try the prequeue: if the application is waiting for data, let it finish the processing and copying, so the softirq can return quickly
                ret = tcp_v4_do_rcv(sk, skb); // continue
        }
    } else if (unlikely(sk_add_backlog(sk, skb,
                                       sk->sk_rcvbuf + sk->sk_sndbuf))) { // if the application owns the socket, queue to the backlog; the softirq cannot wait and must finish as soon as possible
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
    bh_unlock_sock(sk);

    sock_put(sk);

    return ret;

no_tcp_socket:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
        goto discard_it;

    if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
        TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
        TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
    } else {
        tcp_v4_send_reset(NULL, skb);
    }

discard_it:
    /* Discard frame. */
    kfree_skb(skb);
    return 0;

discard_and_relse:
    sock_put(sk);
    goto discard_it;

do_time_wait:
    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto discard_it;
    }

    if (skb->len < (th->doff << 2)) {
        inet_twsk_put(inet_twsk(sk));
        goto bad_packet;
    }
    if (tcp_checksum_complete(skb)) {
        inet_twsk_put(inet_twsk(sk));
        goto csum_error;
    }
    switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    case TCP_TW_SYN: {
        struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                &tcp_hashinfo,
                                                iph->saddr, th->source,
                                                iph->daddr, th->dest,
                                                inet_iif(skb));
        if (sk2) {
            inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
            inet_twsk_put(inet_twsk(sk));
            sk = sk2;
            goto process;
        }
        /* Fall through to ACK */
    }
    case TCP_TW_ACK:
        tcp_v4_timewait_ack(sk, skb);
        break;
    case TCP_TW_RST:
        goto no_tcp_socket;
    case TCP_TW_SUCCESS:;
    }
    goto discard_it;
}
```
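To make concrete which process the `sock_owned_by_user()` / `tcp_prequeue()` / backlog split is talking about, here is a minimal user-space blocking receiver (illustrative only; port 8080 is a placeholder and error handling is omitted). While it sleeps inside `recv()` it is waiting in `tcp_recvmsg()` and can be handed work via the prequeue or woken by `sk_data_ready`; while it actively holds the socket lock inside the receive path, new segments go to the backlog instead.

```c
/* Minimal blocking TCP receiver (user space, illustrative only). */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port   = htons(8080),                      /* placeholder port */
        .sin_addr   = { .s_addr = htonl(INADDR_ANY) },
    };

    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(lfd, 128);

    int cfd = accept(lfd, NULL, NULL);                  /* blocks until a connection completes */
    for (;;) {
        ssize_t n = recv(cfd, buf, sizeof(buf), 0);     /* blocks in tcp_recvmsg() until woken */
        if (n <= 0)
            break;
        printf("got %zd bytes\n", n);
    }
    close(cfd);
    close(lfd);
    return 0;
}
```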
`__inet_lookup_skb` is an important function here: it is responsible for finding the sk that matches the skb (i.e. locating the connection). Let's look at it first:

```c
// file: include/net/inet_hashtables.h
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                                             struct sk_buff *skb,
                                             const __be16 sport,
                                             const __be16 dport)
{
    struct sock *sk = skb_steal_sock(skb);
    const struct iphdr *iph = ip_hdr(skb);

    if (sk)
        return sk;
    else
        return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
                             iph->saddr, sport,
                             iph->daddr, dport, inet_iif(skb));
}

static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif)
{
    u16 hnum = ntohs(dport);
    struct sock *sk = __inet_lookup_established(net, hashinfo,
                                                saddr, sport, daddr, hnum, dif); // first search the "established" hash table: ESTABLISHED connections, then TIME_WAIT ones. Five values go into the hash: net plus the 4-tuple
    return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
                                         daddr, hnum, dif); // then search the listener hash table; only net and hnum go into the hash, but the bound IP and bound interface are checked
}
```
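To make that two-step lookup concrete, here is a much-simplified user-space sketch. Every name in it (`flow_key`, `fake_sock`, `est_table`, `listen_table`, the toy hash) is made up for illustration; the real tables live inside `tcp_hashinfo`, and the real hash also mixes in the network namespace and checks bound IP/interface for listeners.

```c
/* Simplified sketch of the established-then-listener lookup (hypothetical names). */
#include <stddef.h>
#include <stdint.h>

struct flow_key {                      /* the 4-tuple */
    uint32_t saddr, daddr;
    uint16_t sport, dport;
};

struct fake_sock {
    struct flow_key  key;              /* established entry: full 4-tuple */
    uint16_t         bound_port;       /* listener entry: local port only */
    struct fake_sock *next;            /* hash-chain link */
};

#define HASH_SIZE 256
static struct fake_sock *est_table[HASH_SIZE];     /* established + time-wait */
static struct fake_sock *listen_table[HASH_SIZE];  /* listeners, keyed by port */

static unsigned hash_4tuple(const struct flow_key *k)
{
    /* toy hash; the kernel uses jhash over netns + 4-tuple */
    return (k->saddr ^ k->daddr ^ k->sport ^ k->dport) % HASH_SIZE;
}

static struct fake_sock *lookup_sock(const struct flow_key *k)
{
    /* phase 1: exact 4-tuple match in the established table */
    for (struct fake_sock *s = est_table[hash_4tuple(k)]; s; s = s->next)
        if (s->key.saddr == k->saddr && s->key.daddr == k->daddr &&
            s->key.sport == k->sport && s->key.dport == k->dport)
            return s;

    /* phase 2: fall back to a listener hashed only by the local port */
    for (struct fake_sock *s = listen_table[k->dport % HASH_SIZE]; s; s = s->next)
        if (s->bound_port == k->dport)
            return s;                  /* the kernel additionally checks bound IP / ifindex here */

    return NULL;
}

int main(void)
{
    struct fake_sock listener = { .bound_port = 80 };
    listen_table[80 % HASH_SIZE] = &listener;

    struct flow_key k = { .saddr = 0x0a000001, .daddr = 0x0a000002,
                          .sport = 40000, .dport = 80 };
    /* no established entry exists, so the lookup falls back to the listener */
    return lookup_sock(&k) == &listener ? 0 : 1;
}
```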
Now on to `tcp_v4_do_rcv`:

```c
// file: net/ipv4/tcp_ipv4.c
/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ // connection established
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                dst->ops->check(dst, 0) == NULL) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { // continue
            rsk = sk;
            goto reset;
        }
        return 0;
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) { // listening state
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    } else
        sock_rps_save_rxhash(sk, skb);

    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
```
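`tcp_v4_do_rcv` branches purely on `sk->sk_state`. From user space, one way to peek at that same state machine is the Linux-specific `TCP_INFO` socket option; the small example below (illustrative, error handling trimmed) reads `tcpi_state`, which carries the same TCP_CLOSE / TCP_LISTEN / TCP_ESTABLISHED values the kernel dispatches on.

```c
/* Observe the kernel's TCP state for a socket via TCP_INFO (Linux-specific). */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_tcp_state(int fd)
{
    struct tcp_info info;
    socklen_t len = sizeof(info);

    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
        printf("fd %d tcp state: %u\n", fd, info.tcpi_state);
    else
        perror("getsockopt(TCP_INFO)");
}

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in a = { .sin_family = AF_INET };   /* port 0: kernel picks one */

    bind(fd, (struct sockaddr *)&a, sizeof(a));
    print_tcp_state(fd);    /* TCP_CLOSE before listen() */
    listen(fd, 1);
    print_tcp_state(fd);    /* TCP_LISTEN afterwards */
    return 0;
}
```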
Next, `tcp_rcv_established`:

```c
// file: net/ipv4/tcp_input.c
/*
 *	TCP receive function for the ESTABLISHED state.
 *
 *	It is split into a fast path and a slow path. The fast path is
 * 	disabled when:
 *	- A zero window was announced from us - zero window probing
 *        is only handled properly in the slow path.
 *	- Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags)
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *	- Unexpected TCP option.
 *
 *	When these conditions are not satisfied it drops into a standard
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in
 *	tcp_data_queue when everything is OK.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        const struct tcphdr *th, unsigned int len)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (unlikely(sk->sk_rx_dst == NULL))
        inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
    /*
     *	Header prediction.
     *	The code loosely follows the one in the famous
     *	"30 instruction TCP receive" Van Jacobson mail.
     *
     *	Van's trick is to deposit buffers into socket queue
     *	on a device interrupt, to call tcp_recv function
     *	on the receive process context and checksum and copy
     *	the buffer to user space. smart...
     *
     *	Our current scheme is not silly either but we take the
     *	extra cost of the net_bh soft interrupt processing...
     *	We do checksum and copy also but from device to kernel.
     */

    tp->rx_opt.saw_tstamp = 0;

    /*	pred_flags is 0xS?10 << 16 + snd_wnd
     *	if header_prediction is to be made
     *	'S' will always be tp->tcp_header_len >> 2
     *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
     *	turn it off	(when there are holes in the receive
     *	space for instance)
     *	PSH flag is ignored.
     */

    if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
        TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
        !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
        int tcp_header_len = tp->tcp_header_len;

        /* Timestamp header prediction: tcp_header_len
         * is automatically equal to th->doff*4 due to pred_flags
         * match.
         */

        /* Check timestamp */
        if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
            /* No? Slow path! */
            if (!tcp_parse_aligned_timestamp(tp, th))
                goto slow_path;

            /* If PAWS failed, check it more carefully in slow path */
            if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
                goto slow_path;

            /* DO NOT update ts_recent here, if checksum fails
             * and timestamp was corrupted part, it will result
             * in a hung connection since we will drop all
             * future packets due to the PAWS test.
             */
        }

        if (len <= tcp_header_len) {
            /* Bulk data transfer: sender */
            if (len == tcp_header_len) {
                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                /* We know that such packets are checksummed
                 * on entry.
                 */
                tcp_ack(sk, skb, 0);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
                return 0;
            } else { /* Header too small */
                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
                goto discard;
            }
        } else {
            int eaten = 0;
            int copied_early = 0;
            bool fragstolen = false;

            if (tp->copied_seq == tp->rcv_nxt &&
                len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
                if (tp->ucopy.task == current &&
                    sock_owned_by_user(sk) &&
                    tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
                    copied_early = 1;
                    eaten = 1;
                }
#endif
                if (tp->ucopy.task == current &&
                    sock_owned_by_user(sk) && !copied_early) {
                    __set_current_state(TASK_RUNNING);

                    if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
                        eaten = 1;
                }
                if (eaten) {
                    /* Predicted packet is in window by definition.
                     * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                     * Hence, check seq<=rcv_wup reduces to:
                     */
                    if (tcp_header_len ==
                        (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                        tp->rcv_nxt == tp->rcv_wup)
                        tcp_store_ts_recent(tp);

                    tcp_rcv_rtt_measure_ts(sk, skb);

                    __skb_pull(skb, tcp_header_len);
                    tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
                }
                if (copied_early)
                    tcp_cleanup_rbuf(sk, skb->len);
            }
            if (!eaten) {
                if (tcp_checksum_complete_user(sk, skb))
                    goto csum_error;

                if ((int)skb->truesize > sk->sk_forward_alloc)
                    goto step5;

                /* Predicted packet is in window by definition.
                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                 * Hence, check seq<=rcv_wup reduces to:
                 */
                if (tcp_header_len ==
                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                    tp->rcv_nxt == tp->rcv_wup)
                    tcp_store_ts_recent(tp);

                tcp_rcv_rtt_measure_ts(sk, skb);

                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);

                /* Bulk data transfer: receiver */
                eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
                                      &fragstolen); // put the skb onto the sk_receive_queue
            }

            tcp_event_data_recv(sk, skb);

            if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
                /* Well, only one small jumplet in fast path... */
                tcp_ack(sk, skb, FLAG_DATA);
                tcp_data_snd_check(sk);
                if (!inet_csk_ack_scheduled(sk))
                    goto no_ack;
            }

            if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
                __tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
            if (copied_early)
                __skb_queue_tail(&sk->sk_async_wait_queue, skb);
            else
#endif
            if (eaten)
                kfree_skb_partial(skb, fragstolen);
            sk->sk_data_ready(sk, 0); // this is the sock_def_readable function mentioned earlier, installed in sock_init_data(); its main job is to wake up processes blocked on the socket
            return 0;
        }
    }

slow_path:
    if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
        goto csum_error;

    if (!th->ack && !th->rst)
        goto discard;

    /*
     *	Standard slow path.
     */

    if (!tcp_validate_incoming(sk, skb, th, 1))
        return 0;

step5:
    if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
        goto discard;

    tcp_rcv_rtt_measure_ts(sk, skb);

    /* Process urgent data. */
    tcp_urg(sk, skb, th);

    /* step 7: process the segment text */
    tcp_data_queue(sk, skb);

    tcp_data_snd_check(sk);
    tcp_ack_snd_check(sk);
    return 0;

csum_error:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);

discard:
    __kfree_skb(skb);
    return 0;
}
EXPORT_SYMBOL(tcp_rcv_established);

static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
                                      bool *fragstolen)
{
    int eaten;
    struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);

    __skb_pull(skb, hdrlen);
    eaten = (tail &&
             tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; // try to coalesce with the tail skb
    tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
    if (!eaten) {
        __skb_queue_tail(&sk->sk_receive_queue, skb); // append the skb to the receive queue
        skb_set_owner_r(skb, sk);
    }
    return eaten;
}
```

Next, `sock_def_readable`:

```c
// file: net/core/sock.c
static void sock_def_readable(struct sock *sk, int len)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    if (wq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}

// file: include/net/sock.h
static inline bool wq_has_sleeper(struct socket_wq *wq)
{
    /* We need to be sure we are in sync with the
     * add_wait_queue modifications to the wait queue.
     *
     * This memory barrier is paired in the sock_poll_wait.
     */
    smp_mb();
    return wq && waitqueue_active(&wq->wait);
}

// file: include/linux/wait.h
static inline int waitqueue_active(wait_queue_head_t *q)
{
    return !list_empty(&q->task_list);
}
```
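The callbacks that `sock_def_readable` runs are what ultimately make a blocked `recv()`, `poll()` or `epoll_wait()` return in user space; for eventpoll, the wait entry's callback is `ep_poll_callback` (see the `__wake_up_common` walk in the next listing). A minimal epoll read loop, purely as an illustration of the user-space side, assuming `cfd` is an already-connected non-blocking TCP socket:

```c
/* Minimal epoll read loop (user space, illustrative; assumes cfd exists). */
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

static void epoll_read_loop(int cfd)
{
    char buf[4096];
    int epfd = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = cfd };

    epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev);   /* registers an entry on the socket's wait queue */

    for (;;) {
        struct epoll_event ready;
        int n = epoll_wait(epfd, &ready, 1, -1);        /* sleeps until the wakeup above fires */
        if (n <= 0)
            break;
        ssize_t got = read(ready.data.fd, buf, sizeof(buf));
        if (got <= 0)
            break;
        printf("readable: %zd bytes\n", got);
    }
    close(epfd);
}
```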
The wake-up path:

```c
// file: include/linux/wait.h
#define wake_up_interruptible_sync_poll(x, m)				\
	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))	// note the literal 1 here (nr_exclusive)

// file: kernel/sched/core.c
/**
 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: opaque value to be passed to wakeup targets
 *
 * The sync wakeup differs that the waker knows that it will schedule
 * away soon, so while the target thread will be woken up, it will not
 * be migrated to another CPU - ie. the two threads are 'synchronized'
 * with each other. This can prevent needless bouncing between CPUs.
 *
 * On UP it can prevent extra preemption.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
{
    unsigned long flags;
    int wake_flags = WF_SYNC;

    if (unlikely(!q))
        return;

    if (unlikely(!nr_exclusive))
        wake_flags = 0;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, wake_flags, key); // nr_exclusive is how many exclusive waiters may be woken; 0 means no limit. Above it was set to 1.
    spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);

/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags; // note: this only picks up a wait entry and runs its callback; nothing is removed from task_list here

        if (curr->func(curr, mode, wake_flags, key) && // call the entry's func: autoremove_wake_function for a plain socket, ep_poll_callback for eventpoll
                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

// file: include/linux/wait.h
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE	0x01
    void *private;
    wait_queue_func_t func; // in essence it all comes down to this callback
    struct list_head task_list;
};
```

For a plain socket the callback is `autoremove_wake_function`:

```c
// file: kernel/wait.c
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int ret = default_wake_function(wait, mode, sync, key);

    if (ret)
        list_del_init(&wait->task_list); // on a successful wake-up, remove the entry from task_list
    return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                          void *key)
{
    return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
```
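The net effect of `nr_exclusive = 1` plus `WQ_FLAG_EXCLUSIVE` is that every non-exclusive entry on the queue is notified, but at most one exclusive sleeper is woken per wakeup. The tiny self-contained sketch below illustrates only that rule; all of its names (`wait_entry`, `wake_up_sketch`, `demo_wake`) are made up, and it has no locking or scheduler, so it is not a model of the real kernel code beyond the loop structure.

```c
/* User-space sketch of the exclusive-wakeup rule in __wake_up_common (hypothetical names). */
#include <stdbool.h>
#include <stdio.h>

#define WQF_EXCLUSIVE 0x01

struct wait_entry {
    unsigned flags;
    bool (*func)(struct wait_entry *we);   /* returns true if a task was woken */
    const char *name;
    struct wait_entry *next;
};

static bool demo_wake(struct wait_entry *we)
{
    printf("woke %s\n", we->name);
    return true;
}

/* Wake every non-exclusive entry, but at most nr_exclusive exclusive ones. */
static void wake_up_sketch(struct wait_entry *head, int nr_exclusive)
{
    for (struct wait_entry *curr = head; curr; curr = curr->next) {
        unsigned flags = curr->flags;
        if (curr->func(curr) && (flags & WQF_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

int main(void)
{
    struct wait_entry r2 = { WQF_EXCLUSIVE, demo_wake, "reader-2", NULL };
    struct wait_entry r1 = { WQF_EXCLUSIVE, demo_wake, "reader-1", &r2 };
    struct wait_entry ep = { 0,             demo_wake, "epoll-entry", &r1 };

    /* epoll-entry and reader-1 are woken; reader-2 is left sleeping */
    wake_up_sketch(&ep, 1);
    return 0;
}
```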
