TCP Packet Reception
<h2>Overview</h2>
<p>Call tree:</p>
<pre><code class="language-c">tcp_v4_rcv        // look up the sk from the 5-tuple etc.; try to put the skb on the prequeue; if a user process owns the socket, put it on the backlog
    tcp_v4_do_rcv // handle the segment according to the state of the sk</code></pre>
<p>For packets selected for local delivery, the tail end of IP processing looks up the next handler in <code>inet_protos[protocol]</code>; in other words, the packet is handed up to the transport layer according to its protocol number.
For TCP over IPv4 that handler is <code>tcp_v4_rcv</code>.</p>
<p>Reference: <a href="https://blog.csdn.net/xiaoyu_750516366/article/details/85539495">https://blog.csdn.net/xiaoyu_750516366/article/details/85539495</a></p>
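<p>As background, this dispatch is wired up at boot: TCP registers a <code>struct net_protocol</code> whose <code>handler</code> is <code>tcp_v4_rcv</code>, and <code>ip_local_deliver_finish()</code> later finds it through <code>inet_protos[IPPROTO_TCP]</code>. A sketch, abridged from <code>net/ipv4/af_inet.c</code> of the same kernel generation (the exact field set varies a little between versions):</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c (abridged)
static const struct net_protocol tcp_protocol = {
    .early_demux = tcp_v4_early_demux,
    .handler     = tcp_v4_rcv,   // called from ip_local_deliver_finish() via inet_protos[]
    .err_handler = tcp_v4_err,   // invoked for ICMP errors that quote a TCP segment
    .no_policy   = 1,
    .netns_ok    = 1,
};

static int __init inet_init(void)
{
    ...
    if (inet_add_protocol(&amp;tcp_protocol, IPPROTO_TCP) &lt; 0)
        pr_crit(&quot;%s: Cannot add TCP protocol\n&quot;, __func__);
    ...
}</code></pre>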
<h2>Analysis</h2>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/*
* From tcp_input.c
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
int ret;
struct net *net = dev_net(skb-&gt;dev);
if (skb-&gt;pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = tcp_hdr(skb);
if (th-&gt;doff &lt; sizeof(struct tcphdr) / 4)
goto bad_packet;
if (!pskb_may_pull(skb, th-&gt;doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th-&gt;doff==0 is eliminated.
* So, we defer the checks. */
if (!skb_csum_unnecessary(skb) &amp;&amp; tcp_v4_checksum_init(skb))
goto csum_error;
th = tcp_hdr(skb);
iph = ip_hdr(skb);
TCP_SKB_CB(skb)-&gt;seq = ntohl(th-&gt;seq);
TCP_SKB_CB(skb)-&gt;end_seq = (TCP_SKB_CB(skb)-&gt;seq + th-&gt;syn + th-&gt;fin +
skb-&gt;len - th-&gt;doff * 4);
TCP_SKB_CB(skb)-&gt;ack_seq = ntohl(th-&gt;ack_seq);
TCP_SKB_CB(skb)-&gt;when = 0;
TCP_SKB_CB(skb)-&gt;ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)-&gt;sacked = 0;
sk = __inet_lookup_skb(&amp;tcp_hashinfo, skb, th-&gt;source, th-&gt;dest); // look up the sk from the 5-tuple etc. Note that tcp_hashinfo holds several hash tables; they are worth a closer look.
if (!sk)
goto no_tcp_socket;
process:
if (sk-&gt;sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (unlikely(iph-&gt;ttl &lt; inet_sk(sk)-&gt;min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
th = (const struct tcphdr *)skb-&gt;data;
iph = ip_hdr(skb);
skb-&gt;dev = NULL;
bh_lock_sock_nested(sk);
ret = 0;
// (Comment borrowed from the web): whether a user process currently owns this socket decides which path the segment takes.
// In code terms: between lock_sock and release_sock in tcp_recvmsg only the else branch below can be taken; after release_sock the if branch is taken again.
if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp-&gt;ucopy.dma_chan &amp;&amp; tp-&gt;ucopy.pinned_list)
tp-&gt;ucopy.dma_chan = net_dma_find_channel();
if (tp-&gt;ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
#endif
{
if (!tcp_prequeue(sk, skb)) // try to put the skb on the prequeue: if a reader is blocked waiting for data, let that process finish the protocol work, so the softirq can return right away.
ret = tcp_v4_do_rcv(sk, skb); // otherwise keep processing here (continued below)
}
} else if (unlikely(sk_add_backlog(sk, skb,
sk-&gt;sk_rcvbuf + sk-&gt;sk_sndbuf))) { // a user process owns the socket, so queue the skb on the backlog: the softirq cannot wait and must finish as quickly as possible.
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
bh_unlock_sock(sk);
sock_put(sk);
return ret;
no_tcp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
if (skb-&gt;len &lt; (th-&gt;doff &lt;&lt; 2) || tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
if (skb-&gt;len &lt; (th-&gt;doff &lt;&lt; 2)) {
inet_twsk_put(inet_twsk(sk));
goto bad_packet;
}
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb-&gt;dev),
&amp;tcp_hashinfo,
iph-&gt;saddr, th-&gt;source,
iph-&gt;daddr, th-&gt;dest,
inet_iif(skb));
if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &amp;tcp_death_row);
inet_twsk_put(inet_twsk(sk));
sk = sk2;
goto process;
}
/* Fall through to ACK */
}
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
goto no_tcp_socket;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}</code></pre>
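<p>To sum up the softirq side: a segment handed to <code>tcp_v4_rcv</code> ends up in one of three places. It is either processed immediately by <code>tcp_v4_do_rcv</code> (and, for in-order data, queued on <code>sk_receive_queue</code>), parked on the prequeue for the sleeping reader to process, or appended to the backlog when a user process currently owns the socket (to be replayed in <code>release_sock</code>). The prequeue decision looks roughly like this (a sketch abridged from the same file; statistics and the delayed-ACK timer adjustment are omitted):</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c (abridged sketch)
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    // Refuse (the caller then runs tcp_v4_do_rcv itself) if low-latency mode is on
    // or if no reader has registered itself in tp-&gt;ucopy (tcp_recvmsg sets
    // tp-&gt;ucopy.task before it goes to sleep).
    if (sysctl_tcp_low_latency || !tp-&gt;ucopy.task)
        return false;
    ...
    __skb_queue_tail(&amp;tp-&gt;ucopy.prequeue, skb);
    tp-&gt;ucopy.memory += skb-&gt;truesize;
    if (tp-&gt;ucopy.memory &gt; sk-&gt;sk_rcvbuf) {
        // Prequeue overflow: drain it right here in softirq context.
        struct sk_buff *skb1;
        while ((skb1 = __skb_dequeue(&amp;tp-&gt;ucopy.prequeue)) != NULL)
            sk_backlog_rcv(sk, skb1);    // -&gt; tcp_v4_do_rcv
        tp-&gt;ucopy.memory = 0;
    } else if (skb_queue_len(&amp;tp-&gt;ucopy.prequeue) == 1) {
        // First segment on the prequeue: wake the blocked reader.
        wake_up_interruptible_sync_poll(sk_sleep(sk),
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        ...
    }
    return true;
}</code></pre>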
<p>Back in <code>tcp_v4_rcv</code>, <code>__inet_lookup_skb</code> is an important helper: it finds the sk that the skb belongs to, i.e. the matching connection. Let's look at it first:</p>
<pre><code class="language-c">// file: include/net/inet_hashtables.h
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb,
const __be16 sport,
const __be16 dport)
{
struct sock *sk = skb_steal_sock(skb);
const struct iphdr *iph = ip_hdr(skb);
if (sk)
return sk;
else
return __inet_lookup(dev_net(skb_dst(skb)-&gt;dev), hashinfo,
iph-&gt;saddr, sport,
iph-&gt;daddr, dport, inet_iif(skb));
}
static inline struct sock *__inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
{
u16 hnum = ntohs(dport);
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif); // first search the established hash table; more precisely, ESTABLISHED sockets first, then TIME_WAIT sockets. Five values take part in the hash: the netns plus the 4-tuple.
return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
daddr, hnum, dif); // then search the listener hash table; only the netns and hnum take part in the hash, but the bound IP and bound interface are checked while walking the chain.
}
</code></pre>
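<p>For orientation, all of the tables that <code>__inet_lookup</code> walks live in <code>tcp_hashinfo</code>, a <code>struct inet_hashinfo</code>. An abridged view, assuming the layout in <code>include/net/inet_hashtables.h</code> of this kernel generation:</p>
<pre><code class="language-c">// file: include/net/inet_hashtables.h (abridged)
struct inet_hashinfo {
    // ESTABLISHED and TIME_WAIT sockets, hashed over (netns, saddr, sport, daddr, dport)
    struct inet_ehash_bucket      *ehash;
    spinlock_t                    *ehash_locks;
    unsigned int                  ehash_mask;
    unsigned int                  ehash_locks_mask;

    // bound local ports, used by bind()/inet_csk_get_port()
    struct inet_bind_hashbucket   *bhash;
    unsigned int                  bhash_size;

    // LISTEN sockets, hashed over (netns, local port) only
    struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE];
    ...
};

// file: net/ipv4/tcp_ipv4.c
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);</code></pre>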
<p>Next, <code>tcp_v4_do_rcv</code>:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
if (sk-&gt;sk_state == TCP_ESTABLISHED) { /* Fast path */ // established connection
struct dst_entry *dst = sk-&gt;sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
if (dst) {
if (inet_sk(sk)-&gt;rx_dst_ifindex != skb-&gt;skb_iif ||
dst-&gt;ops-&gt;check(dst, 0) == NULL) {
dst_release(dst);
sk-&gt;sk_rx_dst = NULL;
}
}
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb-&gt;len)) { // continued below
rsk = sk;
goto reset;
}
return 0;
}
if (skb-&gt;len &lt; tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
if (sk-&gt;sk_state == TCP_LISTEN) { // listening socket
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb-&gt;len)) {
rsk = sk;
goto reset;
}
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
</code></pre>
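<p>The comment at the top of <code>tcp_v4_do_rcv</code> (the socket's spinlock must be held) points at its second caller: segments that were pushed onto the backlog while a process owned the socket are replayed through <code>sk_backlog_rcv</code> (which for TCP resolves to <code>tcp_v4_do_rcv</code> via <code>tcp_prot.backlog_rcv</code>) when that process calls <code>release_sock</code>. Roughly, as a sketch abridged from <code>net/core/sock.c</code>; rcvbuf accounting and rescheduling details are omitted:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged sketch), called from release_sock() when the backlog is non-empty
static void __release_sock(struct sock *sk)
{
    struct sk_buff *skb = sk-&gt;sk_backlog.head;

    do {
        // Take the current backlog private, then drop the BH lock so softirqs
        // can keep appending to a fresh backlog while we work.
        sk-&gt;sk_backlog.head = sk-&gt;sk_backlog.tail = NULL;
        bh_unlock_sock(sk);

        do {
            struct sk_buff *next = skb-&gt;next;

            skb-&gt;next = NULL;
            sk_backlog_rcv(sk, skb);   // -&gt; tcp_v4_do_rcv(sk, skb)
            ...
            skb = next;
        } while (skb != NULL);

        bh_lock_sock(sk);
    } while ((skb = sk-&gt;sk_backlog.head) != NULL);
    ...
}</code></pre>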
<p>Continue with <code>tcp_rcv_established</code>:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_input.c
/*
* TCP receive function for the ESTABLISHED state.
*
* It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
* (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(sk-&gt;sk_rx_dst == NULL))
inet_csk(sk)-&gt;icsk_af_ops-&gt;sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
* &quot;30 instruction TCP receive&quot; Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp-&gt;rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 &lt;&lt; 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp-&gt;tcp_header_len &gt;&gt; 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) &amp; TCP_HP_BITS) == tp-&gt;pred_flags &amp;&amp;
TCP_SKB_CB(skb)-&gt;seq == tp-&gt;rcv_nxt &amp;&amp;
!after(TCP_SKB_CB(skb)-&gt;ack_seq, tp-&gt;snd_nxt)) {
int tcp_header_len = tp-&gt;tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th-&gt;doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp-&gt;rx_opt.rcv_tsval - tp-&gt;rx_opt.ts_recent) &lt; 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
if (len &lt;= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
} else { /* Header too small */
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
int copied_early = 0;
bool fragstolen = false;
if (tp-&gt;copied_seq == tp-&gt;rcv_nxt &amp;&amp;
len - tcp_header_len &lt;= tp-&gt;ucopy.len) {
#ifdef CONFIG_NET_DMA
if (tp-&gt;ucopy.task == current &amp;&amp;
sock_owned_by_user(sk) &amp;&amp;
tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
copied_early = 1;
eaten = 1;
}
#endif
if (tp-&gt;ucopy.task == current &amp;&amp;
sock_owned_by_user(sk) &amp;&amp; !copied_early) {
__set_current_state(TASK_RUNNING);
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
eaten = 1;
}
if (eaten) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) +
TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
}
if (copied_early)
tcp_cleanup_rbuf(sk, skb-&gt;len);
}
if (!eaten) {
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
if ((int)skb-&gt;truesize &gt; sk-&gt;sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup &lt;= rcv_nxt.
* Hence, check seq&lt;=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &amp;&amp;
tp-&gt;rcv_nxt == tp-&gt;rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
&amp;fragstolen); // queue the skb on the receive queue (sk_receive_queue)
}
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)-&gt;ack_seq != tp-&gt;snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
if (!copied_early || tp-&gt;rcv_nxt != tp-&gt;rcv_wup)
__tcp_ack_snd_check(sk, 0);
no_ack:
#ifdef CONFIG_NET_DMA
if (copied_early)
__skb_queue_tail(&amp;sk-&gt;sk_async_wait_queue, skb);
else
#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk-&gt;sk_data_ready(sk, 0); // i.e. sock_def_readable, mentioned earlier; it is installed in sock_init_data. Its main job: wake up processes blocked on this socket.
return 0;
}
}
slow_path:
if (len &lt; (th-&gt;doff &lt;&lt; 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
if (!th-&gt;ack &amp;&amp; !th-&gt;rst)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return 0;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) &lt; 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return 0;
csum_error:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL(tcp_rcv_established);
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&amp;sk-&gt;sk_receive_queue);
__skb_pull(skb, hdrlen);
eaten = (tail &amp;&amp;
tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; // try to coalesce with the skb at the tail of the queue
tcp_sk(sk)-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;end_seq;
if (!eaten) {
__skb_queue_tail(&amp;sk-&gt;sk_receive_queue, skb); // append the skb to the tail of the receive queue
skb_set_owner_r(skb, sk);
}
return eaten;
}</code></pre>
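<p>Once the data is queued (or copied straight into the reader's iovec), the reader is notified through <code>sk-&gt;sk_data_ready</code>. That pointer is filled in when the socket is created; a reminder, abridged from <code>sock_init_data()</code>:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged)
void sock_init_data(struct socket *sock, struct sock *sk)
{
    ...
    sk-&gt;sk_state_change = sock_def_wakeup;
    sk-&gt;sk_data_ready   = sock_def_readable;   // invoked above as sk-&gt;sk_data_ready(sk, 0)
    sk-&gt;sk_write_space  = sock_def_write_space;
    sk-&gt;sk_error_report = sock_def_error_report;
    ...
}</code></pre>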
<p>Continue with <code>sock_def_readable</code>:</p>
<pre><code class="language-c">// file: net/core/sock.c
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk-&gt;sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&amp;wq-&gt;wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
// file: include/net/sock.h
static inline bool wq_has_sleeper(struct socket_wq *wq)
{
/* We need to be sure we are in sync with the
* add_wait_queue modifications to the wait queue.
*
* This memory barrier is paired in the sock_poll_wait.
*/
smp_mb();
return wq &amp;&amp; waitqueue_active(&amp;wq-&gt;wait);
}
// file: include/linux/wait.h
static inline int waitqueue_active(wait_queue_head_t *q)
{
return !list_empty(&amp;q-&gt;task_list);
}</code></pre>
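<p>The sleeper that <code>wq_has_sleeper</code> looks for is put on the queue by the reading process itself: when <code>tcp_recvmsg</code> runs out of data it sleeps in <code>sk_wait_data</code>, which hangs a wait entry on <code>sk_sleep(sk)</code>, i.e. <code>&amp;sk-&gt;sk_wq-&gt;wait</code>. A sketch, assuming the code in <code>net/core/sock.c</code> of this kernel generation:</p>
<pre><code class="language-c">// file: net/core/sock.c (abridged)
int sk_wait_data(struct sock *sk, long *timeo)
{
    int rc;
    DEFINE_WAIT(wait);   // wait.private = current, wait.func = autoremove_wake_function

    prepare_to_wait(sk_sleep(sk), &amp;wait, TASK_INTERRUPTIBLE);   // add the entry to sk-&gt;sk_wq-&gt;wait
    set_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
    // sk_wait_event releases the socket lock, sleeps until the condition holds
    // (or the timeout expires / a signal arrives), then takes the lock again.
    rc = sk_wait_event(sk, timeo, !skb_queue_empty(&amp;sk-&gt;sk_receive_queue));
    clear_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
    finish_wait(sk_sleep(sk), &amp;wait);
    return rc;
}</code></pre>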
<p>The wakeup:</p>
<pre><code class="language-c">// file: include/linux/wait.h
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) // note the nr_exclusive argument of 1 here
// file: kernel/sched/core.c
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
* be migrated to another CPU - ie. the two threads are 'synchronized'
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
int wake_flags = WF_SYNC;
if (unlikely(!q))
return;
if (unlikely(!nr_exclusive))
wake_flags = 0;
spin_lock_irqsave(&amp;q-&gt;lock, flags);
__wake_up_common(q, mode, nr_exclusive, wake_flags, key); // nr_exclusive is how many exclusive waiters to wake; 0 means no limit. The caller above passed 1.
spin_unlock_irqrestore(&amp;q-&gt;lock, flags);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &amp;q-&gt;task_list, task_list) {
unsigned flags = curr-&gt;flags;
// Note: this only picks up each wait entry and runs its callback; the entry is not removed from task_list here.
if (curr-&gt;func(curr, mode, wake_flags, key) &amp;&amp;
(flags &amp; WQ_FLAG_EXCLUSIVE) &amp;&amp; !--nr_exclusive) // call the entry's func: autoremove_wake_function for a plain blocking socket, ep_poll_callback for eventpoll
break;
}
}
// file: include/linux/wait.h
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
void *private;
wait_queue_func_t func; // in essence it all comes down to this callback
struct list_head task_list;
};</code></pre>
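<p>Where does <code>func</code> come from? The <code>DEFINE_WAIT</code> used by <code>sk_wait_data</code> above fills it in with <code>autoremove_wake_function</code> (epoll instead installs <code>ep_poll_callback</code> on the same queue). As defined in <code>include/linux/wait.h</code> of this kernel generation:</p>
<pre><code class="language-c">// file: include/linux/wait.h
#define DEFINE_WAIT_FUNC(name, function)                        \
    wait_queue_t name = {                                       \
        .private   = current,                                   \
        .func      = function,                                  \
        .task_list = LIST_HEAD_INIT((name).task_list),          \
    }

#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)</code></pre>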
<p>For a plain blocking socket the callback is <code>autoremove_wake_function</code>:</p>
<pre><code class="language-c">// file: kernel/wait.c
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
if (ret)
list_del_init(&amp;wait-&gt;task_list); // the wakeup succeeded, so remove the entry from task_list
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr-&gt;private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
</code></pre>
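<p>A final note on <code>nr_exclusive</code>: a waiter only counts against that limit if its entry carries <code>WQ_FLAG_EXCLUSIVE</code>. <code>prepare_to_wait</code>, used by <code>sk_wait_data</code> above, leaves the flag clear, so ordinary blocked readers are all woken; a blocking <code>accept()</code>, by contrast, sleeps in <code>inet_csk_wait_for_connect</code> via <code>prepare_to_wait_exclusive</code>, so with <code>nr_exclusive == 1</code> normally only one of several accepting processes is woken per new connection, avoiding a thundering herd. A sketch, abridged from <code>kernel/wait.c</code>:</p>
<pre><code class="language-c">// file: kernel/wait.c (abridged)
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
    unsigned long flags;

    wait-&gt;flags |= WQ_FLAG_EXCLUSIVE;   // this entry consumes one exclusive-wakeup slot
    spin_lock_irqsave(&amp;q-&gt;lock, flags);
    if (list_empty(&amp;wait-&gt;task_list))
        __add_wait_queue_tail(q, wait); // exclusive waiters are queued at the tail
    set_current_state(state);
    spin_unlock_irqrestore(&amp;q-&gt;lock, flags);
}</code></pre>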