recv
<h2>概述</h2>
<p>应用调用 <code>recv</code> 接口时,实际会执行 <code>recvfrom</code> 系统调用。它主要是从 <code>sk-&gt;sk_receive_queue</code> 中获取 skb 的数据。如果队列中没有 skb 且套接字处于阻塞模式,进程会休眠并让出 CPU,从而引发进程上下文切换。</p>
<h2>流程分析</h2>
<p>看 <code>recvfrom</code> 系统调用代码:</p>
<pre><code class="language-c">// file: net/socket.c
/*
* Receive a frame from the socket and optionally record the address of the
* sender. We verify the buffers are writable and if needed move the
* sender address from kernel to user space.
*/
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
unsigned int, flags, struct sockaddr __user *, addr,
int __user *, addr_len)
{
struct socket *sock;
struct iovec iov;
struct msghdr msg;
struct sockaddr_storage address;
int err, err2;
int fput_needed;
/* Clamp the requested length to INT_MAX. */
if (size &gt; INT_MAX)
size = INT_MAX;
sock = sockfd_lookup_light(fd, &amp;err, &amp;fput_needed); // look up the socket for this fd
/* On lookup failure return err as-is (presumably filled in by sockfd_lookup_light). */
if (!sock)
goto out;
/* Describe the user buffer as a single-element iovec with no control data. */
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_iovlen = 1;
msg.msg_iov = &amp;iov;
iov.iov_len = size;
iov.iov_base = ubuf;
/* Save some cycles and don't copy the address if not needed */
msg.msg_name = addr ? (struct sockaddr *)&amp;address : NULL;
/* We assume all kernel code knows the size of sockaddr_storage */
msg.msg_namelen = 0;
/* A file opened with O_NONBLOCK is treated as if MSG_DONTWAIT had been passed. */
if (sock-&gt;file-&gt;f_flags &amp; O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &amp;msg, size, flags); // continues in sock_recvmsg()
/*
 * On success, and only when the caller supplied addr, copy the sender
 * address out to user space; a failure of that copy replaces the return value.
 */
if (err &gt;= 0 &amp;&amp; addr != NULL) {
err2 = move_addr_to_user(&amp;address,
msg.msg_namelen, addr, addr_len);
if (err2 &lt; 0)
err = err2;
}
/* Paired with sockfd_lookup_light() above through fput_needed. */
fput_light(sock-&gt;file, fput_needed);
out:
return err;
}
/*
 * sock_recvmsg - receive a message using a locally declared kiocb.
 *
 * Sets up a local kiocb / sock_iocb pair, hands the request to
 * __sock_recvmsg(), and if that returns -EIOCBQUEUED returns the result of
 * wait_on_sync_kiocb() instead.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
struct kiocb iocb;
struct sock_iocb siocb;
int ret;
init_sync_kiocb(&amp;iocb, NULL);
/* Attach the socket-specific iocb so lower layers can reach it through the kiocb. */
iocb.private = &amp;siocb;
ret = __sock_recvmsg(&amp;iocb, sock, msg, size, flags); // continues in __sock_recvmsg()
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&amp;iocb);
return ret;
}
EXPORT_SYMBOL(sock_recvmsg);
/*
 * Run the security_socket_recvmsg() check first. A non-zero result is returned
 * unchanged; otherwise the call proceeds to __sock_recvmsg_nosec().
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
int err = security_socket_recvmsg(sock, msg, size, flags);
/* "a ?: b" is the GNU shorthand for "a ? a : b". */
return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); // continues in __sock_recvmsg_nosec()
}
/*
 * Record the call parameters in the sock_iocb obtained from the kiocb, then
 * dispatch to the recvmsg handler in the socket's ops table.
 */
static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
struct sock_iocb *si = kiocb_to_siocb(iocb);
si-&gt;sock = sock;
si-&gt;scm = NULL;
si-&gt;msg = msg;
si-&gt;size = size;
si-&gt;flags = flags;
return sock-&gt;ops-&gt;recvmsg(iocb, sock, msg, size, flags); // for AF_INET this is inet_recvmsg
}
</code></pre>
<p>对于 AF_INET,<code>sock-&gt;ops-&gt;recvmsg</code> 指向 <code>inet_recvmsg</code>,继续往下看:</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c
/*
 * inet_recvmsg - forward a receive request to the protocol's recvmsg handler.
 *
 * The flags are split in two for the protocol handler: the MSG_DONTWAIT bit is
 * passed as its own argument and the remaining bits as the flags argument.
 * On success the address length reported by the protocol is stored in
 * msg-&gt;msg_namelen. Returns the protocol handler's result.
 */
int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
struct sock *sk = sock-&gt;sk;
int addr_len = 0;
int err;
sock_rps_record_flow(sk);
err = sk-&gt;sk_prot-&gt;recvmsg(iocb, sk, msg, size, flags &amp; MSG_DONTWAIT,
flags &amp; ~MSG_DONTWAIT, &amp;addr_len); // for TCP this is tcp_recvmsg
if (err &gt;= 0)
msg-&gt;msg_namelen = addr_len;
return err;
}
EXPORT_SYMBOL(inet_recvmsg);
</code></pre>
<p>继续看 <code>tcp_recvmsg</code>:</p>
<pre><code class="language-c">// file: net/ipv4/tcp.c
/*
* This routine copies from a sock struct into the user buffer.
*
* Technical note: in 2.3 we work on _locked_ socket, so that
* tricks with *seq access order and skb-&gt;users are not required.
* Probably, code can be easily improved even more.
*/
/* NOTE: abridged excerpt; every "// ..." marks code omitted from the original function. */
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags, int *addr_len)
{
// ...
do {
/* Next get a buffer. */
skb_queue_walk(&amp;sk-&gt;sk_receive_queue, skb) { // walk the receive queue of sk
// ...
}
// ...
if (copied &gt;= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
} else
sk_wait_data(sk, &amp;timeo); // not enough data copied yet: wait in sk_wait_data(), which may block the current process
// ...
} while (len &gt; 0);
// ...
}
// file: net/core/sock.c
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
* @timeo: for how long
*
* Now socket state including sk-&gt;sk_err is changed only under lock,
* hence we may omit checks after joining wait queue.
* We check receive queue before schedule() only as optimization;
* it is very likely that release_sock() added new data.
*
* Returns the value produced by sk_wait_event().
*/
int sk_wait_data(struct sock *sk, long *timeo)
{
int rc;
DEFINE_WAIT(wait); // declares a wait_queue_t named wait, i.e. a wait queue entry whose private field is the current task
prepare_to_wait(sk_sleep(sk), &amp;wait, TASK_INTERRUPTIBLE); // sk_sleep() presumably returns the socket's wait queue head (sk-&gt;sk_wq-&gt;wait; defined elsewhere); this queues the wait entry on it with state TASK_INTERRUPTIBLE
set_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
rc = sk_wait_event(sk, timeo, !skb_queue_empty(&amp;sk-&gt;sk_receive_queue)); // the wait condition is a non-empty receive queue; if it is empty, sk_wait_event (defined elsewhere) is expected to call schedule_timeout and give up the CPU
clear_bit(SOCK_ASYNC_WAITDATA, &amp;sk-&gt;sk_socket-&gt;flags);
finish_wait(sk_sleep(sk), &amp;wait);
return rc;
}
EXPORT_SYMBOL(sk_wait_data);
// file: include/linux/wait.h
/*
 * DEFINE_WAIT_FUNC - declare a wait_queue_t called name.
 * Its private field is set to current, its func field to the given function,
 * and its task_list is initialised as a list head referring to itself.
 */
#define DEFINE_WAIT_FUNC(name, function) \
wait_queue_t name = { \
.private = current, \
.func = function, \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) // declares a wait_queue_t variable with the task and func fields filled in, using autoremove_wake_function as func
</code></pre>