Transport Layer
<h2>Overview</h2>
<p>Call tree:</p>
<pre><code class="language-c">tcp_sendmsg
tcp_push
__tcp_push_pending_frames // 可能是其它的函数,最终会调用 tcp_write_xmit
tcp_write_xmit // 从 sk-&gt;sk_send_head 队头获取 skb 并开始发送,涉及 GSO 相关逻辑
tcp_transmit_skb // 克隆一份 skb 进行发送,原始的 skb 后面需要用于重传;增加 TCP 头部
ip_queue_xmit // 网络层</code></pre>
<ol>
<li>First check whether the last skb on the send queue sk->sk_write_queue still has room; if it does, append the new data to it, otherwise allocate a new skb and put it at the tail of the queue.</li>
<li>The maximum amount of data one packet may carry is size_goal; how it is derived is sketched right after this list.</li>
<li>Data is first placed in the skb's linear area; if that is not enough, it goes into the paged (frag) area.</li>
<li>Depending on various conditions (including memory-related checks, still to be analyzed) a transmission is triggered, which ultimately calls tcp_write_xmit.</li>
<li>Take the skb at the (unsent) head of the queue and clone it; the original skb is kept because it may be needed for retransmission.</li>
<li>Build the TCP header.</li>
<li>Call the network-layer interface to send the packet.</li>
</ol>
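<p>For reference, size_goal comes out of tcp_send_mss(), which tcp_sendmsg calls before copying any data: it returns the current MSS and fills in size_goal via tcp_xmit_size_goal() (roughly one MSS without GSO, up to many MSS worth of data when GSO is in use). A rough sketch of the helper as it looks in this kernel generation (treat it as an approximation, not a verbatim quote):</p>
<pre><code class="language-c">// file: net/ipv4/tcp.c (sketch)
static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
    int mss_now;

    /* Current MSS, taking PMTU and TCP options into account. */
    mss_now = tcp_current_mss(sk);
    /* size_goal: how many bytes one skb on the write queue may carry.
     * Without GSO this is simply mss_now; with GSO it can be several MSS.
     */
    *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags &amp; MSG_OOB));

    return mss_now;
}</code></pre>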
<p>Also note that tcp_send_head(sk), i.e. sk->sk_send_head, points to the first not-yet-sent element on sk->sk_write_queue, which is not necessarily the first element of that queue. For example, once a segment has been sent but not yet acknowledged by the peer, it is still on sk_write_queue, while sk_send_head has already advanced to the next segment. Segments that were sent but not yet acknowledged are retransmitted by the timers as needed; tcp_write_xmit is not responsible for sending them.</p>
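<p>The queue/head relationship is easy to see from the helpers themselves. A minimal sketch of how they look in include/net/tcp.h of this kernel generation (quoted from memory, so details may differ slightly):</p>
<pre><code class="language-c">// file: include/net/tcp.h (sketch)
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
    return sk-&gt;sk_send_head;
}

/* Called once a segment has been handed to the network layer:
 * advance sk_send_head to the next queued skb, or NULL if none is left.
 * The skb itself stays on sk_write_queue until it is ACKed.
 */
static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
{
    if (skb_queue_is_last(&amp;sk-&gt;sk_write_queue, skb))
        sk-&gt;sk_send_head = NULL;
    else
        sk-&gt;sk_send_head = tcp_write_queue_next(sk, skb);
}</code></pre>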
<h2>Analysis</h2>
<p>For TCP, socket->ops = inet_stream_ops, so the sendmsg hook here is inet_sendmsg. For the detailed analysis see: <a href="https://www.showdoc.com.cn/zother/10666262664742487">https://www.showdoc.com.cn/zother/10666262664742487</a></p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock-&gt;sk;
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
if (!inet_sk(sk)-&gt;inet_num &amp;&amp; !sk-&gt;sk_prot-&gt;no_autobind &amp;&amp;
inet_autobind(sk))
return -EAGAIN;
return sk-&gt;sk_prot-&gt;sendmsg(iocb, sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);</code></pre>
<p>For TCP, sk->sk_prot = tcp_prot, so sendmsg here ends up being tcp_sendmsg. Where that assignment is made is covered by the same link above.</p>
<blockquote><p>My own understanding: at least for TCP, the entries in socket->ops are all inet_xxx functions, while the lower-level sk->sk_prot entries are all tcp_xxx functions, i.e. the concrete implementation. So there is a clear layering: the socket sits at the inet layer, which then calls down into the actual TCP or UDP layer.</p></blockquote>
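<p>As a reference for that layering, the two sides are paired up in the inetsw_array table in net/ipv4/af_inet.c. The abridged excerpt below is quoted from memory for this kernel generation, so treat the exact field values as approximate:</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c (abridged)
/* The TCP entry pairs the two layers: when inet_create() runs for a
 * SOCK_STREAM/IPPROTO_TCP socket, socket-&gt;ops gets &amp;inet_stream_ops
 * and sk-&gt;sk_prot gets &amp;tcp_prot.
 */
static struct inet_protosw inetsw_array[] =
{
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &amp;tcp_prot,           // sk-&gt;sk_prot: .sendmsg = tcp_sendmsg
        .ops =        &amp;inet_stream_ops,    // socket-&gt;ops: .sendmsg = inet_sendmsg
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
    },
    /* ... UDP and raw entries ... */
};</code></pre>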
<h2>Transport Layer</h2>
<pre><code class="language-c">// file: net/ipv4/tcp.c
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
bool sg;
long timeo;
lock_sock(sk);
flags = msg-&gt;msg_flags;
if (flags &amp; MSG_FASTOPEN) {
err = tcp_sendmsg_fastopen(sk, msg, &amp;copied_syn, size);
if (err == -EINPROGRESS &amp;&amp; copied_syn &gt; 0)
goto out;
else if (err)
goto out_err;
offset = copied_syn;
}
timeo = sock_sndtimeo(sk, flags &amp; MSG_DONTWAIT);
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
*/
if (((1 &lt;&lt; sk-&gt;sk_state) &amp; ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &amp;&amp;
!tcp_passive_fastopen(sk)) {
if ((err = sk_stream_wait_connect(sk, &amp;timeo)) != 0)
goto do_error;
}
if (unlikely(tp-&gt;repair)) {
if (tp-&gt;repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
goto out_nopush;
}
err = -EINVAL;
if (tp-&gt;repair_queue == TCP_NO_QUEUE)
goto out_err;
/* 'common' sending to sendq */
}
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &amp;sk-&gt;sk_socket-&gt;flags);
mss_now = tcp_send_mss(sk, &amp;size_goal, flags);
/* Ok commence sending. */
iovlen = msg-&gt;msg_iovlen; // number of iovec blocks
iov = msg-&gt;msg_iov; // pointer to the iovec array
copied = 0;
err = -EPIPE;
if (sk-&gt;sk_err || (sk-&gt;sk_shutdown &amp; SEND_SHUTDOWN))
goto out_err;
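/* Does the route/device support scatter-gather? If not, data cannot be
 * placed into page frags and a new segment is forced instead (see below). */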
sg = !!(sk-&gt;sk_route_caps &amp; NETIF_F_SG);
while (--iovlen &gt;= 0) {
size_t seglen = iov-&gt;iov_len;
unsigned char __user *from = iov-&gt;iov_base;
iov++; // the current block's base/length have been read above, advance to the next block
if (unlikely(offset &gt; 0)) { /* Skip bytes copied in SYN */
if (offset &gt;= seglen) {
offset -= seglen;
continue;
}
seglen -= offset;
from += offset;
offset = 0;
}
while (seglen &gt; 0) { // bytes remaining in this iovec block
int copy = 0;
int max = size_goal;
skb = tcp_write_queue_tail(sk); // last skb on the send queue sk-&gt;sk_write_queue; it may already have been sent but not yet ACKed
if (tcp_send_head(sk)) { // sk-&gt;sk_send_head is the first unsent element, not necessarily the first element on sk-&gt;sk_write_queue
if (skb-&gt;ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb-&gt;len; // how many bytes can still be appended to this skb
}
// cannot append here, so a new skb must be allocated
if (copy &lt;= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk)) // if the queued data already exceeds the send-buffer limit, wait; defined as sk-&gt;sk_wmem_queued &lt; sk-&gt;sk_sndbuf
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk-&gt;sk_allocation); // allocate a new skb
if (!skb)
goto wait_for_memory;
/*
* All packets are restored as if they have
* already been sent.
*/
if (tp-&gt;repair)
TCP_SKB_CB(skb)-&gt;when = tcp_time_stamp;
/*
* Check whether we can use HW checksum.
*/
if (sk-&gt;sk_route_caps &amp; NETIF_F_ALL_CSUM)
skb-&gt;ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb); // append the skb to the tail of the send queue; note that no data has been copied into it yet
copy = size_goal;
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy &gt; seglen)
copy = seglen;
/* Where to copy to? */
if (skb_availroom(skb) &gt; 0) { // room left in the linear data area?
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else { // paged (frag) area
bool merge = true;
int i = skb_shinfo(skb)-&gt;nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
goto wait_for_memory;
if (!skb_can_coalesce(skb, i, pfrag-&gt;page,
pfrag-&gt;offset)) {
if (i == MAX_SKB_FRAGS || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
merge = false;
}
copy = min_t(int, copy, pfrag-&gt;size - pfrag-&gt;offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
err = skb_copy_to_page_nocache(sk, from, skb,
pfrag-&gt;page,
pfrag-&gt;offset,
copy);
if (err)
goto do_error;
/* Update the skb. */
if (merge) {
skb_frag_size_add(&amp;skb_shinfo(skb)-&gt;frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, pfrag-&gt;page,
pfrag-&gt;offset, copy);
get_page(pfrag-&gt;page);
}
pfrag-&gt;offset += copy;
}
if (!copied) // nothing copied yet in this call, i.e. the first chunk
TCP_SKB_CB(skb)-&gt;tcp_flags &amp;= ~TCPHDR_PSH;
tp-&gt;write_seq += copy;
TCP_SKB_CB(skb)-&gt;end_seq += copy;
skb_shinfo(skb)-&gt;gso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 &amp;&amp; iovlen == 0) // all of the data has been copied
goto out;
if (skb-&gt;len &lt; max || (flags &amp; MSG_OOB) || unlikely(tp-&gt;repair)) // the skb can still take more data, so keep copying; same check as the one above
continue;
if (forced_push(tp)) { // has the unsent data exceeded half of the largest window seen so far?
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); // push out as much of the send queue as possible
} else if (skb == tcp_send_head(sk)) // this skb is the send head itself, i.e. it is the only unsent skb
tcp_push_one(sk, mss_now); // send just this one
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &amp;sk-&gt;sk_socket-&gt;flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags &amp; ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &amp;timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &amp;size_goal, flags);
}
}
out:
if (copied)
tcp_push(sk, flags, mss_now, tp-&gt;nonagle); // continued below (tcp_push)
out_nopush:
release_sock(sk);
return copied + copied_syn;
do_fault:
if (!skb-&gt;len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
EXPORT_SYMBOL(tcp_sendmsg);
static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
skb-&gt;csum = 0;
tcb-&gt;seq = tcb-&gt;end_seq = tp-&gt;write_seq;
tcb-&gt;tcp_flags = TCPHDR_ACK;
tcb-&gt;sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb); // inside, if sk-&gt;sk_send_head is NULL it is set to this skb
sk-&gt;sk_wmem_queued += skb-&gt;truesize;
sk_mem_charge(sk, skb-&gt;truesize);
if (tp-&gt;nonagle &amp; TCP_NAGLE_PUSH)
tp-&gt;nonagle &amp;= ~TCP_NAGLE_PUSH;
}
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
int nonagle)
{
if (tcp_send_head(sk)) { // sk-&gt;sk_send_head
struct tcp_sock *tp = tcp_sk(sk);
if (!(flags &amp; MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, tcp_write_queue_tail(sk));
tcp_mark_urg(tp, flags);
__tcp_push_pending_frames(sk, mss_now,
(flags &amp; MSG_MORE) ? TCP_NAGLE_CORK : nonagle); // continued below (__tcp_push_pending_frames)
}
}
// file: net/ipv4/tcp_output.c
/* Push out any pending frames which were held back due to
* TCP_CORK or attempt at coalescing tiny packets.
* The socket must be locked by the caller.
*/
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and
* all will be happy.
*/
if (unlikely(sk-&gt;sk_state == TCP_CLOSE))
return;
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_atomic(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}</code></pre>
<p>Whichever push function gets called, the packets are ultimately transmitted by <code>tcp_write_xmit</code>, which is the core transmit routine. Note that it takes packets starting from sk->sk_send_head, so it is not responsible for retransmissions:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_output.c
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
*
* LARGESEND note: !tcp_urg_mode is overkill, only frames between
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
* Send at most one packet when push_one &gt; 0. Temporarily ignore
* cwnd limit to force at most one packet out when push_one == 2.
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
sent_pkts = 0;
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
return false;
} else if (result &gt; 0) {
sent_pkts = 1;
}
}
while ((skb = tcp_send_head(sk))) { // sk-&gt;sk_send_head, i.e. the first skb that has never been sent
unsigned int limit;
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp-&gt;repair) &amp;&amp; tp-&gt;repair_queue == TCP_SEND_QUEUE)
goto repair; /* Skip network transmission */
cwnd_quota = tcp_cwnd_test(tp, skb); // congestion-window check
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
else
break;
}
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) // send-window (sliding window) check
break;
if (tso_segs == 1 || !sk-&gt;sk_gso_max_segs) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
if (!push_one &amp;&amp; tcp_tso_should_defer(sk, skb))
break;
}
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* This allows for :
* - better RTT estimation and ACK scheduling
* - faster recovery
* - high rates
* Alas, some drivers / subsystems require a fair amount
* of queued bytes to ensure line rate.
* One example is wifi aggregation (802.11 AMPDU)
*/
limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
sk-&gt;sk_pacing_rate &gt;&gt; 10);
if (atomic_read(&amp;sk-&gt;sk_wmem_alloc) &gt; limit) {
set_bit(TSQ_THROTTLED, &amp;tp-&gt;tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
* We abuse smp_mb__after_clear_bit() because
* there is no smp_mb__after_set_bit() yet
*/
smp_mb__after_clear_bit();
if (atomic_read(&amp;sk-&gt;sk_wmem_alloc) &gt; limit)
break;
}
limit = mss_now;
if (tso_segs &gt; 1 &amp;&amp; sk-&gt;sk_gso_max_segs &amp;&amp; !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
sk-&gt;sk_gso_max_segs)); // GSO-related: upper bound on how much data may go out in this skb
if (skb-&gt;len &gt; limit &amp;&amp;
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) // split the skb according to limit
break;
TCP_SKB_CB(skb)-&gt;when = tcp_time_stamp;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) // start the actual transmission; note that if it fails, the skb is not put on the retransmission queue
break;
repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb); // advances sk-&gt;sk_send_head and tp-&gt;snd_nxt (sent but unacked); also accounts the packet for retransmission and may start the retransmit timer. TODO: to be analyzed
tcp_minshall_update(tp, mss_now, skb);
sent_pkts += tcp_skb_pcount(skb);
if (push_one)
break;
}
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp-&gt;prr_out += sent_pkts;
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
tcp_cwnd_validate(sk);
return false;
}
return (push_one == 2) || (!tp-&gt;packets_out &amp;&amp; tcp_send_head(sk));
}
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
* All SKB's seen here are completely headerless. It is our
* job to build the TCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
/* If congestion control is doing timestamping, we must
* take such a timestamp before we potentially clone/copy.
*/
if (icsk-&gt;icsk_ca_ops-&gt;flags &amp; TCP_CONG_RTT_STAMP)
__net_timestamp(skb);
if (likely(clone_it)) { // the caller above passes 1, i.e. clone. Reason: the device transmit path frees the skb, but the original must be kept until its ACK arrives, so a clone is sent and the original retained
const struct sk_buff *fclone = skb + 1;
if (unlikely(skb-&gt;fclone == SKB_FCLONE_ORIG &amp;&amp;
fclone-&gt;fclone == SKB_FCLONE_CLONE))
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
inet = inet_sk(sk);
tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&amp;opts, 0, sizeof(opts));
if (unlikely(tcb-&gt;tcp_flags &amp; TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &amp;opts, &amp;md5);
else
tcp_options_size = tcp_established_options(sk, skb, &amp;opts,
&amp;md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue.
*/
skb-&gt;ooo_okay = sk_wmem_alloc_get(sk) == 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb-&gt;sk = sk;
skb-&gt;destructor = tcp_wfree;
atomic_add(skb-&gt;truesize, &amp;sk-&gt;sk_wmem_alloc);
/* Build TCP header and checksum it. */
th = tcp_hdr(skb); // build the TCP header
th-&gt;source = inet-&gt;inet_sport;
th-&gt;dest = inet-&gt;inet_dport;
th-&gt;seq = htonl(tcb-&gt;seq);
th-&gt;ack_seq = htonl(tp-&gt;rcv_nxt);
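/* A single 16-bit store that sets both the data offset (header length in
 * 32-bit words, upper 4 bits) and the TCP flag bits. */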
*(((__be16 *)th) + 6) = htons(((tcp_header_size &gt;&gt; 2) &lt;&lt; 12) |
tcb-&gt;tcp_flags);
if (unlikely(tcb-&gt;tcp_flags &amp; TCPHDR_SYN)) {
/* RFC1323: The window in SYN &amp; SYN/ACK segments
* is never scaled.
*/
th-&gt;window = htons(min(tp-&gt;rcv_wnd, 65535U));
} else {
th-&gt;window = htons(tcp_select_window(sk));
}
th-&gt;check = 0;
th-&gt;urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
if (unlikely(tcp_urg_mode(tp) &amp;&amp; before(tcb-&gt;seq, tp-&gt;snd_up))) {
if (before(tp-&gt;snd_up, tcb-&gt;seq + 0x10000)) {
th-&gt;urg_ptr = htons(tp-&gt;snd_up - tcb-&gt;seq);
th-&gt;urg = 1;
} else if (after(tcb-&gt;seq + 0xFFFF, tp-&gt;snd_nxt)) {
th-&gt;urg_ptr = htons(0xFFFF);
th-&gt;urg = 1;
}
}
tcp_options_write((__be32 *)(th + 1), tp, &amp;opts);
if (likely((tcb-&gt;tcp_flags &amp; TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp-&gt;af_specific-&gt;calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
icsk-&gt;icsk_af_ops-&gt;send_check(sk, skb);
if (likely(tcb-&gt;tcp_flags &amp; TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); // TODO: there seems to be more going on in here, worth analyzing
if (skb-&gt;len != tcp_header_size)
tcp_event_data_sent(tp, sk); // ?
if (after(tcb-&gt;end_seq, tp-&gt;snd_nxt) || tcb-&gt;seq == tcb-&gt;end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
err = icsk-&gt;icsk_af_ops-&gt;queue_xmit(skb, &amp;inet-&gt;cork.fl); // network-layer transmit interface
if (likely(err &lt;= 0))
return err;
tcp_enter_cwr(sk, 1);
return net_xmit_eval(err);
}</code></pre>
<p>The packet is handed down through the network-layer hook <code>queue_xmit</code>. So what does this function pointer actually point to?</p>
<pre><code class="language-c">// file: net/ipv4/tcp_v4.c
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit, // this is the function in question
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
.bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_init_sock(sk);
icsk-&gt;icsk_af_ops = &amp;ipv4_specific; // assigned here
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)-&gt;af_specific = &amp;tcp_sock_ipv4_specific;
#endif
return 0;
}
struct proto tcp_prot = {
.name = &quot;TCP&quot;,
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock, // tcp_v4_init_sock is called through this hook
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &amp;tcp_sockets_allocated,
.orphan_count = &amp;tcp_orphan_count,
.memory_allocated = &amp;tcp_memory_allocated,
.memory_pressure = &amp;tcp_memory_pressure,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &amp;tcp_timewait_sock_ops,
.rsk_prot = &amp;tcp_request_sock_ops,
.h.hashinfo = &amp;tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
</code></pre>
<p>So icsk_af_ops is set up in <code>tcp_prot.init</code>, which is invoked from <code>inet_create()</code> when the socket is created; for the details see: <a href="https://www.showdoc.com.cn/zother/10666262664742487">https://www.showdoc.com.cn/zother/10666262664742487</a></p>
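<p>For reference, the relevant part of <code>inet_create()</code> looks roughly like the excerpt below (abridged from net/ipv4/af_inet.c; the protocol lookup and most of the error handling are omitted, so treat it as a sketch):</p>
<pre><code class="language-c">// file: net/ipv4/af_inet.c (abridged sketch)
static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
    /* ... look up the matching inetsw entry for (SOCK_STREAM, IPPROTO_TCP),
     * giving answer-&gt;ops = &amp;inet_stream_ops and answer-&gt;prot = &amp;tcp_prot,
     * then allocate the struct sock with sk-&gt;sk_prot = answer-&gt;prot ... */

    if (sk-&gt;sk_prot-&gt;init) {
        err = sk-&gt;sk_prot-&gt;init(sk);   // for TCP this is tcp_v4_init_sock()
        if (err)
            sk_common_release(sk);
    }
    /* ... */
    return err;
}</code></pre>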