# connect

## Overview

1. When `connect` is executed, the kernel sets the socket state to TCP_SYN_SENT, selects an available local port, sends a SYN packet, and starts the retransmission timer. For a blocking socket, `connect` then waits for the sk state to change.

2. On the way back, when the kernel receives the SYN+ACK packet it is handled in `tcp_v4_rcv`. Specifically, the retransmission timer set during `connect` is cleared, the socket state is set to ESTABLISHED, the keepalive timer is started (if keepalive is enabled), and an ACK packet is sent out.

## Analyzing connect

Application-layer usage:

```c
int main()
{
    fd = socket(AF_INET, SOCK_STREAM, 0);
    connect(fd, ...);
    // ...
}
```
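For reference, a complete blocking client might look like the sketch below. This is not from the original text; the address 127.0.0.1:8080 and the error handling are illustrative only.

```c
// Hedged sketch: a minimal blocking connect; 127.0.0.1:8080 is an example address.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port   = htons(8080),
    };

    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    /* Blocks until the three-way handshake completes or fails. */
    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }
    printf("connected\n");
    close(fd);
    return 0;
}
```

The kernel side of this call starts at the `connect` system call shown next.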
```c
// file: net/socket.c
/*
 *	Attempt to connect to a socket with the server address.  The address
 *	is in user space so we verify it is OK and move it to kernel space.
 *
 *	For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
 *	break bindings
 *
 *	NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
 *	other SEQPACKET protocols that take time to connect() as it doesn't
 *	include the -EINPROGRESS status for such sockets.
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);    // look up the socket object from the fd
    if (!sock)
        goto out;
    err = move_addr_to_kernel(uservaddr, addrlen, &address);
    if (err < 0)
        goto out_put;

    err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
    if (err)
        goto out_put;

    err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
                             sock->file->f_flags);         // i.e. inet_stream_connect
out_put:
    fput_light(sock->file, fput_needed);
out:
    return err;
}

// file: net/ipv4/af_inet.c
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
    int err;

    lock_sock(sock->sk);
    err = __inet_stream_connect(sock, uaddr, addr_len, flags);
    release_sock(sock->sk);
    return err;
}
EXPORT_SYMBOL(inet_stream_connect);

/*
 *	Connect to a remote host. There is regrettably still a little
 *	TCP 'magic' in here.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			  int addr_len, int flags)
{
    struct sock *sk = sock->sk;
    int err;
    long timeo;

    if (addr_len < sizeof(uaddr->sa_family))
        return -EINVAL;

    if (uaddr->sa_family == AF_UNSPEC) {
        err = sk->sk_prot->disconnect(sk, flags);
        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
        goto out;
    }

    switch (sock->state) {
    default:
        err = -EINVAL;
        goto out;
    case SS_CONNECTED:
        err = -EISCONN;
        goto out;
    case SS_CONNECTING:
        err = -EALREADY;
        /* Fall out of switch with err, set for this state */
        break;
    case SS_UNCONNECTED:                    // a freshly created socket is in SS_UNCONNECTED
        err = -EISCONN;
        if (sk->sk_state != TCP_CLOSE)
            goto out;

        err = sk->sk_prot->connect(sk, uaddr, addr_len);    // for an AF_INET TCP socket this is tcp_v4_connect
        if (err < 0)
            goto out;

        sock->state = SS_CONNECTING;

        /* Just entered SS_CONNECTING state; the only
         * difference is that return value in non-blocking
         * case is EINPROGRESS, rather than EALREADY.
         */
        err = -EINPROGRESS;
        break;
    }

    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);          // blocking sockets get a wait timeout here; non-blocking sockets get 0

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                        tcp_sk(sk)->fastopen_req &&
                        tcp_sk(sk)->fastopen_req->data ? 1 : 0;

        /* Error code is set above */
        if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))    // wait for the peer's SYN+ACK and for the sk state to be updated
            goto out;

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
    }

    /* Connection was closed by RST, timeout, ICMP error
     * or another process disconnected us.
     */
    if (sk->sk_state == TCP_CLOSE)
        goto sock_error;

    /* sk->sk_err may be not zero now, if RECVERR was ordered by user
     * and error was received after socket entered established state.
     * Hence, it is handled normally after connect() return successfully.
     */

    sock->state = SS_CONNECTED;             // success
    err = 0;
out:
    return err;

sock_error:
    err = sock_error(sk) ? : -ECONNABORTED;
    sock->state = SS_UNCONNECTED;
    if (sk->sk_prot->disconnect(sk, flags))
        sock->state = SS_DISCONNECTING;
    goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);
```
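As the code above shows, for a non-blocking socket `__inet_stream_connect` returns -EINPROGRESS instead of sleeping in `inet_wait_for_connect`, so completion has to be detected from user space. A hedged sketch of that pattern (the address 127.0.0.1:8080 and the 3-second timeout are examples, not from the original text):

```c
// Hedged sketch: non-blocking connect relying on the -EINPROGRESS path, then
// waiting for writability with poll() and reading SO_ERROR for the result.
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

    struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(8080) };
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 &&
        errno != EINPROGRESS) {             /* immediate failure */
        perror("connect");
        return 1;
    }

    struct pollfd pfd = { .fd = fd, .events = POLLOUT };
    if (poll(&pfd, 1, 3000) == 1) {         /* handshake finished, one way or another */
        int err = 0;
        socklen_t len = sizeof(err);
        getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
        printf("connect result: %s\n", err ? strerror(err) : "established");
    } else {
        printf("connect still pending\n");  /* SYN retransmissions still running */
    }
    close(fd);
    return 0;
}
```

Back in the kernel, `sk->sk_prot->connect` for a TCP/IPv4 socket is `tcp_v4_connect`: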
```c
// file: net/ipv4/tcp_ipv4.c
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __be16 orig_sport, orig_dport;
    __be32 daddr, nexthop;
    struct flowi4 *fl4;
    struct rtable *rt;
    int err;
    struct ip_options_rcu *inet_opt;

    if (addr_len < sizeof(struct sockaddr_in))
        return -EINVAL;

    if (usin->sin_family != AF_INET)
        return -EAFNOSUPPORT;

    nexthop = daddr = usin->sin_addr.s_addr;    // destination IP
    inet_opt = rcu_dereference_protected(inet->inet_opt,
                                         sock_owned_by_user(sk));
    if (inet_opt && inet_opt->opt.srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet_opt->opt.faddr;
    }

    orig_sport = inet->inet_sport;
    orig_dport = usin->sin_port;
    fl4 = &inet->cork.fl.u.ip4;
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                          RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                          IPPROTO_TCP,
                          orig_sport, orig_dport, sk, true);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        if (err == -ENETUNREACH)
            IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return err;
    }

    if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
        ip_rt_put(rt);
        return -ENETUNREACH;
    }

    if (!inet_opt || !inet_opt->opt.srr)
        daddr = fl4->daddr;

    if (!inet->inet_saddr)                      // source IP is still ANY
        inet->inet_saddr = fl4->saddr;
    inet->inet_rcv_saddr = inet->inet_saddr;    // record the source IP

    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent       = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        if (likely(!tp->repair))
            tp->write_seq          = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
        tcp_fetch_timewait_stamp(sk, &rt->dst);

    inet->inet_dport = usin->sin_port;          // destination port
    inet->inet_daddr = daddr;                   // destination IP

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

    /* Socket identity is still unknown (sport may be zero).
     * However we set state to SYN-SENT and not releasing socket
     * lock select source port, enter ourselves into the hash tables and
     * complete initialization after this.
     */
    tcp_set_state(sk, TCP_SYN_SENT);            // update the state
    err = inet_hash_connect(&tcp_death_row, sk);    // dynamically pick a source port
    if (err)
        goto failure;

    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                           inet->inet_sport, inet->inet_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        rt = NULL;
        goto failure;
    }
    /* OK, now commit destination to socket.  */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->dst);                // route/device offload capabilities

    if (!tp->write_seq && likely(!tp->repair))
        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                   inet->inet_daddr,
                                                   inet->inet_sport,
                                                   usin->sin_port);

    inet->inet_id = tp->write_seq ^ jiffies;

    err = tcp_connect(sk);                      // build the SYN packet and send it out

    rt = NULL;
    if (err)
        goto failure;

    return 0;

failure:
    /*
     * This unhashes the socket and releases the local port,
     * if necessary.
     */
    tcp_set_state(sk, TCP_CLOSE);
    ip_rt_put(rt);
    sk->sk_route_caps = 0;
    inet->inet_dport = 0;
    return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
```
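In `tcp_v4_connect` the source IP is only filled in when `inet_saddr` is still unset, so an application that needs a specific local address can pin it by calling `bind()` before `connect()`. A minimal hedged sketch (the helper name `connect_from` and leaving the port as 0 so the kernel still chooses one are illustrative, not from the original text):

```c
// Hedged sketch: pin the source IP before connect(); tcp_v4_connect() then
// keeps the bound address instead of picking one from the route.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int connect_from(const char *src_ip, const char *dst_ip, unsigned short dst_port)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in src = { .sin_family = AF_INET, .sin_port = 0 };              /* port 0: the kernel still picks one */
    struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(dst_port) };

    inet_pton(AF_INET, src_ip, &src.sin_addr);
    inet_pton(AF_INET, dst_ip, &dst.sin_addr);

    if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0 ||   /* pins inet_saddr */
        connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}
```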
Calling `inet_hash_connect` -> `__inet_hash_connect` selects the source port. Note that when there are many connections to the same destination (same IP and port) and free ports are running short, this becomes very expensive. Let's analyze it:

```c
// file: net/ipv4/inet_hashtables.c
/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
    return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
                               // inet_sk_port_offset derives a pseudo-random offset from the destination IP and port
                               __inet_check_established, __inet_hash_nolisten);
                               // __inet_check_established checks for a conflict with an existing ESTABLISHED connection;
                               // __inet_hash_nolisten adds the sk to the ehash
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    const unsigned short snum = inet_sk(sk)->inet_num;
    struct inet_bind_hashbucket *head;
    struct inet_bind_bucket *tb;            // holds the binding information of one port
    int ret;
    struct net *net = sock_net(sk);
    int twrefcnt = 1;

    if (!snum) {                            // no source port bound yet
        int i, remaining, low, high, port;
        static u32 hint;
        u32 offset = hint + port_offset;
        struct inet_timewait_sock *tw = NULL;

        inet_get_local_port_range(&low, &high);     // local ephemeral port range, i.e. net.ipv4.ip_local_port_range
                                                    // (see /proc/sys/net/ipv4/ip_local_port_range)
        remaining = (high - low) + 1;

        local_bh_disable();
        for (i = 1; i <= remaining; i++) {
            port = low + (i + offset) % remaining;
            if (inet_is_reserved_local_port(port))  // reserved ports, i.e. net.ipv4.ip_local_reserved_ports
                continue;
            head = &hinfo->bhash[inet_bhashfn(net, port,
                    hinfo->bhash_size)];            // bhash holds every bound port; it lives in the global tcp_hashinfo
            spin_lock(&head->lock);

            /* Does not bother with rcv_saddr checks,
             * because the established check is already
             * unique enough.
             */
            inet_bind_bucket_for_each(tb, &head->chain) {   // walk this bhash bucket
                if (net_eq(ib_net(tb), net) &&
                    tb->port == port) {                     // port is in use, but it can still be reused if the 4-tuple differs
                    if (tb->fastreuse >= 0 ||
                        tb->fastreuseport >= 0)
                        goto next_port;
                    WARN_ON(hlist_empty(&tb->owners));
                    if (!check_established(death_row, sk,
                                           port, &tw))
                        // the port is taken, but the n-tuple may differ, in which case the
                        // port is still usable; this checks both normal and TIME_WAIT connections
                        goto ok;
                    goto next_port;
                }
            }

            // reaching here means the port is unused and can be assigned to this connection
            tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                    net, head, port);       // create a bhash node and link it into head, i.e. this bucket's chain
            if (!tb) {
                spin_unlock(&head->lock);
                break;
            }
            tb->fastreuse = -1;
            tb->fastreuseport = -1;
            goto ok;

        next_port:
            spin_unlock(&head->lock);
        }
        local_bh_enable();

        return -EADDRNOTAVAIL;

ok:
        hint += i;

        /* Head lock still held and bh's disabled */
        inet_bind_hash(sk, tb, port);       // add the sk to tb->owners and set inet->inet_num = port
        if (sk_unhashed(sk)) {
            inet_sk(sk)->inet_sport = htons(port);
            twrefcnt += hash(sk, tw);       // hash is __inet_hash_nolisten: it adds the sk to the ehash so the sk can be
                                            // found from the skb when the SYN+ACK arrives
        }
        if (tw)
            twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
        spin_unlock(&head->lock);

        if (tw) {
            inet_twsk_deschedule(tw, death_row);
            while (twrefcnt) {
                twrefcnt--;
                inet_twsk_put(tw);
            }
        }

        ret = 0;
        goto out;
    }

    // from here on: the socket already has a bound port
    head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
    tb  = inet_csk(sk)->icsk_bind_hash;
    spin_lock_bh(&head->lock);
    if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {     // if this sk is the only owner of the port, there is no conflict
        hash(sk, NULL);                     // same as above
        spin_unlock_bh(&head->lock);
        return 0;
    } else {
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        ret = check_established(death_row, sk, snum, NULL);
        // Otherwise we must check for a conflict with established connections. Even though this
        // process A bound the port, another process B may already have picked the same port for
        // one of its connections (when B chose a random port it only checked the established hash,
        // not tb->owners), so A's connect() could collide with B's connection and must be checked.
        // This function also inserts the sk into the ehash.
out:
        local_bh_enable();
        return ret;
    }
}
```
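In the worst case the loop above probes every port in `ip_local_port_range` under per-bucket spinlocks before giving up with -EADDRNOTAVAIL, which is why connecting repeatedly to one hot destination gets expensive once ports run low. A small hedged sketch that just reads the range and reports how large that search space is:

```c
// Hedged sketch: read net.ipv4.ip_local_port_range to see the size of the
// search space ("remaining" in __inet_hash_connect).
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");
    int low, high;

    if (!f) {
        perror("ip_local_port_range");
        return 1;
    }
    if (fscanf(f, "%d %d", &low, &high) != 2) {
        fclose(f);
        return 1;
    }
    fclose(f);

    /* remaining = (high - low) + 1: the worst-case number of loop iterations */
    printf("ephemeral ports: %d-%d (%d candidates per connect)\n",
           low, high, high - low + 1);
    return 0;
}
```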
The global variable tcp_hashinfo is structured as shown below; it contains several hash tables:

![](https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=71d3771b143bc9b1e2c34d3661e9dd38&file=file.png)

```c
// file: net/ipv4/tcp_output.c
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *buff;
    int err;

    tcp_connect_init(sk);

    if (unlikely(tp->repair)) {
        tcp_finish_connect(sk, NULL);
        return 0;
    }

    buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);   // allocate the skb
    if (unlikely(!buff))
        return -ENOBUFS;

    tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
    tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
    tcp_connect_queue_skb(sk, buff);        // queue it on sk->sk_write_queue
    TCP_ECN_send_syn(sk, buff);

    /* Send off SYN; include data in Fast Open. */
    err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
          tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
          // open question: the skb is already on sk->sk_write_queue, so why pass it here and transmit it directly?
    if (err == -ECONNREFUSED)
        return err;

    /* We change tp->snd_nxt after the tcp_transmit_skb() call
     * in order to make this packet get counted in tcpOutSegs.
     */
    tp->snd_nxt = tp->write_seq;
    tp->pushed_seq = tp->write_seq;
    TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

    /* Timer for repeating the SYN until an answer. */
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                              inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
                              // start the retransmission timer; its timeout was initialized in tcp_connect_init() above
    return 0;
}
EXPORT_SYMBOL(tcp_connect);
```
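The retransmission timer armed at the end of `tcp_connect` keeps re-sending the SYN until an answer arrives. From user space the number of retries can be capped per socket with the TCP_SYNCNT socket option; a hedged sketch (the value 2 is only an example):

```c
// Hedged sketch: cap SYN retransmissions for this socket before connect(),
// so a dead destination fails fast (2 retries is an example value).
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int make_impatient_socket(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int syn_retries = 2;    /* overrides net.ipv4.tcp_syn_retries for this socket */

    setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT, &syn_retries, sizeof(syn_retries));
    return fd;
}
```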
The SYN packet sent out is received and processed by the server; at the TCP layer the relevant entry point is `tcp_v4_rcv()`.

## Receiving the SYN+ACK in the kernel

When the SYN+ACK packet arrives, it is handled in `tcp_v4_rcv`. That function looks up the corresponding sk in tcp_hashinfo and then goes through `tcp_v4_do_rcv` -> `tcp_rcv_state_process`:

```c
// file: net/ipv4/tcp_input.c
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  const struct tcphdr *th, unsigned int len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock *req;
    int queued = 0;

    tp->rx_opt.saw_tstamp = 0;

    switch (sk->sk_state) {
    // other cases omitted ...

    case TCP_SYN_SENT:
        queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }
    // ...
}

static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th, unsigned int len)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_fastopen_cookie foc = { .len = -1 };
    int saved_clamp = tp->rx_opt.mss_clamp;

    tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
    if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
        tp->rx_opt.rcv_tsecr -= tp->tsoffset;

    if (th->ack) {
        /* rfc793:
         * "If the state is SYN-SENT then
         *    first check the ACK bit
         *      If the ACK bit is set
         *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
         *        a reset (unless the RST bit is set, if so drop
         *        the segment and return)"
         */
        if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
            after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
            goto reset_and_undo;

        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
            !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
                     tcp_time_stamp)) {
            NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
            goto reset_and_undo;
        }

        /* Now ACK is acceptable.
         *
         * "If the RST bit is set
         *    If the ACK was acceptable then signal the user "error:
         *    connection reset", drop the segment, enter CLOSED state,
         *    delete TCB, and return."
         */

        if (th->rst) {
            tcp_reset(sk);
            goto discard;
        }

        /* rfc793:
         *   "fifth, if neither of the SYN or RST bits is set then
         *    drop the segment and return."
         *
         *    See note below!
         *                                        --ANK(990513)
         */
        if (!th->syn)
            goto discard_and_undo;

        /* rfc793:
         *   "If the SYN bit is on ...
         *    are acceptable then ...
         *    (our SYN has been ACKed), change the connection
         *    state to ESTABLISHED..."
         */

        TCP_ECN_rcv_synack(tp, th);

        tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
        tcp_ack(sk, skb, FLAG_SLOWPATH);    // process the ACK

        /* Ok.. it's good. Set up sequence numbers and
         * move to established.
         */
        tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
        tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

        /* RFC1323: The window in SYN & SYN/ACK segments is
         * never scaled.
         */
        tp->snd_wnd = ntohs(th->window);

        if (!tp->rx_opt.wscale_ok) {
            tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
            tp->window_clamp = min(tp->window_clamp, 65535U);
        }

        if (tp->rx_opt.saw_tstamp) {
            tp->rx_opt.tstamp_ok       = 1;
            tp->tcp_header_len =
                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
            tp->advmss          -= TCPOLEN_TSTAMP_ALIGNED;
            tcp_store_ts_recent(tp);
        } else {
            tp->tcp_header_len = sizeof(struct tcphdr);
        }

        if (tcp_is_sack(tp) && sysctl_tcp_fack)
            tcp_enable_fack(tp);

        tcp_mtup_init(sk);
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
        tcp_initialize_rcv_mss(sk);

        /* Remember, tcp_poll() does not lock socket!
         * Change state from SYN-SENT only after copied_seq
         * is initialized. */
        tp->copied_seq = tp->rcv_nxt;

        smp_mb();

        tcp_finish_connect(sk, skb);        // continues below

        if ((tp->syn_fastopen || tp->syn_data) &&
            tcp_rcv_fastopen_synack(sk, skb, &foc))
            return -1;

        if (sk->sk_write_pending ||
            icsk->icsk_accept_queue.rskq_defer_accept ||
            icsk->icsk_ack.pingpong) {
            /* Save one ACK. Data will be ready after
             * several ticks, if write_pending is set.
             *
             * It may be deleted, but with this feature tcpdumps
             * look so _wonderfully_ clever, that I was not able
             * to stand against the temptation 8)     --ANK
             */
            inet_csk_schedule_ack(sk);
            tcp_enter_quickack_mode(sk);
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      TCP_DELACK_MAX, TCP_RTO_MAX);   // delayed ACK

discard:
            __kfree_skb(skb);
            return 0;
        } else {
            tcp_send_ack(sk);               // send the ACK immediately
        }
        return -1;
    }

    /* No ACK in the segment */

    if (th->rst) {
        /* rfc793:
         * "If the RST bit is set
         *
         *      Otherwise (no ACK) drop the segment and return."
         */

        goto discard_and_undo;
    }

    /* PAWS check. */
    if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
        tcp_paws_reject(&tp->rx_opt, 0))
        goto discard_and_undo;

    if (th->syn) {
        /* We see SYN without ACK. It is attempt of
         * simultaneous connect with crossed SYNs.
         * Particularly, it can be connect to self.
         */
        tcp_set_state(sk, TCP_SYN_RECV);

        if (tp->rx_opt.saw_tstamp) {
            tp->rx_opt.tstamp_ok = 1;
            tcp_store_ts_recent(tp);
            tp->tcp_header_len =
                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
        } else {
            tp->tcp_header_len = sizeof(struct tcphdr);
        }

        tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
        tp->copied_seq = tp->rcv_nxt;
        tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

        /* RFC1323: The window in SYN & SYN/ACK segments is
         * never scaled.
         */
        tp->snd_wnd    = ntohs(th->window);
        tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
        tp->max_window = tp->snd_wnd;

        TCP_ECN_rcv_syn(tp, th);

        tcp_mtup_init(sk);
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
        tcp_initialize_rcv_mss(sk);

        tcp_send_synack(sk);
#if 0
        /* Note, we could accept data and URG from this segment.
         * There are no obstacles to make this (except that we must
         * either change tcp_recvmsg() to prevent it from returning data
         * before 3WHS completes per RFC793, or employ TCP Fast Open).
         *
         * However, if we ignore data in ACKless segments sometimes,
         * we have no reasons to accept it sometimes.
         * Also, seems the code doing it in step6 of tcp_rcv_state_process
         * is not flawless. So, discard packet for sanity.
         * Uncomment this return to process the data.
         */
        return -1;
#else
        goto discard;
#endif
    }
    /* "fifth, if neither of the SYN or RST bits is set then
     * drop the segment and return."
     */

discard_and_undo:
    tcp_clear_options(&tp->rx_opt);
    tp->rx_opt.mss_clamp = saved_clamp;
    goto discard;

reset_and_undo:
    tcp_clear_options(&tp->rx_opt);
    tp->rx_opt.mss_clamp = saved_clamp;
    return 1;
}

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_set_state(sk, TCP_ESTABLISHED);     // update the state

    icsk->icsk_ack.lrcvtime = tcp_time_stamp;

    if (skb != NULL) {
        icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
        security_inet_conn_established(sk, skb);
    }

    /* Make sure socket is routed, for correct metrics.  */
    icsk->icsk_af_ops->rebuild_header(sk);

    tcp_init_metrics(sk);

    tcp_init_congestion_control(sk);        // initialize congestion control

    /* Prevent spurious tcp_cwnd_restart() on first data
     * packet.
     */
    tp->lsndtime = tcp_time_stamp;

    tcp_init_buffer_space(sk);              // worth a closer look

    if (sock_flag(sk, SOCK_KEEPOPEN))
        inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));    // start the keepalive timer

    if (!tp->rx_opt.snd_wscale)
        __tcp_fast_path_on(tp, tp->snd_wnd);
    else
        tp->pred_flags = 0;

    if (!sock_flag(sk, SOCK_DEAD)) {
        sk->sk_state_change(sk);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
    }
}
```
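Note that `tcp_finish_connect` only arms the keepalive timer when SOCK_KEEPOPEN is set, i.e. when the application enabled SO_KEEPALIVE before the handshake finished. A hedged sketch enabling it and tuning the per-socket keepalive timers (the interval values are examples, not from the original text):

```c
// Hedged sketch: enable SO_KEEPALIVE so SOCK_KEEPOPEN is set and
// tcp_finish_connect() arms the keepalive timer; values are examples.
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

void enable_keepalive(int fd)
{
    int on = 1, idle = 60, intvl = 10, cnt = 5;

    setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));          /* sets SOCK_KEEPOPEN */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));     /* idle time before probes */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));  /* interval between probes */
    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));        /* probes before giving up */
}
```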
