connect连接
<h2>概述</h2>
<p>1、执行 connect 时,内核把 socket 状态设置为 TCP_SYN_SENT,选择一个可用端口,发 SYN 包并启动重传定时器。对于阻塞的 socket,会在 connect 执行时等待 sk 状态变化。</p>
<p>2、另一方面,当内核收到 SYN+ACK 包时,会进入到 <code>tcp_v4_rcv</code> 中处理。具体而言,会清除 connect 时设置的重传定时器,socket 状态设置为 ESTABLISHED,若 socket 设置了 SOCK_KEEPOPEN(即 SO_KEEPALIVE)则开启保活定时器,并发出 ACK 包(或安排延迟 ACK)。</p>
<h2>分析 connect</h2>
<p>应用层用法:</p>
<pre><code class="language-c">int main()
{
// Create an IPv4 (AF_INET) stream socket.
fd = socket(AF_INET, SOCK_STREAM, 0);
// Start the connection; the remaining arguments are elided in this sketch.
connect(fd, ...);
// ...
}</code></pre>
<pre><code class="language-c">// file: net/socket.c
/*
* Attempt to connect to a socket with the server address. The address
* is in user space so we verify it is OK and move it to kernel space.
*
* For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
* break bindings
*
* NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
* other SEQPACKET protocols that take time to connect() as it doesn't
* include the -EINPROGRESS status for such sockets.
*
* Returns the error from the first step that fails (fd lookup, address
* copy, security hook), otherwise the result of the protocol's connect
* operation.
*/
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
int, addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
int err, fput_needed;
sock = sockfd_lookup_light(fd, &amp;err, &amp;fput_needed); // look up the socket object for this fd
if (!sock)
goto out;
/* Copy the user-space address into the kernel-side buffer. */
err = move_addr_to_kernel(uservaddr, addrlen, &amp;address);
if (err &lt; 0)
goto out_put;
err =
security_socket_connect(sock, (struct sockaddr *)&amp;address, addrlen);
if (err)
goto out_put;
err = sock-&gt;ops-&gt;connect(sock, (struct sockaddr *)&amp;address, addrlen,
sock-&gt;file-&gt;f_flags); // i.e. inet_stream_connect for the TCP socket discussed here
out_put:
/* Drop the file reference taken by the lookup above. */
fput_light(sock-&gt;file, fput_needed);
out:
return err;
}
// file: net/ipv4/af_inet.c
/*
* Locked wrapper around __inet_stream_connect(): takes the socket lock,
* delegates, releases the lock and returns the delegate's result.
*/
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
int err;
lock_sock(sock-&gt;sk);
err = __inet_stream_connect(sock, uaddr, addr_len, flags);
release_sock(sock-&gt;sk);
return err;
}
EXPORT_SYMBOL(inet_stream_connect);
/*
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*
* Returns 0 once the socket has been moved to SS_CONNECTED, otherwise a
* negative errno (e.g. -EINVAL, -EISCONN, -EALREADY, -EINPROGRESS, or the
* socket error picked up on the sock_error path).
*/
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock-&gt;sk;
int err;
long timeo;
if (addr_len &lt; sizeof(uaddr-&gt;sa_family))
return -EINVAL;
/* An AF_UNSPEC address is handled as a disconnect request. */
if (uaddr-&gt;sa_family == AF_UNSPEC) {
err = sk-&gt;sk_prot-&gt;disconnect(sk, flags);
sock-&gt;state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
}
switch (sock-&gt;state) {
default:
err = -EINVAL;
goto out;
case SS_CONNECTED:
err = -EISCONN;
goto out;
case SS_CONNECTING:
err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED: // a freshly created socket is in SS_UNCONNECTED
err = -EISCONN;
if (sk-&gt;sk_state != TCP_CLOSE)
goto out;
err = sk-&gt;sk_prot-&gt;connect(sk, uaddr, addr_len); // for an AF_INET TCP socket this is tcp_v4_connect
if (err &lt; 0)
goto out;
sock-&gt;state = SS_CONNECTING;
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
err = -EINPROGRESS;
break;
}
timeo = sock_sndtimeo(sk, flags &amp; O_NONBLOCK); // blocking sockets get a wait timeout; presumably 0 when O_NONBLOCK is set (see the !timeo check below)
if ((1 &lt;&lt; sk-&gt;sk_state) &amp; (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
int writebias = (sk-&gt;sk_protocol == IPPROTO_TCP) &amp;&amp;
tcp_sk(sk)-&gt;fastopen_req &amp;&amp;
tcp_sk(sk)-&gt;fastopen_req-&gt;data ? 1 : 0;
/* Error code is set above */
if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) // wait for the peer's SYN+ACK, which updates the sk state
goto out;
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
}
/* Connection was closed by RST, timeout, ICMP error
* or another process disconnected us.
*/
if (sk-&gt;sk_state == TCP_CLOSE)
goto sock_error;
/* sk-&gt;sk_err may be not zero now, if RECVERR was ordered by user
* and error was received after socket entered established state.
* Hence, it is handled normally after connect() return successfully.
*/
sock-&gt;state = SS_CONNECTED; // success
err = 0;
out:
return err;
sock_error:
/* Report the pending socket error, or -ECONNABORTED if there is none. */
err = sock_error(sk) ? : -ECONNABORTED;
sock-&gt;state = SS_UNCONNECTED;
if (sk-&gt;sk_prot-&gt;disconnect(sk, flags))
sock-&gt;state = SS_DISCONNECTING;
goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);
</code></pre>
<pre><code class="language-c">// file: net/ipv4/tcp_ipv4.c
/* This will initiate an outgoing connection.
*
* Returns 0 on success or a negative errno. On the failure path the
* socket is put back into TCP_CLOSE, the route is released and
* inet_dport is cleared.
*/
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
if (addr_len &lt; sizeof(struct sockaddr_in))
return -EINVAL;
if (usin-&gt;sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin-&gt;sin_addr.s_addr; // destination IP
inet_opt = rcu_dereference_protected(inet-&gt;inet_opt,
sock_owned_by_user(sk));
/* With the srr option set, the next hop is taken from the option's faddr. */
if (inet_opt &amp;&amp; inet_opt-&gt;opt.srr) {
if (!daddr)
return -EINVAL;
nexthop = inet_opt-&gt;opt.faddr;
}
orig_sport = inet-&gt;inet_sport;
orig_dport = usin-&gt;sin_port;
fl4 = &amp;inet-&gt;cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet-&gt;inet_saddr,
RT_CONN_FLAGS(sk), sk-&gt;sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk, true);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
}
/* Multicast and broadcast routes are rejected. */
if (rt-&gt;rt_flags &amp; (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet_opt || !inet_opt-&gt;opt.srr)
daddr = fl4-&gt;daddr;
if (!inet-&gt;inet_saddr) // source IP not set yet (ANY)
inet-&gt;inet_saddr = fl4-&gt;saddr;
inet-&gt;inet_rcv_saddr = inet-&gt;inet_saddr; // record the source IP
if (tp-&gt;rx_opt.ts_recent_stamp &amp;&amp; inet-&gt;inet_daddr != daddr) {
/* Reset inherited state */
tp-&gt;rx_opt.ts_recent = 0;
tp-&gt;rx_opt.ts_recent_stamp = 0;
if (likely(!tp-&gt;repair))
tp-&gt;write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &amp;&amp;
!tp-&gt;rx_opt.ts_recent_stamp &amp;&amp; fl4-&gt;daddr == daddr)
tcp_fetch_timewait_stamp(sk, &amp;rt-&gt;dst);
inet-&gt;inet_dport = usin-&gt;sin_port; // destination port
inet-&gt;inet_daddr = daddr; // destination IP
inet_csk(sk)-&gt;icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(sk)-&gt;icsk_ext_hdr_len = inet_opt-&gt;opt.optlen;
tp-&gt;rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT); // update the state
err = inet_hash_connect(&amp;tcp_death_row, sk); // dynamically pick a source port
if (err)
goto failure;
/* Redo the route lookup now that the source port may have changed. */
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet-&gt;inet_sport, inet-&gt;inet_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
goto failure;
}
/* OK, now commit destination to socket. */
sk-&gt;sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &amp;rt-&gt;dst); // set up capabilities/features from the route
/* Pick an initial sequence number unless one is already set or the socket is in repair mode. */
if (!tp-&gt;write_seq &amp;&amp; likely(!tp-&gt;repair))
tp-&gt;write_seq = secure_tcp_sequence_number(inet-&gt;inet_saddr,
inet-&gt;inet_daddr,
inet-&gt;inet_sport,
usin-&gt;sin_port);
inet-&gt;inet_id = tp-&gt;write_seq ^ jiffies;
err = tcp_connect(sk); // build the SYN segment and send it
/* rt is cleared so the failure path below does not release it. */
rt = NULL;
if (err)
goto failure;
return 0;
failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk-&gt;sk_route_caps = 0;
inet-&gt;inet_dport = 0;
return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
</code></pre>
<p>执行 <code>inet_hash_connect</code> -&gt; <code>__inet_hash_connect</code> 获取源端口。需要注意:当存在大量目标地址(包括 IP 和端口)相同的连接、且可用端口不足时,这一步会非常消耗性能。下面具体分析:</p>
<pre><code class="language-c">// file: net/ipv4/inet_hashtables.c
/*
* Bind a port for a connect operation and hash it.
*
* Thin wrapper that calls __inet_hash_connect() with the port offset and
* the check/hash callbacks listed below.
*/
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), // per the author, inet_sk_port_offset derives a pseudo-random offset from the destination IP and port (its body is not shown here)
__inet_check_established, __inet_hash_nolisten); // __inet_check_established checks for a conflicting existing ESTABLISHED connection; __inet_hash_nolisten adds sk to ehash
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
/*
* Choose (or validate) the local port for an outgoing connection and
* hash the socket.
*
* If the socket has no local port yet (inet_num == 0), the local port
* range is scanned starting from an offset built from port_offset plus a
* static hint. Otherwise the already-bound port is checked for conflicts.
*
* check_established: callback that returns 0 when the port can be used
* for this connection.
* hash: callback that inserts sk into the hash table.
*
* Returns 0 on success, -EADDRNOTAVAIL when the scan finds no usable
* port, or the check_established() result in the pre-bound case.
*/
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **),
int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
struct inet_hashinfo *hinfo = death_row-&gt;hashinfo;
const unsigned short snum = inet_sk(sk)-&gt;inet_num;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb; // binding information for one port
int ret;
struct net *net = sock_net(sk);
int twrefcnt = 1;
if (!snum) { // no source port bound yet
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
struct inet_timewait_sock *tw = NULL;
inet_get_local_port_range(&amp;low, &amp;high); // local ephemeral port range, i.e. net.ipv4.ip_local_port_range (see /proc/sys/net/ipv4/ip_local_port_range)
remaining = (high - low) + 1;
local_bh_disable();
/* Try every port in the range once, starting from the computed offset. */
for (i = 1; i &lt;= remaining; i++) {
port = low + (i + offset) % remaining;
if (inet_is_reserved_local_port(port)) // reserved port, i.e. net.ipv4.ip_local_reserved_ports
continue;
head = &amp;hinfo-&gt;bhash[inet_bhashfn(net, port,
hinfo-&gt;bhash_size)]; // bhash holds every bound port; it lives in the global tcp_hashinfo
spin_lock(&amp;head-&gt;lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
inet_bind_bucket_for_each(tb, &amp;head-&gt;chain) { // walk the bhash bucket
if (net_eq(ib_net(tb), net) &amp;&amp;
tb-&gt;port == port) { // port already in use, but still usable if the 4-tuple differs
if (tb-&gt;fastreuse &gt;= 0 ||
tb-&gt;fastreuseport &gt;= 0)
goto next_port;
WARN_ON(hlist_empty(&amp;tb-&gt;owners));
if (!check_established(death_row, sk,
port, &amp;tw)) // the port is taken, but the n-tuple may differ, in which case it can still be used; the check covers both ordinary connections and time_wait sockets
goto ok;
goto next_port;
}
}
// reaching here means the port is unused and can be given to this connection
tb = inet_bind_bucket_create(hinfo-&gt;bind_bucket_cachep,
net, head, port); // create the bhash node and link it onto head, i.e. the bucket chain
if (!tb) {
spin_unlock(&amp;head-&gt;lock);
break;
}
tb-&gt;fastreuse = -1;
tb-&gt;fastreuseport = -1;
goto ok;
next_port:
spin_unlock(&amp;head-&gt;lock);
}
local_bh_enable();
return -EADDRNOTAVAIL;
ok:
hint += i;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port); // add sk to tb-&gt;owners and set inet-&gt;inet_num = port
if (sk_unhashed(sk)) {
inet_sk(sk)-&gt;inet_sport = htons(port);
twrefcnt += hash(sk, tw); // hash is __inet_hash_nolisten here; it adds sk to ehash so the sk can be found from the skb when the SYN+ACK arrives
}
if (tw)
twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&amp;head-&gt;lock);
/* If a time-wait socket was returned by the check, deschedule it and drop the collected references. */
if (tw) {
inet_twsk_deschedule(tw, death_row);
while (twrefcnt) {
twrefcnt--;
inet_twsk_put(tw);
}
}
ret = 0;
goto out;
}
// from here on: the socket already had a port bound
head = &amp;hinfo-&gt;bhash[inet_bhashfn(net, snum, hinfo-&gt;bhash_size)];
tb = inet_csk(sk)-&gt;icsk_bind_hash;
spin_lock_bh(&amp;head-&gt;lock);
if (sk_head(&amp;tb-&gt;owners) == sk &amp;&amp; !sk-&gt;sk_bind_node.next) { // this sk is the only owner of the port, so there can be no conflict
hash(sk, NULL); // same as above
spin_unlock_bh(&amp;head-&gt;lock);
return 0;
} else {
/* Only the lock is dropped here; bottom halves are re-enabled at the out label. */
spin_unlock(&amp;head-&gt;lock);
/* No definite answer... Walk to established hash table */
ret = check_established(death_row, sk, snum, NULL); // otherwise check for a conflict with established connections: even though this process (A) bound the port, another process (B) may already own a connection that picked the same port at random (B's random selection does not look at owners, only at established connections), so A's connect could collide with it. This function also adds sk to ehash.
out:
local_bh_enable();
return ret;
}
}
</code></pre>
<p>全局变量 tcp_hashinfo 结构如下图,包含多个 hash 表:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=71d3771b143bc9b1e2c34d3661e9dd38&amp;file=file.png" alt="" /></p>
<pre><code class="language-c">// file: net/ipv4/tcp_output.c
/* Build a SYN and send it off.
*
* Returns 0 on success (including the repair-mode shortcut), -ENOBUFS
* when the skb cannot be allocated, or -ECONNREFUSED when the transmit
* step reports it. Other transmit errors are not returned; the
* retransmit timer is armed instead.
*/
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int err;
tcp_connect_init(sk);
/* In repair mode no SYN is sent; the connection is finished right away. */
if (unlikely(tp-&gt;repair)) {
tcp_finish_connect(sk, NULL);
return 0;
}
buff = sk_stream_alloc_skb(sk, 0, sk-&gt;sk_allocation); // allocate the skb
if (unlikely(!buff))
return -ENOBUFS;
/* The SYN takes the current write_seq, which is then advanced by one. */
tcp_init_nondata_skb(buff, tp-&gt;write_seq++, TCPHDR_SYN);
tp-&gt;retrans_stamp = TCP_SKB_CB(buff)-&gt;when = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff); // append to sk-&gt;sk_write_queue
TCP_ECN_send_syn(sk, buff);
/* Send off SYN; include data in Fast Open. */
err = tp-&gt;fastopen_req ? tcp_send_syn_data(sk, buff) :
tcp_transmit_skb(sk, buff, 1, sk-&gt;sk_allocation); // author's open question: the skb is already on sk-&gt;sk_write_queue, so why pass it here to send directly? NOTE(review): presumably the third argument (1) makes the transmit path send a clone while the queued skb stays for retransmission - confirm against tcp_transmit_skb
if (err == -ECONNREFUSED)
return err;
/* We change tp-&gt;snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
tp-&gt;snd_nxt = tp-&gt;write_seq;
tp-&gt;pushed_seq = tp-&gt;write_seq;
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)-&gt;icsk_rto, TCP_RTO_MAX); // arm the timer; per the author the timeout (icsk_rto) is initialised in tcp_connect_init() above
return 0;
}
EXPORT_SYMBOL(tcp_connect);
</code></pre>
<p>发出的 SYN 包,由服务器接收并处理,在 TCP 层,相关入口就是 <code>tcp_v4_rcv()</code>。</p>
<h2>内核收 SYN+ACK 包</h2>
<p>当 SYN+ACK 包到达时,会进入 <code>tcp_v4_rcv</code> 处理,在此函数中,会在 tcp_hashinfo 中查找对应 sk,然后进入 <code>tcp_v4_do_rcv</code> -> <code>tcp_rcv_state_process</code> 中:</p>
<pre><code class="language-c">// file: net/ipv4/tcp_input.c
/*
* Abridged excerpt: only the TCP_SYN_SENT case of the state switch is
* shown; the rest of the function is omitted by the article.
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock *req;
int queued = 0;
tp-&gt;rx_opt.saw_tstamp = 0;
switch (sk-&gt;sk_state) {
// other code omitted ...
case TCP_SYN_SENT:
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
/* A non-negative result is returned to the caller as is; a negative one falls through to the steps below. */
if (queued &gt;= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
}
// ...
}
/*
* Handle a segment received while the socket is in SYN-SENT.
*
* Return values as used by the visible code:
*  1  - taken on the reset_and_undo path (options restored first);
*  0  - the skb was freed here (discard paths, delayed-ACK path);
* -1  - connection established and ACK sent, or Fast Open handling asked
*       for further processing; the caller continues with the segment.
*/
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
/* Saved so the undo paths can restore mss_clamp after clearing options. */
int saved_clamp = tp-&gt;rx_opt.mss_clamp;
tcp_parse_options(skb, &amp;tp-&gt;rx_opt, 0, &amp;foc);
if (tp-&gt;rx_opt.saw_tstamp &amp;&amp; tp-&gt;rx_opt.rcv_tsecr)
tp-&gt;rx_opt.rcv_tsecr -= tp-&gt;tsoffset;
if (th-&gt;ack) {
/* rfc793:
* &quot;If the state is SYN-SENT then
* first check the ACK bit
* If the ACK bit is set
* If SEG.ACK =&lt; ISS, or SEG.ACK &gt; SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)&quot;
*/
if (!after(TCP_SKB_CB(skb)-&gt;ack_seq, tp-&gt;snd_una) ||
after(TCP_SKB_CB(skb)-&gt;ack_seq, tp-&gt;snd_nxt))
goto reset_and_undo;
/* Reject when the echoed timestamp lies outside the window from retrans_stamp to now. */
if (tp-&gt;rx_opt.saw_tstamp &amp;&amp; tp-&gt;rx_opt.rcv_tsecr &amp;&amp;
!between(tp-&gt;rx_opt.rcv_tsecr, tp-&gt;retrans_stamp,
tcp_time_stamp)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
/* Now ACK is acceptable.
*
* &quot;If the RST bit is set
* If the ACK was acceptable then signal the user &quot;error:
* connection reset&quot;, drop the segment, enter CLOSED state,
* delete TCB, and return.&quot;
*/
if (th-&gt;rst) {
tcp_reset(sk);
goto discard;
}
/* rfc793:
* &quot;fifth, if neither of the SYN or RST bits is set then
* drop the segment and return.&quot;
*
* See note below!
* --ANK(990513)
*/
if (!th-&gt;syn)
goto discard_and_undo;
/* rfc793:
* &quot;If the SYN bit is on ...
* are acceptable then ...
* (our SYN has been ACKed), change the connection
* state to ESTABLISHED...&quot;
*/
TCP_ECN_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)-&gt;seq);
tcp_ack(sk, skb, FLAG_SLOWPATH); // process the ACK
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
tp-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;seq + 1;
tp-&gt;rcv_wup = TCP_SKB_CB(skb)-&gt;seq + 1;
/* RFC1323: The window in SYN &amp; SYN/ACK segments is
* never scaled.
*/
tp-&gt;snd_wnd = ntohs(th-&gt;window);
if (!tp-&gt;rx_opt.wscale_ok) {
tp-&gt;rx_opt.snd_wscale = tp-&gt;rx_opt.rcv_wscale = 0;
tp-&gt;window_clamp = min(tp-&gt;window_clamp, 65535U);
}
if (tp-&gt;rx_opt.saw_tstamp) {
tp-&gt;rx_opt.tstamp_ok = 1;
tp-&gt;tcp_header_len =
sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
tp-&gt;advmss -= TCPOLEN_TSTAMP_ALIGNED;
tcp_store_ts_recent(tp);
} else {
tp-&gt;tcp_header_len = sizeof(struct tcphdr);
}
if (tcp_is_sack(tp) &amp;&amp; sysctl_tcp_fack)
tcp_enable_fack(tp);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk-&gt;icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
tp-&gt;copied_seq = tp-&gt;rcv_nxt;
smp_mb();
tcp_finish_connect(sk, skb); // continues in tcp_finish_connect, which sets TCP_ESTABLISHED
if ((tp-&gt;syn_fastopen || tp-&gt;syn_data) &amp;&amp;
tcp_rcv_fastopen_synack(sk, skb, &amp;foc))
return -1;
if (sk-&gt;sk_write_pending ||
icsk-&gt;icsk_accept_queue.rskq_defer_accept ||
icsk-&gt;icsk_ack.pingpong) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
* It may be deleted, but with this feature tcpdumps
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX); // delayed ACK
discard:
__kfree_skb(skb);
return 0;
} else {
tcp_send_ack(sk); // ACK immediately
}
return -1;
}
/* No ACK in the segment */
if (th-&gt;rst) {
/* rfc793:
* &quot;If the RST bit is set
*
* Otherwise (no ACK) drop the segment and return.&quot;
*/
goto discard_and_undo;
}
/* PAWS check. */
if (tp-&gt;rx_opt.ts_recent_stamp &amp;&amp; tp-&gt;rx_opt.saw_tstamp &amp;&amp;
tcp_paws_reject(&amp;tp-&gt;rx_opt, 0))
goto discard_and_undo;
if (th-&gt;syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
* Particularly, it can be connect to self.
*/
tcp_set_state(sk, TCP_SYN_RECV);
if (tp-&gt;rx_opt.saw_tstamp) {
tp-&gt;rx_opt.tstamp_ok = 1;
tcp_store_ts_recent(tp);
tp-&gt;tcp_header_len =
sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
tp-&gt;tcp_header_len = sizeof(struct tcphdr);
}
tp-&gt;rcv_nxt = TCP_SKB_CB(skb)-&gt;seq + 1;
tp-&gt;copied_seq = tp-&gt;rcv_nxt;
tp-&gt;rcv_wup = TCP_SKB_CB(skb)-&gt;seq + 1;
/* RFC1323: The window in SYN &amp; SYN/ACK segments is
* never scaled.
*/
tp-&gt;snd_wnd = ntohs(th-&gt;window);
tp-&gt;snd_wl1 = TCP_SKB_CB(skb)-&gt;seq;
tp-&gt;max_window = tp-&gt;snd_wnd;
TCP_ECN_rcv_syn(tp, th);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk-&gt;icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
tcp_send_synack(sk);
#if 0
/* Note, we could accept data and URG from this segment.
* There are no obstacles to make this (except that we must
* either change tcp_recvmsg() to prevent it from returning data
* before 3WHS completes per RFC793, or employ TCP Fast Open).
*
* However, if we ignore data in ACKless segments sometimes,
* we have no reasons to accept it sometimes.
* Also, seems the code doing it in step6 of tcp_rcv_state_process
* is not flawless. So, discard packet for sanity.
* Uncomment this return to process the data.
*/
return -1;
#else
goto discard;
#endif
}
/* &quot;fifth, if neither of the SYN or RST bits is set then
* drop the segment and return.&quot;
*/
discard_and_undo:
/* Undo the option parsing done at the top, then free the skb. */
tcp_clear_options(&amp;tp-&gt;rx_opt);
tp-&gt;rx_opt.mss_clamp = saved_clamp;
goto discard;
reset_and_undo:
/* Undo the option parsing done at the top and return 1 without freeing the skb here. */
tcp_clear_options(&amp;tp-&gt;rx_opt);
tp-&gt;rx_opt.mss_clamp = saved_clamp;
return 1;
}
/*
* Move the socket to TCP_ESTABLISHED and complete connection setup:
* metrics, congestion control, buffer space, optional keepalive timer,
* fast-path flags, and finally state-change notification.
*
* skb may be NULL (tcp_connect() passes NULL in repair mode); the
* skb-dependent steps are skipped in that case.
*/
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_set_state(sk, TCP_ESTABLISHED); // update the state
icsk-&gt;icsk_ack.lrcvtime = tcp_time_stamp;
if (skb != NULL) {
icsk-&gt;icsk_af_ops-&gt;sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}
/* Make sure socket is routed, for correct metrics. */
icsk-&gt;icsk_af_ops-&gt;rebuild_header(sk);
tcp_init_metrics(sk);
tcp_init_congestion_control(sk); // initialise congestion control
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
tp-&gt;lsndtime = tcp_time_stamp;
tcp_init_buffer_space(sk); // worth further study (author's note)
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); // start the keepalive timer
/* The fast path is enabled only when the send window scale is zero. */
if (!tp-&gt;rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp-&gt;snd_wnd);
else
tp-&gt;pred_flags = 0;
/* Notify via sk_state_change and sk_wake_async unless the socket is already dead. */
if (!sock_flag(sk, SOCK_DEAD)) {
sk-&gt;sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
}
</code></pre>