公开学习文档

公开学习文档


网络层

<h2>概述</h2> <p>调用树:</p> <pre><code class="language-c">ip_queue_xmit // 查路由;设置 IP 头; ip_local_out __ip_local_out // 执行 NF_INET_LOCAL_OUT 钩子 dst_output ip_output // 执行 NF_INET_POST_ROUTING 钩子 ip_finish_output // 检查 IP 层分片 ip_finish_output2 // 查询邻居项 dst_neigh_output // 邻居子系统层</code></pre> <p>网络层发送接口为 <code>ip_queue_xmit</code></p> <h2>分析</h2> <pre><code class="language-c">// file: net/ipv4/ip_output.c int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) { struct sock *sk = skb-&amp;gt;sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ rcu_read_lock(); inet_opt = rcu_dereference(inet-&amp;gt;inet_opt); fl4 = &amp;amp;fl-&amp;gt;u.ip4; rt = skb_rtable(skb); // 是否已经有路由 if (rt != NULL) goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (rt == NULL) { __be32 daddr; /* Use correct destination address if we have options. */ daddr = inet-&amp;gt;inet_daddr; if (inet_opt &amp;amp;&amp;amp; inet_opt-&amp;gt;opt.srr) daddr = inet_opt-&amp;gt;opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet-&amp;gt;inet_saddr, inet-&amp;gt;inet_dport, inet-&amp;gt;inet_sport, sk-&amp;gt;sk_protocol, RT_CONN_FLAGS(sk), sk-&amp;gt;sk_bound_dev_if); // 查路由 if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &amp;amp;rt-&amp;gt;dst); } skb_dst_set_noref(skb, &amp;amp;rt-&amp;gt;dst); // 设置路由结果 packet_routed: if (inet_opt &amp;amp;&amp;amp; inet_opt-&amp;gt;opt.is_strictroute &amp;amp;&amp;amp; rt-&amp;gt;rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? 
inet_opt-&amp;gt;opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); // 设置 IP 头 *((__be16 *)iph) = htons((4 &amp;lt;&amp;lt; 12) | (5 &amp;lt;&amp;lt; 8) | (inet-&amp;gt;tos &amp;amp; 0xff)); if (ip_dont_fragment(sk, &amp;amp;rt-&amp;gt;dst) &amp;amp;&amp;amp; !skb-&amp;gt;local_df) iph-&amp;gt;frag_off = htons(IP_DF); else iph-&amp;gt;frag_off = 0; iph-&amp;gt;ttl = ip_select_ttl(inet, &amp;amp;rt-&amp;gt;dst); iph-&amp;gt;protocol = sk-&amp;gt;sk_protocol; ip_copy_addrs(iph, fl4); /* Transport layer set skb-&amp;gt;h.foo itself. */ if (inet_opt &amp;amp;&amp;amp; inet_opt-&amp;gt;opt.optlen) { iph-&amp;gt;ihl += inet_opt-&amp;gt;opt.optlen &amp;gt;&amp;gt; 2; ip_options_build(skb, &amp;amp;inet_opt-&amp;gt;opt, inet-&amp;gt;inet_daddr, rt, 0); } ip_select_ident_segs(skb, sk, skb_shinfo(skb)-&amp;gt;gso_segs ?: 1); skb-&amp;gt;priority = sk-&amp;gt;sk_priority; skb-&amp;gt;mark = sk-&amp;gt;sk_mark; res = ip_local_out(skb); // 发送 rcu_read_unlock(); return res; no_route: rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } EXPORT_SYMBOL(ip_queue_xmit); int ip_local_out(struct sk_buff *skb) { int err; err = __ip_local_out(skb); // 执行 netfilter 钩子 if (likely(err == 1)) err = dst_output(skb); // err == 1 说明 __ip_local_out 中没有调用 dst_output,所以这里继续调用 return err; } EXPORT_SYMBOL_GPL(ip_local_out); int __ip_local_out(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph-&amp;gt;tot_len = htons(skb-&amp;gt;len); ip_send_check(iph); skb-&amp;gt;protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)-&amp;gt;dev, dst_output); // 执行 netfilter 钩子,通过后就调用 dst_output } /* Output packet to network from transport. 
*/ static inline int dst_output(struct sk_buff *skb) { return skb_dst(skb)-&amp;gt;output(skb); }</code></pre> <p>根据路由表进行发送,先看看 dst 结构:</p> <pre><code class="language-c">// file: include/net/dst.h struct dst_entry { struct rcu_head rcu_head; struct dst_entry *child; struct net_device *dev; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; struct dst_entry *path; struct dst_entry *from; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct sk_buff *); unsigned short flags; #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOHASH 0x0008 #define DST_NOCACHE 0x0010 #define DST_NOCOUNT 0x0020 #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 #define DST_XFRM_QUEUE 0x0200 unsigned short pending_confirm; short error; /* A non-zero value of dst-&amp;gt;obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. * * Negative values are used by the implementation layer code to * force invocation of the dst_ops-&amp;gt;check() method. 
*/ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else __u32 __pad2; #endif /* * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT long __pad_to_align_refcnt[2]; #endif /* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */ atomic_t __refcnt; /* client references */ int __use; unsigned long lastuse; union { struct dst_entry *next; struct rtable __rcu *rt_next; struct rt6_info *rt6_next; struct dn_route __rcu *dn_next; }; }; </code></pre> <p>上面的 output,在此场景下实际会设置为 <code>ip_output</code>,具体设置的地方为:<code>net/ipv4/route.c:__mkroute_output()</code></p> <blockquote><p>关于路由相关知识,可参考原作者的这篇文档:<a href="https://www.51cto.com/article/698945.html">https://www.51cto.com/article/698945.html</a></p></blockquote> <p>继续分析:</p> <pre><code class="language-c">// file: net/ipv4/ip_output.c int ip_output(struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)-&amp;gt;dev; IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb-&amp;gt;len); skb-&amp;gt;dev = dev; skb-&amp;gt;protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)-&amp;gt;flags &amp;amp; IPSKB_REROUTED)); // POST_ROUTING 钩子 } static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) &amp;amp;&amp;amp; defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)-&amp;gt;xfrm != NULL) { IPCB(skb)-&amp;gt;flags |= IPSKB_REROUTED; return dst_output(skb); } #endif if (skb-&amp;gt;len &amp;gt; ip_skb_dst_mtu(skb) &amp;amp;&amp;amp; !skb_is_gso(skb)) // 如果长度超过 MTU 并且该 skb 不是 GSO 包 return ip_fragment(skb, ip_finish_output2); // IP 分片 else return ip_finish_output2(skb); // 继续,看简单的场景 } 
static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst-&amp;gt;dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; u32 nexthop; if (rt-&amp;gt;rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb-&amp;gt;len); } else if (rt-&amp;gt;rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb-&amp;gt;len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) &amp;lt; hh_len &amp;amp;&amp;amp; dev-&amp;gt;header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); if (skb2 == NULL) { kfree_skb(skb); return -ENOMEM; } if (skb-&amp;gt;sk) skb_set_owner_w(skb2, skb-&amp;gt;sk); consume_skb(skb); skb = skb2; } rcu_read_lock_bh(); nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)-&amp;gt;daddr); // 下一跳 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); // 查找邻居项 if (unlikely(!neigh)) // 没有则创建 neigh = __neigh_create(&amp;amp;arp_tbl, &amp;amp;nexthop, dev, false); if (!IS_ERR(neigh)) { int res = dst_neigh_output(dst, neigh, skb); // 根据邻居项,继续往下层传递 rcu_read_unlock_bh(); return res; } rcu_read_unlock_bh(); net_dbg_ratelimited(&amp;quot;%s: No header cache and no neighbour!\n&amp;quot;, __func__); kfree_skb(skb); return -EINVAL; } </code></pre>

页面列表

ITEM_HTML