网络层
<h2>概述</h2>
<p>调用树:</p>
<pre><code class="language-c">ip_queue_xmit // 查路由;设置 IP 头;
ip_local_out
__ip_local_out // 执行 NF_INET_LOCAL_OUT 钩子
dst_output
ip_output // 执行 NF_INET_POST_ROUTING 钩子
ip_finish_output // 检查 IP 层分片
ip_finish_output2 // 查询邻居项
dst_neigh_output // 邻居子系统层</code></pre>
<p>网络层的发送入口函数为 <code>ip_queue_xmit</code>。</p>
<h2>分析</h2>
<pre><code class="language-c">// file: net/ipv4/ip_output.c
/*
 * ip_queue_xmit - network-layer transmit entry point.
 *
 * Makes sure the skb has a route (one already attached to the skb, else
 * whatever __sk_dst_check() returns for the socket, else a fresh lookup via
 * ip_route_output_ports()), pushes and fills in the IPv4 header, then passes
 * the packet to ip_local_out().
 *
 * Returns the result of ip_local_out(). When no usable route is found the
 * skb is freed, IPSTATS_MIB_OUTNOROUTES is incremented and -EHOSTUNREACH is
 * returned.
 */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
struct sock *sk = skb-&gt;sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet-&gt;inet_opt);
fl4 = &amp;fl-&gt;u.ip4;
rt = skb_rtable(skb); // route already attached to the skb, if any
if (rt != NULL)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (rt == NULL) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet-&gt;inet_daddr;
if (inet_opt &amp;&amp; inet_opt-&gt;opt.srr)
daddr = inet_opt-&gt;opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
rt = ip_route_output_ports(sock_net(sk), fl4, sk,
daddr, inet-&gt;inet_saddr,
inet-&gt;inet_dport,
inet-&gt;inet_sport,
sk-&gt;sk_protocol,
RT_CONN_FLAGS(sk),
sk-&gt;sk_bound_dev_if); // route lookup
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &amp;rt-&gt;dst);
}
skb_dst_set_noref(skb, &amp;rt-&gt;dst); // attach the route to the skb
packet_routed:
/* A strict source route option combined with a route that goes through
* a gateway is handled as "no route".
*/
if (inet_opt &amp;&amp; inet_opt-&gt;opt.is_strictroute &amp;&amp; rt-&gt;rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt-&gt;opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb); // start filling in the IP header
/* One 16-bit store for the start of the header: 4 in the top nibble,
* 5 in the next nibble (adjusted through iph-&gt;ihl below when options are
* present) and the socket TOS in the low byte.
*/
*((__be16 *)iph) = htons((4 &lt;&lt; 12) | (5 &lt;&lt; 8) | (inet-&gt;tos &amp; 0xff));
if (ip_dont_fragment(sk, &amp;rt-&gt;dst) &amp;&amp; !skb-&gt;local_df)
iph-&gt;frag_off = htons(IP_DF);
else
iph-&gt;frag_off = 0;
iph-&gt;ttl = ip_select_ttl(inet, &amp;rt-&gt;dst);
iph-&gt;protocol = sk-&gt;sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb-&gt;h.foo itself. */
if (inet_opt &amp;&amp; inet_opt-&gt;opt.optlen) {
/* optlen is in bytes; ihl is bumped by optlen / 4. */
iph-&gt;ihl += inet_opt-&gt;opt.optlen &gt;&gt; 2;
ip_options_build(skb, &amp;inet_opt-&gt;opt, inet-&gt;inet_daddr, rt, 0);
}
/* Third argument is gso_segs, or 1 when gso_segs is zero. */
ip_select_ident_segs(skb, sk, skb_shinfo(skb)-&gt;gso_segs ?: 1);
skb-&gt;priority = sk-&gt;sk_priority;
skb-&gt;mark = sk-&gt;sk_mark;
res = ip_local_out(skb); // hand the packet on for transmission
rcu_read_unlock();
return res;
no_route:
/* Error path: drop the RCU read lock, count the failure, free the skb. */
rcu_read_unlock();
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
/*
 * ip_local_out - finish the header via __ip_local_out() and, if that
 * returns 1, continue with dst_output().
 *
 * Returns the value of dst_output() when it is called here, otherwise the
 * value returned by __ip_local_out().
 */
int ip_local_out(struct sk_buff *skb)
{
int err;
err = __ip_local_out(skb); // runs the netfilter LOCAL_OUT hook
if (likely(err == 1))
err = dst_output(skb); // err == 1: dst_output was not called inside __ip_local_out, so call it here
return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
/*
 * __ip_local_out - set tot_len from skb length, call ip_send_check() on the
 * header (presumably the header checksum -- confirm against its definition),
 * mark the skb as ETH_P_IP and run the NF_INET_LOCAL_OUT netfilter hook.
 *
 * Returns the nf_hook() result; the caller ip_local_out() treats 1 as
 * "continue with dst_output()".
 */
int __ip_local_out(struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph-&gt;tot_len = htons(skb-&gt;len);
ip_send_check(iph);
skb-&gt;protocol = htons(ETH_P_IP);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
skb_dst(skb)-&gt;dev, dst_output); // run the LOCAL_OUT hook; dst_output is passed as the continuation function
}
/* Output packet to network from transport.
 *
 * Dispatches through the output function pointer of the dst entry attached
 * to the skb and returns whatever that callback returns.
 */
static inline int dst_output(struct sk_buff *skb)
{
return skb_dst(skb)-&gt;output(skb);
}</code></pre>
<p>根据路由表进行发送,先看看 dst 结构:</p>
<pre><code class="language-c">// file: include/net/dst.h
/*
 * struct dst_entry - the per-route object that skb_dst() returns.
 *
 * For this walkthrough the important members are dev (the device handed to
 * the netfilter hooks and used for transmission) and the input/output
 * function pointers; dst_output() calls the output member.
 */
struct dst_entry {
struct rcu_head rcu_head;
struct dst_entry *child;
struct net_device *dev; /* device used by the output path */
struct dst_ops *ops;
unsigned long _metrics;
unsigned long expires;
struct dst_entry *path;
struct dst_entry *from;
#ifdef CONFIG_XFRM
struct xfrm_state *xfrm; /* tested for NULL in ip_finish_output() */
#else
void *__pad1;
#endif
int (*input)(struct sk_buff *);
int (*output)(struct sk_buff *); /* invoked by dst_output() */
unsigned short flags;
/* Bit values for the flags member. */
#define DST_HOST 0x0001
#define DST_NOXFRM 0x0002
#define DST_NOPOLICY 0x0004
#define DST_NOHASH 0x0008
#define DST_NOCACHE 0x0010
#define DST_NOCOUNT 0x0020
#define DST_NOPEER 0x0040
#define DST_FAKE_RTABLE 0x0080
#define DST_XFRM_TUNNEL 0x0100
#define DST_XFRM_QUEUE 0x0200
unsigned short pending_confirm;
short error;
/* A non-zero value of dst-&gt;obsolete forces by-hand validation
* of the route entry. Positive values are set by the generic
* dst layer to indicate that the entry has been forcefully
* destroyed.
*
* Negative values are used by the implementation layer code to
* force invocation of the dst_ops-&gt;check() method.
*/
short obsolete;
/* Values for the obsolete member. */
#define DST_OBSOLETE_NONE 0
#define DST_OBSOLETE_DEAD 2
#define DST_OBSOLETE_FORCE_CHK -1
#define DST_OBSOLETE_KILL -2
unsigned short header_len; /* more space at head required */
unsigned short trailer_len; /* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 tclassid;
#else
__u32 __pad2;
#endif
/*
* Align __refcnt to a 64 bytes alignment
* (L1_CACHE_SIZE would be too much)
*/
#ifdef CONFIG_64BIT
long __pad_to_align_refcnt[2];
#endif
/*
* __refcnt wants to be on a different cache line from
* input/output/ops or performance tanks badly
*/
atomic_t __refcnt; /* client references */
int __use;
unsigned long lastuse;
/* Link to the next entry, typed per route family. */
union {
struct dst_entry *next;
struct rtable __rcu *rt_next;
struct rt6_info *rt6_next;
struct dn_route __rcu *dn_next;
};
};
</code></pre>
<p>上面的 output,在此场景下实际会设置为 <code>ip_output</code>,具体设置的地方为:<code>net/ipv4/route.c:__mkroute_output()</code></p>
<blockquote><p>关于路由的相关知识,可参考原作者的这篇文章:<a href="https://www.51cto.com/article/698945.html">https://www.51cto.com/article/698945.html</a></p></blockquote>
<p>继续分析:</p>
<pre><code class="language-c">// file: net/ipv4/ip_output.c
/*
 * ip_output - output callback that dst_output() reaches in this scenario.
 *
 * Updates the IPSTATS_MIB_OUT counters, binds the skb to the route's device,
 * marks it as ETH_P_IP and runs the NF_INET_POST_ROUTING hook with
 * ip_finish_output as the continuation.
 */
int ip_output(struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)-&gt;dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb-&gt;len);
skb-&gt;dev = dev;
skb-&gt;protocol = htons(ETH_P_IP);
/* The condition argument is false for skbs flagged IPSKB_REROUTED;
* presumably the hook is skipped for those -- confirm against NF_HOOK_COND.
*/
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)-&gt;flags &amp; IPSKB_REROUTED)); // POST_ROUTING hook
}
/*
 * ip_finish_output - choose between fragmenting and sending as-is.
 *
 * With CONFIG_NETFILTER and CONFIG_XFRM, an skb whose dst carries an xfrm
 * state is flagged IPSKB_REROUTED and sent through dst_output() again.
 * Otherwise the skb goes to ip_fragment() when it is longer than the dst MTU
 * and is not a GSO skb, and straight to ip_finish_output2() in all other
 * cases.
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) &amp;&amp; defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)-&gt;xfrm != NULL) {
IPCB(skb)-&gt;flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb-&gt;len &gt; ip_skb_dst_mtu(skb) &amp;&amp; !skb_is_gso(skb)) // longer than the MTU and not a GSO skb
return ip_fragment(skb, ip_finish_output2); // IP fragmentation
else
return ip_finish_output2(skb); // simple case, followed in this walkthrough
}
/*
 * ip_finish_output2 - resolve the next-hop neighbour and pass the skb down.
 *
 * Updates multicast/broadcast counters according to the route type, makes
 * sure the skb has enough headroom for the link-layer header, looks up (or
 * creates) the neighbour entry for the next hop and calls
 * dst_neigh_output().
 *
 * Returns the dst_neigh_output() result, -ENOMEM when the headroom
 * reallocation fails, or -EINVAL when no neighbour entry could be obtained.
 * The skb is freed on both error paths.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst-&gt;dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
if (rt-&gt;rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb-&gt;len);
} else if (rt-&gt;rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb-&gt;len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) &lt; hh_len &amp;&amp; dev-&gt;header_ops)) {
struct sk_buff *skb2;
/* Not enough headroom: get a replacement skb with more of it. */
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
/* Carry the owning socket over to the new skb, then release the old one. */
if (skb-&gt;sk)
skb_set_owner_w(skb2, skb-&gt;sk);
consume_skb(skb);
skb = skb2;
}
rcu_read_lock_bh();
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)-&gt;daddr); // next hop
neigh = __ipv4_neigh_lookup_noref(dev, nexthop); // neighbour entry
if (unlikely(!neigh)) // none found: create one
neigh = __neigh_create(&amp;arp_tbl, &amp;nexthop, dev, false);
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb); // pass down to the next layer via the neighbour entry
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
net_dbg_ratelimited(&quot;%s: No header cache and no neighbour!\n&quot;,
__func__);
kfree_skb(skb);
return -EINVAL;
}
</code></pre>