IP收包
<h2>概述</h2>
<p>调用树:</p>
<pre><code class="language-c">ip_rcv // 走 NF_INET_PRE_ROUTING 钩子
ip_rcv_finish // 没有路由,则查路由;查到路由后设置 input = ip_local_deliver
dst_input // 即 dst-&gt;input(skb)
ip_local_deliver // 重组 IP 分片;走 NF_INET_LOCAL_IN 钩子
ip_local_deliver_finish // 根据传输层协议的不同调用对应的处理函数,对于 TCP,则是 tcp_v4_rcv</code></pre>
<p>ip 注册在 ptype_base 中对应的处理函数是 <code>ip_rcv</code>。</p>
<p>1、检测到为 IP 类型时,进入 <code>ip_rcv</code><br>
2、先检测和设置一些字段<br>
3、走 PRE_ROUTING 钩子<br>
4、查找路由,根据路由结果设置处理函数,并调用。对于发往本机的 IP 包,则是 <code>ip_local_deliver</code><br>
5、走 LOCAL_IN 钩子,并检测包协议类型,如 TCP 或 UDP,再调用对应的处理函数。对于 TCP 包,则是 <code>tcp_v4_rcv</code></p>
<h2>分析</h2>
<pre><code class="language-c">// file: net/ipv4/ip_input.c
/*
* Main IP Receive routine.
*
* Sanity-checks the IPv4 header of skb, trims the buffer to the length
* announced in the header, and hands the packet to the NF_INET_PRE_ROUTING
* hook with ip_rcv_finish as the continuation.
*
* Returns the NF_HOOK() result when the packet passes the checks below,
* otherwise NET_RX_DROP. pt and orig_dev are not referenced in this body.
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb-&gt;pkt_type == PACKET_OTHERHOST)
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb-&gt;len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { // author's note: this guarantees the skb is exclusively ours; if it was shared, a clone is made
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
/* First pull only asks for the fixed-size struct iphdr. */
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
if (iph-&gt;ihl &lt; 5 || iph-&gt;version != 4)
goto inhdr_error;
/* Second pull asks for the whole header as sized by ihl (ihl*4 bytes). */
if (!pskb_may_pull(skb, iph-&gt;ihl*4))
goto inhdr_error;
/* Re-read the header pointer after the pull (presumably the pull may move the header data). */
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph-&gt;ihl)))
goto csum_error;
len = ntohs(iph-&gt;tot_len);
/* Buffer shorter than tot_len is counted as truncated; tot_len shorter than the header is a header error. */
if (skb-&gt;len &lt; len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len &lt; (iph-&gt;ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb-&gt;len holds ntohs(iph-&gt;tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* The transport header starts right after the ihl*4-byte IP header. */
skb-&gt;transport_header = skb-&gt;network_header + iph-&gt;ihl*4;
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish); // run the PRE_ROUTING hooks first; ip_rcv_finish is called afterwards only if the hook result is 1 (see NF_HOOK_THRESH)
/* Error exits fall through: csum_error is also counted as a header error, and both end in drop. */
csum_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
/* Reached directly when skb_share_check() returned NULL, so there is no skb to free. */
out:
return NET_RX_DROP;
}
</code></pre>
<blockquote><p>skb_pull 解释:<a href="https://blog.csdn.net/qq_24521983/article/details/71423264">https://blog.csdn.net/qq_24521983/article/details/71423264</a><br>
其它:<a href="https://blog.csdn.net/rikeyone/article/details/108610841">https://blog.csdn.net/rikeyone/article/details/108610841</a></p></blockquote>
<p>netfilter 框架:</p>
<pre><code class="language-c">// file: include/linux/netfilter.h
/* Convenience wrapper: forwards all arguments to NF_HOOK_THRESH with a
* threshold of INT_MIN and returns its result.
*/
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
struct net_device *in, struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
}
/* Activate hook; either okfn or kfree_skb called, unless a hook
returns NF_STOLEN (in which case, it's up to the hook to deal with
the consequences).
Returns -ERRNO if packet dropped. Zero means queued, stolen or
accepted.
*/
/* RR:
&gt; I don't want nf_hook to return anything because people might forget
&gt; about async and trust the return value to mean &quot;packet was ok&quot;.
AK:
Just document it clearly, then you can expect some sense from kernel
coders :)
*/
/* Calls nf_hook_thresh() with the given arguments. When that returns 1,
* okfn(skb) is invoked and its return value becomes the result; any other
* value from nf_hook_thresh() is returned unchanged and okfn is not called.
*/
static inline int
NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,
struct net_device *in, struct net_device *out,
int (*okfn)(struct sk_buff *), int thresh)
{
int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);
if (ret == 1)
ret = okfn(skb);
return ret;
}
</code></pre>
<p>继续 IP 收包:</p>
<pre><code class="language-c">// file: net/ipv4/ip_input.c
/*
* Continuation passed to NF_HOOK by ip_rcv for the PRE_ROUTING hook.
*
* Optionally runs the protocol's early_demux callback, looks up a route when
* the skb has no dst attached, processes IP options when iph-&gt;ihl exceeds 5,
* updates the multicast/broadcast input counters, and finally passes the
* packet to dst_input().
*
* Returns the dst_input() result, or NET_RX_DROP after freeing skb when the
* route lookup or option processing fails.
*/
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/* Early demux is attempted only when enabled by sysctl and the skb has neither a dst nor a socket yet. */
if (sysctl_ip_early_demux &amp;&amp; !skb_dst(skb) &amp;&amp; skb-&gt;sk == NULL) {
const struct net_protocol *ipprot;
int protocol = iph-&gt;protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot &amp;&amp; ipprot-&gt;early_demux) {
ipprot-&gt;early_demux(skb);
/* must reload iph, skb-&gt;head might have changed */
iph = ip_hdr(skb);
}
}
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (!skb_dst(skb)) { // no route attached yet, so look one up; per the author's note, the handler for the next step is then stored in dst-&gt;input
int err = ip_route_input_noref(skb, iph-&gt;daddr, iph-&gt;saddr,
iph-&gt;tos, skb-&gt;dev);
if (unlikely(err)) {
/* -EXDEV is additionally counted under LINUX_MIB_IPRPFILTER before dropping. */
if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb-&gt;dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting indexed by the low byte and by bits 16-23 of tclassid. */
if (unlikely(skb_dst(skb)-&gt;tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)-&gt;tclassid;
st[idx&amp;0xFF].o_packets++;
st[idx&amp;0xFF].o_bytes += skb-&gt;len;
st[(idx&gt;&gt;16)&amp;0xFF].i_packets++;
st[(idx&gt;&gt;16)&amp;0xFF].i_bytes += skb-&gt;len;
}
#endif
/* ip_rcv_options() is called only when ihl exceeds 5; a non-zero result drops the packet. */
if (iph-&gt;ihl &gt; 5 &amp;&amp; ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt-&gt;rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt-&gt;dst.dev), IPSTATS_MIB_INMCAST,
skb-&gt;len);
} else if (rt-&gt;rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt-&gt;dst.dev), IPSTATS_MIB_INBCAST,
skb-&gt;len);
return dst_input(skb); // i.e. dst-&gt;input(skb)
drop:
kfree_skb(skb);
return NET_RX_DROP;
}</code></pre>
<p>路由匹配这里暂不分析,有时间可以搜索资料研究。</p>
<blockquote><p>关于路由匹配那一段,可参考:<a href="https://blog.csdn.net/Megahertz66/article/details/110239947">https://blog.csdn.net/Megahertz66/article/details/110239947</a></p></blockquote>
<p>对于发往本机的包,查路由后得到的 input 函数是 <code>ip_local_deliver</code>,继续分析:</p>
<pre><code class="language-c">// file: net/ipv4/ip_input.c
/*
* Deliver IP Packets to the higher protocol layers.
*
* If the packet is a fragment it is first handed to ip_defrag(); a non-zero
* result from ip_defrag() ends processing here with a return value of 0.
* Otherwise the packet goes through the NF_INET_LOCAL_IN hook with
* ip_local_deliver_finish as the continuation, and the NF_HOOK() result is
* returned.
*/
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
/* NOTE(review): non-zero presumably means the datagram is not complete yet or was discarded - confirm against ip_defrag(). */
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb-&gt;dev, NULL,
ip_local_deliver_finish); // run the LOCAL_IN hooks, then call ip_local_deliver_finish
}
/*
* Continuation passed to NF_HOOK by ip_local_deliver for the LOCAL_IN hook.
*
* Pulls the network header off the skb, offers the packet to raw sockets via
* raw_local_deliver(), then dispatches it to the handler registered in
* inet_protos[] for the IP protocol number. When no handler is registered and
* no raw socket took the packet, an ICMP protocol-unreachable message is sent
* (subject to the xfrm policy check) and the skb is freed.
*
* Always returns 0.
*/
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb-&gt;dev);
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
{
int protocol = ip_hdr(skb)-&gt;protocol;
const struct net_protocol *ipprot;
int raw;
resubmit:
raw = raw_local_deliver(skb, protocol);
ipprot = rcu_dereference(inet_protos[protocol]); // look up the handler registered for this protocol number (e.g. TCP or UDP)
if (ipprot != NULL) {
int ret;
/* The xfrm policy check is skipped for protocols that set no_policy. */
if (!ipprot-&gt;no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot-&gt;handler(skb); // for TCP this is tcp_v4_rcv
/* A negative return value requests another pass with protocol number -ret. */
if (ret &lt; 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
/* No handler registered: the outcome depends on whether raw_local_deliver() reported a delivery. */
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
}
out:
rcu_read_unlock();
return 0;
}</code></pre>