<h1>收包</h1>
<h2>概述</h2>
<p>调用树:</p>
<pre><code class="language-c">软中断 NET_RX_SOFTIRQ
net_rx_action // 遍历 softnet_data-&gt;poll_list,并执行其 poll 函数
igb_poll
igb_clean_rx_irq // 获取 skb
napi_gro_receive // 检查 GRO 特性,将小包合并成大包
napi_skb_finish
netif_receive_skb
__netif_receive_skb
__netif_receive_skb_core // ptype_all 抓包;检查执行 ptype_base 协议
deliver_skb // 调用 ptype_base 里的接口,对于 IP 协议,存放的是 ip_rcv</code></pre>
<p>硬中断 -> igb_msix_ring -> 软中断 NET_RX_SOFTIRQ -> net_rx_action -> napi->poll(即 igb_poll)-> igb_clean_rx_irq -> napi_gro_receive -> netif_receive_skb -> ip_rcv</p>
<p>1、网卡收包后,产生硬中断,对应的处理函数为 <code>igb_msix_ring</code>
2、硬中断做的工作很少,主要就是:将 napi 挂在 softnet_data->poll_list 上,然后触发 NET_RX_SOFTIRQ 类型的软中断
3、软中断对应的响应函数为 <code>net_rx_action</code>,进一步调用 napi->poll 函数。对于 igb 而言,即是网卡注册的 <code>igb_poll</code>
4、获取 skb,并根据包类型在 ptype_base 中找到对应的处理函数,进一步处理。对于 IP 类型,即是 <code>ip_rcv</code></p>
<h2>流程</h2>
<p>数据包到达网卡时,网卡在分配给自己的 RingBuffer 中找到可用内存,将数据 DMA 过去保存,此过程 CPU 无感知。待 DMA 完成后,网卡则向 CPU 发起硬中断。</p>
<p>对于 igb 网卡而言,对应的响应函数为 <code>igb_msix_ring</code>。</p>
<blockquote><p>网卡中断分析见:<a href="https://www.showdoc.com.cn/zother/10479112992928211">https://www.showdoc.com.cn/zother/10479112992928211</a></p></blockquote>
<p>其中最核心的,就是:</p>
<pre><code class="language-c">/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&amp;napi-&gt;poll_list, &amp;sd-&gt;poll_list); // 将 napi 加入。napi 变量是放在 q_vector 下的,具体见网卡初始化
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}</code></pre>
<p>即触发软中断 NET_RX_SOFTIRQ,对应的响应函数是 net_rx_action,是在 <code>core/dev.c:net_dev_init()</code> 中注册的:</p>
<pre><code class="language-c">static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &amp;__get_cpu_var(softnet_data); // 每 CPU 的 softnet_data
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&amp;sd-&gt;poll_list)) {
struct napi_struct *n;
int work, weight;
/* If softirq window is exhuasted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget &lt;= 0 || time_after_eq(jiffies, time_limit))) // 有限制
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only -&gt;poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&amp;sd-&gt;poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n-&gt;weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the -&gt;poll() call. Therefore we avoid
* accidentally calling -&gt;poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &amp;n-&gt;state)) {
work = n-&gt;poll(n, weight); // 调用 napi-&gt;poll 函数,即网卡的 poll 函数
trace_napi_poll(n);
}
WARN_ON_ONCE(work &gt; weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still &quot;owns&quot; the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n-&gt;gro_list) {
/* flush too old packets
* If HZ &lt; 1000, flush all packets.
*/
local_irq_enable();
napi_gro_flush(n, HZ &gt;= 1000);
local_irq_disable();
}
list_move_tail(&amp;n-&gt;poll_list, &amp;sd-&gt;poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
dma_issue_pending_all();
#endif
return;
softnet_break:
sd-&gt;time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}</code></pre>
<p>在 NET_RX_SOFTIRQ 软中断中,调用 napi->poll(),这是由网卡注册的,对于 igb 而言,即是 <code>igb_poll</code>:</p>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_poll - NAPI Rx polling callback
* @napi: napi polling structure
* @budget: count of how many packets we should handle
**/
static int igb_poll(struct napi_struct *napi, int budget)
{
struct igb_q_vector *q_vector = container_of(napi,
struct igb_q_vector,
napi);
bool clean_complete = true;
#ifdef CONFIG_IGB_DCA
if (q_vector-&gt;adapter-&gt;flags &amp; IGB_FLAG_DCA_ENABLED)
igb_update_dca(q_vector);
#endif
if (q_vector-&gt;tx.ring)
clean_complete = igb_clean_tx_irq(q_vector); // 说一句:里面包含了对发送数据包 RingBuffer 的回收
if (q_vector-&gt;rx.ring)
clean_complete &amp;= igb_clean_rx_irq(q_vector, budget); // 继续
/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;
/* If not enough Rx work done, exit the polling mode */
napi_complete(napi);
igb_ring_irq_enable(q_vector);
return 0;
}
static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
{
struct igb_ring *rx_ring = q_vector-&gt;rx.ring;
struct sk_buff *skb = rx_ring-&gt;skb;
unsigned int total_bytes = 0, total_packets = 0;
u16 cleaned_count = igb_desc_unused(rx_ring); // 未使用的 desc。疑问:desc 是否个数不变,也不会摘取和补充?
do {
union e1000_adv_rx_desc *rx_desc;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count &gt;= IGB_RX_BUFFER_WRITE) { // 这里的意思应该是:如果空闲的 desc 太多,就为对应的 rx_buffer_info 分配内存。这是针对此循环而言的。
igb_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
rx_desc = IGB_RX_DESC(rx_ring, rx_ring-&gt;next_to_clean); // desc
if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_DD))
break;
/* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* RXD_STAT_DD bit is set
*/
rmb();
/* retrieve a buffer from the ring */
skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb); // 申请并填充 skb。skb 是全新申请的,对于小包是全部拷贝一次到新的 skb 中;分片包似乎是直接用的 ring buffer 里的内存。也就是说,对于小包:网卡 ring buffer 和 skb 没有关系了。
/* exit if we failed to retrieve a buffer */
if (!skb)
break;
cleaned_count++;
/* fetch next buffer in frame if non-eop */
if (igb_is_non_eop(rx_ring, rx_desc)) // 是否结束
continue;
/* verify the packet layout is correct */
if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
skb = NULL;
continue;
}
/* probably a little skewed due to removing CRC */
total_bytes += skb-&gt;len;
/* populate checksum, timestamp, VLAN, and protocol */
igb_process_skb_fields(rx_ring, rx_desc, skb);
napi_gro_receive(&amp;q_vector-&gt;napi, skb); // 注意:这里调用后,后面是没有再管 skb 的,即这里不负责释放
/* reset skb pointer */
skb = NULL;
/* update budget accounting */
total_packets++;
} while (likely(total_packets &lt; budget));
/* place incomplete frames back on ring for completion */
rx_ring-&gt;skb = skb;
u64_stats_update_begin(&amp;rx_ring-&gt;rx_syncp);
rx_ring-&gt;rx_stats.packets += total_packets;
rx_ring-&gt;rx_stats.bytes += total_bytes;
u64_stats_update_end(&amp;rx_ring-&gt;rx_syncp);
q_vector-&gt;rx.total_packets += total_packets;
q_vector-&gt;rx.total_bytes += total_bytes;
if (cleaned_count)
igb_alloc_rx_buffers(rx_ring, cleaned_count);
return (total_packets &lt; budget);
}</code></pre>
<p>继续:</p>
<pre><code class="language-c">// file: net/core/dev.c
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_gro_reset_offset(skb);
return napi_skb_finish(dev_gro_receive(napi, skb), skb); // dev_gro_receive 是 GRO 特性,将小包合并成大包,以减少包数
}
EXPORT_SYMBOL(napi_gro_receive);
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
if (netif_receive_skb(skb)) // 一般是这里
ret = GRO_DROP;
break;
case GRO_DROP:
kfree_skb(skb);
break;
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)-&gt;free == NAPI_GRO_FREE_STOLEN_HEAD)
kmem_cache_free(skbuff_head_cache, skb);
else
__kfree_skb(skb);
break;
case GRO_HELD:
case GRO_MERGED:
break;
}
return ret;
}
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
rcu_read_lock();
#ifdef CONFIG_RPS
if (static_key_false(&amp;rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &amp;voidflow;
int cpu = get_rps_cpu(skb-&gt;dev, skb, &amp;rflow);
if (cpu &gt;= 0) {
ret = enqueue_to_backlog(skb, cpu, &amp;rflow-&gt;last_qtail);
rcu_read_unlock();
return ret;
}
}
#endif
ret = __netif_receive_skb(skb); // 继续
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
if (sk_memalloc_socks() &amp;&amp; skb_pfmemalloc(skb)) {
unsigned long pflags = current-&gt;flags;
/*
* PFMEMALLOC skbs are special, they should
* - be delivered to SOCK_MEMALLOC sockets only
* - stay away from userspace
* - have bounded memory usage
*
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
current-&gt;flags |= PF_MEMALLOC;
ret = __netif_receive_skb_core(skb, true);
tsk_restore_flags(current, pflags, PF_MEMALLOC);
} else
ret = __netif_receive_skb_core(skb, false); // 继续
return ret;
}
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
// ...
list_for_each_entry_rcu(ptype, &amp;ptype_all, list) { // ptype_all
if (!ptype-&gt;dev || ptype-&gt;dev == skb-&gt;dev) {
if (pt_prev) // 注意:这里延迟处理,原因待确认
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
// ...
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb-&gt;dev : NULL;
type = skb-&gt;protocol;
list_for_each_entry_rcu(ptype,
&amp;ptype_base[ntohs(type) &amp; PTYPE_HASH_MASK], list) { // ptype_base
if (ptype-&gt;type == type &amp;&amp;
(ptype-&gt;dev == null_or_dev || ptype-&gt;dev == skb-&gt;dev ||
ptype-&gt;dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev); // 这里会增加 skb-&gt;user 计数,即会 clone 一份 skb。但是最后一个 ptype_base 则不会,往下看
pt_prev = ptype;
}
}
if (pt_prev) {
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev-&gt;func(skb, skb-&gt;dev, pt_prev, orig_dev); // 与上面不同,并非调用 deliver_skb,而是直接调用 func,这就避免了 clone,独占 skb
} else {
drop:
atomic_long_inc(&amp;skb-&gt;dev-&gt;rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
return ret;
}
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
return -ENOMEM;
atomic_inc(&amp;skb-&gt;users); // 注意
return pt_prev-&gt;func(skb, skb-&gt;dev, pt_prev, orig_dev); // 对于 IP 协议,存放的是 ip_rcv
}</code></pre>