Local packet reception
<h2>Overview</h2>
<p>1. In the softirq handler <code>net_rx_action</code>, the poll function of softnet_data->backlog is invoked; that poll function is <code>process_backlog</code>.
2. <code>process_backlog</code>: splice the contents of sd->input_pkt_queue onto sd->process_queue, then call <code>__netif_receive_skb</code> for each skb to push it up the protocol stack (sketched below).</p>
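<p>In call-graph form, the path walked through below looks roughly like this (a summary of the code that follows, not a literal listing):</p>
<pre><code class="language-c">/*
 * NET_RX_SOFTIRQ
 *   net_rx_action()                          // walk sd-&gt;poll_list
 *     sd-&gt;backlog.poll() == process_backlog()
 *       splice sd-&gt;input_pkt_queue onto sd-&gt;process_queue
 *       __netif_receive_skb()                // enter the protocol stack
 *         __netif_receive_skb_core()
 *           deliver_skb() -&gt; ip_rcv()
 */</code></pre>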
<h2>Analysis</h2>
<p>Picking up where the previous article left off: once the NET_RX_SOFTIRQ softirq has been raised, its handler <code>net_rx_action</code> runs:</p>
<pre><code class="language-c">// file: net/core/dev.c
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &amp;__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&amp;sd-&gt;poll_list)) {
struct napi_struct *n;
int work, weight;
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget &lt;= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only -&gt;poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&amp;sd-&gt;poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n-&gt;weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the -&gt;poll() call. Therefore we avoid
* accidentally calling -&gt;poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &amp;n-&gt;state)) {
work = n-&gt;poll(n, weight);
trace_napi_poll(n);
}
WARN_ON_ONCE(work &gt; weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still &quot;owns&quot; the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n-&gt;gro_list) {
/* flush too old packets
* If HZ &lt; 1000, flush all packets.
*/
local_irq_enable();
napi_gro_flush(n, HZ &gt;= 1000);
local_irq_disable();
}
list_move_tail(&amp;n-&gt;poll_list, &amp;sd-&gt;poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
dma_issue_pending_all();
#endif
return;
softnet_break:
sd-&gt;time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}</code></pre>
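<p>Two knobs appear in this loop: <code>netdev_budget</code> (default 300, sysctl <code>net.core.netdev_budget</code>) caps the total work per softirq round, while each NAPI instance's <code>weight</code> caps the work per <code>->poll()</code> call; for the backlog that weight is <code>weight_p</code> (default 64, sysctl <code>net.core.dev_weight</code>). As for how the backlog NAPI got onto sd->poll_list in the first place: the send side (previous article) scheduled it when it queued the skb, using this small helper:</p>
<pre><code class="language-c">// file: net/core/dev.c
/* Called with irqs disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&amp;napi-&gt;poll_list, &amp;sd-&gt;poll_list); // make it visible to net_rx_action
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);           // and request the RX softirq
}</code></pre>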
<p>So where does sd->backlog's poll function come from? It is set at boot, in <code>net_dev_init</code>:</p>
<pre><code class="language-c">// file: net/core/dev.c
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
INIT_LIST_HEAD(&amp;ptype_all);
for (i = 0; i &lt; PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&amp;ptype_base[i]);
INIT_LIST_HEAD(&amp;offload_base);
if (register_pernet_subsys(&amp;netdev_net_ops))
goto out;
/*
* Initialise the packet receive queues.
*/
for_each_possible_cpu(i) {
struct softnet_data *sd = &amp;per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&amp;sd-&gt;input_pkt_queue);
skb_queue_head_init(&amp;sd-&gt;process_queue);
sd-&gt;completion_queue = NULL;
INIT_LIST_HEAD(&amp;sd-&gt;poll_list);
sd-&gt;output_queue = NULL;
sd-&gt;output_queue_tailp = &amp;sd-&gt;output_queue;
#ifdef CONFIG_RPS
sd-&gt;csd.func = rps_trigger_softirq;
sd-&gt;csd.info = sd;
sd-&gt;csd.flags = 0;
sd-&gt;cpu = i;
#endif
sd-&gt;backlog.poll = process_backlog; // set here; note that every CPU has its own softnet_data, so each one must be initialised
sd-&gt;backlog.weight = weight_p;
sd-&gt;backlog.gro_list = NULL;
sd-&gt;backlog.gro_count = 0;
}
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
* is present in a network namespace the loopback device must
* be present. Since we now dynamically allocate and free the
* loopback device ensure this invariant is maintained by
* keeping the loopback device as the first device on the
* list of network devices. Ensuring the loopback devices
* is the first device that appears and the last network device
* that disappears.
*/
if (register_pernet_device(&amp;loopback_net_ops))
goto out;
if (register_pernet_device(&amp;default_device_ops))
goto out;
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
rc = 0;
out:
return rc;
}
</code></pre>
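<p>Before moving on to <code>process_backlog</code>, recall from the previous article how the skb got into sd->input_pkt_queue in the first place: on the loopback path, <code>netif_rx</code> ends up in <code>enqueue_to_backlog</code>, which appends the skb and, if the queue was empty, schedules the backlog NAPI. The sketch below is simplified; the RPS and overflow/drop handling of the real function are left out:</p>
<pre><code class="language-c">// file: net/core/dev.c (simplified sketch; RPS and drop handling omitted)
static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail)
{
    struct softnet_data *sd = &amp;per_cpu(softnet_data, cpu);
    unsigned long flags;

    local_irq_save(flags);
    rps_lock(sd);
    if (!skb_queue_len(&amp;sd-&gt;input_pkt_queue)) {
        /* queue was empty: schedule the backlog NAPI so that
         * net_rx_action will eventually call process_backlog */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &amp;sd-&gt;backlog.state))
            ____napi_schedule(sd, &amp;sd-&gt;backlog);
    }
    __skb_queue_tail(&amp;sd-&gt;input_pkt_queue, skb); // the queue process_backlog splices from
    input_queue_tail_incr_save(sd, qtail);
    rps_unlock(sd);
    local_irq_restore(flags);
    return NET_RX_SUCCESS;
}</code></pre>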
<p>Now back to the receive path, continuing with <code>process_backlog</code>:</p>
<pre><code class="language-c">// file: net/core/dev.c
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd-&gt;rps_ipi_list) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
#endif
napi-&gt;weight = weight_p;
local_irq_disable();
while (work &lt; quota) {
struct sk_buff *skb;
unsigned int qlen;
while ((skb = __skb_dequeue(&amp;sd-&gt;process_queue))) { // this is the queue that actually gets drained
rcu_read_lock();
local_irq_enable();
__netif_receive_skb(skb); // hand the skb to the protocol stack
rcu_read_unlock();
local_irq_disable();
input_queue_head_incr(sd);
if (++work &gt;= quota) {
local_irq_enable();
return work;
}
}
rps_lock(sd);
qlen = skb_queue_len(&amp;sd-&gt;input_pkt_queue); // skbs sent via lo were queued here
if (qlen)
skb_queue_splice_tail_init(&amp;sd-&gt;input_pkt_queue,
&amp;sd-&gt;process_queue); // append the contents of sd-&gt;input_pkt_queue to the tail of sd-&gt;process_queue and re-initialise input_pkt_queue to empty
if (qlen &lt; quota - work) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set on backlog.
* we can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&amp;napi-&gt;poll_list);
napi-&gt;state = 0;
quota = work + qlen;
}
rps_unlock(sd);
}
local_irq_enable();
return work;
}
// file: include/linux/skbuff.h
/**
* skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
* @list: the new list to add
* @head: the place to add it in the first list
*
* Each of the lists is a queue.
* The list at @list is reinitialised
*/
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, head-&gt;prev, (struct sk_buff *) head);
head-&gt;qlen += list-&gt;qlen;
__skb_queue_head_init(list);
}
}
</code></pre>
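<p>A detail worth noting: when CONFIG_RPS is not set, <code>rps_lock()</code>/<code>rps_unlock()</code> compile to nothing, so sd->input_pkt_queue is protected purely by the <code>local_irq_disable()</code>/<code>local_irq_enable()</code> pairs above, since packets are only ever enqueued to the local CPU's queue in that case. Abridged from the same file:</p>
<pre><code class="language-c">// file: net/core/dev.c
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    spin_lock(&amp;sd-&gt;input_pkt_queue.lock); /* with RPS, other CPUs may enqueue to us */
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    spin_unlock(&amp;sd-&gt;input_pkt_queue.lock);
#endif
}</code></pre>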
<p>From <code>__netif_receive_skb</code> onwards the call path is the same as usual: <code>__netif_receive_skb</code> => <code>__netif_receive_skb_core</code> => <code>deliver_skb</code>, which hands the skb to <code>ip_rcv</code>.</p>
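<p><code>deliver_skb</code> simply invokes the <code>func</code> callback of the matching <code>packet_type</code>; for IPv4 that callback is <code>ip_rcv</code>, registered by <code>inet_init</code> through <code>dev_add_pack</code>. Abridged for reference:</p>
<pre><code class="language-c">// file: net/core/dev.c
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
    if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
        return -ENOMEM;
    atomic_inc(&amp;skb-&gt;users);
    return pt_prev-&gt;func(skb, skb-&gt;dev, pt_prev, orig_dev); /* e.g. ip_rcv() */
}

// file: net/ipv4/af_inet.c (abridged)
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
};

static int __init inet_init(void)
{
    /* ... protocol registration omitted ... */
    dev_add_pack(&amp;ip_packet_type);
    return 0;
}</code></pre>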