Local packet reception
<h2>Overview</h2>
<p>1. In the softirq handler <code>net_rx_action</code>, the poll function of softnet_data->backlog is invoked; that poll function is <code>process_backlog</code>.
2. <code>process_backlog</code>: splice the contents of sd->input_pkt_queue onto sd->process_queue, then call <code>__netif_receive_skb</code> for each skb to push it up the protocol stack (sketched below).</p>
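<p>In call-graph form, the path walked through below looks roughly like this (a summary of the code that follows, not a literal listing):</p>
<pre><code class="language-c">/*
 * NET_RX_SOFTIRQ
 *   net_rx_action()                          // walk sd-&gt;poll_list
 *     sd-&gt;backlog.poll() == process_backlog()
 *       splice sd-&gt;input_pkt_queue onto sd-&gt;process_queue
 *       __netif_receive_skb()                // enter the protocol stack
 *         __netif_receive_skb_core()
 *           deliver_skb() -&gt; ip_rcv()
 */</code></pre>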
<h2>Analysis</h2>
<p>Picking up where the previous article left off: once the NET_RX_SOFTIRQ softirq has been raised, its handler <code>net_rx_action</code> runs:</p>
<pre><code class="language-c">// file: net/core/dev.c
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &amp;__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&amp;sd-&gt;poll_list)) {
struct napi_struct *n;
int work, weight;
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget &lt;= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only -&gt;poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&amp;sd-&gt;poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n-&gt;weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the -&gt;poll() call. Therefore we avoid
* accidentally calling -&gt;poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &amp;n-&gt;state)) {
work = n-&gt;poll(n, weight);
trace_napi_poll(n);
}
WARN_ON_ONCE(work &gt; weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still &quot;owns&quot; the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n-&gt;gro_list) {
/* flush too old packets
* If HZ &lt; 1000, flush all packets.
*/
local_irq_enable();
napi_gro_flush(n, HZ &gt;= 1000);
local_irq_disable();
}
list_move_tail(&amp;n-&gt;poll_list, &amp;sd-&gt;poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
dma_issue_pending_all();
#endif
return;
softnet_break:
sd-&gt;time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}</code></pre>
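<p>Two knobs appear in this loop: <code>netdev_budget</code> (default 300, sysctl <code>net.core.netdev_budget</code>) caps the total work per softirq round, while each NAPI instance's <code>weight</code> caps the work per <code>->poll()</code> call; for the backlog that weight is <code>weight_p</code> (default 64, sysctl <code>net.core.dev_weight</code>). As for how the backlog NAPI got onto sd->poll_list in the first place: the send side (previous article) scheduled it when it queued the skb, using this small helper:</p>
<pre><code class="language-c">// file: net/core/dev.c
/* Called with irqs disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&amp;napi-&gt;poll_list, &amp;sd-&gt;poll_list); // make it visible to net_rx_action
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);           // and request the RX softirq
}</code></pre>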
<p>So where does sd->backlog's poll function come from? It is set at boot, in <code>net_dev_init</code>:</p>
<pre><code class="language-c">// file: net/core/dev.c
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
INIT_LIST_HEAD(&amp;ptype_all);
for (i = 0; i &lt; PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&amp;ptype_base[i]);
INIT_LIST_HEAD(&amp;offload_base);
if (register_pernet_subsys(&amp;netdev_net_ops))
goto out;
/*
* Initialise the packet receive queues.
*/
for_each_possible_cpu(i) {
struct softnet_data *sd = &amp;per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&amp;sd-&gt;input_pkt_queue);
skb_queue_head_init(&amp;sd-&gt;process_queue);
sd-&gt;completion_queue = NULL;
INIT_LIST_HEAD(&amp;sd-&gt;poll_list);
sd-&gt;output_queue = NULL;
sd-&gt;output_queue_tailp = &amp;sd-&gt;output_queue;
#ifdef CONFIG_RPS
sd-&gt;csd.func = rps_trigger_softirq;
sd-&gt;csd.info = sd;
sd-&gt;csd.flags = 0;
sd-&gt;cpu = i;
#endif
sd-&gt;backlog.poll = process_backlog; // set here; note that every CPU has its own softnet_data, so each one must be initialised
sd-&gt;backlog.weight = weight_p;
sd-&gt;backlog.gro_list = NULL;
sd-&gt;backlog.gro_count = 0;
}
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
* is present in a network namespace the loopback device must
* be present. Since we now dynamically allocate and free the
* loopback device ensure this invariant is maintained by
* keeping the loopback device as the first device on the
* list of network devices. Ensuring the loopback devices
* is the first device that appears and the last network device
* that disappears.
*/
if (register_pernet_device(&amp;loopback_net_ops))
goto out;
if (register_pernet_device(&amp;default_device_ops))
goto out;
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
rc = 0;
out:
return rc;
}
</code></pre>
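<p>Before moving on to <code>process_backlog</code>, recall from the previous article how the skb got into sd->input_pkt_queue in the first place: on the loopback path, <code>netif_rx</code> ends up in <code>enqueue_to_backlog</code>, which appends the skb and, if the queue was empty, schedules the backlog NAPI. The sketch below is simplified; the RPS and overflow/drop handling of the real function are left out:</p>
<pre><code class="language-c">// file: net/core/dev.c (simplified sketch; RPS and drop handling omitted)
static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail)
{
    struct softnet_data *sd = &amp;per_cpu(softnet_data, cpu);
    unsigned long flags;

    local_irq_save(flags);
    rps_lock(sd);
    if (!skb_queue_len(&amp;sd-&gt;input_pkt_queue)) {
        /* queue was empty: schedule the backlog NAPI so that
         * net_rx_action will eventually call process_backlog */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &amp;sd-&gt;backlog.state))
            ____napi_schedule(sd, &amp;sd-&gt;backlog);
    }
    __skb_queue_tail(&amp;sd-&gt;input_pkt_queue, skb); // the queue process_backlog splices from
    input_queue_tail_incr_save(sd, qtail);
    rps_unlock(sd);
    local_irq_restore(flags);
    return NET_RX_SUCCESS;
}</code></pre>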
<p>Now back to the receive path, continuing with <code>process_backlog</code>:</p>
<pre><code class="language-c">// file: net/core/dev.c
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
#ifdef CONFIG_RPS
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd-&gt;rps_ipi_list) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
#endif
napi-&gt;weight = weight_p;
local_irq_disable();
while (work &lt; quota) {
struct sk_buff *skb;
unsigned int qlen;
while ((skb = __skb_dequeue(&amp;sd-&gt;process_queue))) { // this is the queue that actually gets drained
rcu_read_lock();
local_irq_enable();
__netif_receive_skb(skb); // hand the skb to the protocol stack
rcu_read_unlock();
local_irq_disable();
input_queue_head_incr(sd);
if (++work &gt;= quota) {
local_irq_enable();
return work;
}
}
rps_lock(sd);
qlen = skb_queue_len(&amp;sd-&gt;input_pkt_queue); // skbs sent via lo were queued here
if (qlen)
skb_queue_splice_tail_init(&amp;sd-&gt;input_pkt_queue,
&amp;sd-&gt;process_queue); // append the contents of sd-&gt;input_pkt_queue to the tail of sd-&gt;process_queue and re-initialise input_pkt_queue to empty
if (qlen &lt; quota - work) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set on backlog.
* we can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&amp;napi-&gt;poll_list);
napi-&gt;state = 0;
quota = work + qlen;
}
rps_unlock(sd);
}
local_irq_enable();
return work;
}
// file: include/linux/skbuff.h
/**
* skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
* @list: the new list to add
* @head: the place to add it in the first list
*
* Each of the lists is a queue.
* The list at @list is reinitialised
*/
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, head-&gt;prev, (struct sk_buff *) head);
head-&gt;qlen += list-&gt;qlen;
__skb_queue_head_init(list);
}
}
</code></pre>
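<p>A detail worth noting: when CONFIG_RPS is not set, <code>rps_lock()</code>/<code>rps_unlock()</code> compile to nothing, so sd->input_pkt_queue is protected purely by the <code>local_irq_disable()</code>/<code>local_irq_enable()</code> pairs above, since packets are only ever enqueued to the local CPU's queue in that case. Abridged from the same file:</p>
<pre><code class="language-c">// file: net/core/dev.c
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    spin_lock(&amp;sd-&gt;input_pkt_queue.lock); /* with RPS, other CPUs may enqueue to us */
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    spin_unlock(&amp;sd-&gt;input_pkt_queue.lock);
#endif
}</code></pre>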
<p>From <code>__netif_receive_skb</code> onwards the call path is the same as usual: <code>__netif_receive_skb</code> => <code>__netif_receive_skb_core</code> => <code>deliver_skb</code>, which hands the skb to <code>ip_rcv</code>.</p>
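<p><code>deliver_skb</code> simply invokes the <code>func</code> callback of the matching <code>packet_type</code>; for IPv4 that callback is <code>ip_rcv</code>, registered by <code>inet_init</code> through <code>dev_add_pack</code>. Abridged for reference:</p>
<pre><code class="language-c">// file: net/core/dev.c
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
    if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
        return -ENOMEM;
    atomic_inc(&amp;skb-&gt;users);
    return pt_prev-&gt;func(skb, skb-&gt;dev, pt_prev, orig_dev); /* e.g. ip_rcv() */
}

// file: net/ipv4/af_inet.c (abridged)
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
};

static int __init inet_init(void)
{
    /* ... protocol registration omitted ... */
    dev_add_pack(&amp;ip_packet_type);
    return 0;
}</code></pre>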