公开学习文档

公开学习文档


网卡初始化

<h2>简述</h2> <p>主要分为 3 个阶段: 1、加载网卡驱动 2、探测到硬件后,调用 probe 函数 3、网卡 up 时,调用 open 函数</p> <p>probe 函数:</p> <p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=8bb3ea0000154cf11a813e10ac1878e5&amp;amp;file=file.png" alt="" /></p> <p>open 函数:</p> <p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=006d288749ac0aecf96de834fe14e655&amp;amp;file=file.png" alt="" /></p> <h2>网卡驱动初始化</h2> <p>网卡驱动在加载时进行的初始化,主要是注册一个 <code>igb_driver</code> :</p> <pre><code class="language-c">// file: igb_main.c /** * igb_init_module - Driver Registration Routine * * igb_init_module is the first routine called when the driver is * loaded. All it does is register with the PCI subsystem. **/ static int __init igb_init_module(void) { int ret; pr_info(&amp;quot;%s - version %s\n&amp;quot;, igb_driver_string, igb_driver_version); pr_info(&amp;quot;%s\n&amp;quot;, igb_copyright); #ifdef CONFIG_IGB_DCA dca_register_notify(&amp;amp;dca_notifier); #endif ret = pci_register_driver(&amp;amp;igb_driver); return ret; } module_init(igb_init_module);</code></pre> <p><code>igb_driver</code> 里主要关注 <code>.probe</code> 函数,当网卡设备被识别以后,就会调用 <code>.probe</code> 即 <code>igb_probe</code> 函数,其定义如下:</p> <pre><code class="language-c">// file: igb_main.c static struct pci_driver igb_driver = { .name = igb_driver_name, .id_table = igb_pci_tbl, .probe = igb_probe, // 主要关注此函数,会在检测硬件时执行 .remove = igb_remove, #ifdef CONFIG_PM .driver.pm = &amp;amp;igb_pm_ops, #endif .shutdown = igb_shutdown, .sriov_configure = igb_pci_sriov_configure, .err_handler = &amp;amp;igb_err_handler };</code></pre> <p><code>igb_probe</code> 函数里会完成以下工作:</p> <ol> <li>初始化 DMA</li> <li>注册 ethtool 函数</li> <li>注册 net_device_ops、netdev 等变量</li> <li>初始化 q_vector,以及 q_vector-&gt;napi,即 NAPI 初始化,注册 poll 函数</li> </ol> <pre><code class="language-c">// file: igb_main.c /** * igb_probe - Device Initialization Routine * @pdev: PCI device information struct * @ent: entry in igb_pci_tbl * * Returns 0 on success, negative on 
failure * * igb_probe initializes an adapter identified by a pci_dev structure. * The OS initialization, configuring of the adapter private structure, * and a hardware reset occur. **/ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { // ... netdev-&amp;gt;netdev_ops = &amp;amp;igb_netdev_ops; // 关注 .ndo_open 即 igb_open 函数,会在网卡 up 的时候调用 igb_set_ethtool_ops(netdev); // .. /* setup the private structure */ err = igb_sw_init(adapter); // 会进一步调用 igb_init_interrupt_scheme 设置中断资源,q_vectors 会在这里申请 if (err) goto err_sw_init; // .. } static const struct net_device_ops igb_netdev_ops = { .ndo_open = igb_open, .ndo_stop = igb_close, .ndo_start_xmit = igb_xmit_frame, .ndo_get_stats64 = igb_get_stats64, .ndo_set_rx_mode = igb_set_rx_mode, .ndo_set_mac_address = igb_set_mac, .ndo_change_mtu = igb_change_mtu, .ndo_do_ioctl = igb_ioctl, .ndo_tx_timeout = igb_tx_timeout, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = igb_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = igb_vlan_rx_kill_vid, .ndo_set_vf_mac = igb_ndo_set_vf_mac, .ndo_set_vf_vlan = igb_ndo_set_vf_vlan, .ndo_set_vf_tx_rate = igb_ndo_set_vf_bw, .ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk, .ndo_get_vf_config = igb_ndo_get_vf_config, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = igb_netpoll, #endif .ndo_fix_features = igb_fix_features, .ndo_set_features = igb_set_features, };</code></pre> <p>在 <code>igb_sw_init</code> 中会进一步调用 <code>igb_init_interrupt_scheme</code> 初始化 <code>q_vector</code>,如下:</p> <pre><code class="language-c">/** * igb_sw_init - Initialize general software structures (struct igb_adapter) * @adapter: board private structure to initialize * * igb_sw_init initializes the Adapter private data structure. * Fields are initialized based on PCI device information and * OS network device settings (MTU size). **/ static int igb_sw_init(struct igb_adapter *adapter) { // .. 
/* This call may decrease the number of queues */ if (igb_init_interrupt_scheme(adapter, true)) { dev_err(&amp;amp;pdev-&amp;gt;dev, &amp;quot;Unable to allocate memory for queues\n&amp;quot;); return -ENOMEM; } // .. } /** * igb_init_interrupt_scheme - initialize interrupts, allocate queues/vectors * @adapter: board private structure to initialize * @msix: boolean value of MSIX capability * * This function initializes the interrupts and allocates all of the queues. **/ static int igb_init_interrupt_scheme(struct igb_adapter *adapter, bool msix) { struct pci_dev *pdev = adapter-&amp;gt;pdev; int err; igb_set_interrupt_capability(adapter, msix); err = igb_alloc_q_vectors(adapter); // q_vectors if (err) { dev_err(&amp;amp;pdev-&amp;gt;dev, &amp;quot;Unable to allocate memory for vectors\n&amp;quot;); goto err_alloc_q_vectors; } igb_cache_ring_register(adapter); return 0; err_alloc_q_vectors: igb_reset_interrupt_capability(adapter); return err; } /** * igb_alloc_q_vectors - Allocate memory for interrupt vectors * @adapter: board private structure to initialize * * We allocate one q_vector per queue interrupt. If allocation fails we * return -ENOMEM. 
**/ static int igb_alloc_q_vectors(struct igb_adapter *adapter) { int q_vectors = adapter-&amp;gt;num_q_vectors; int rxr_remaining = adapter-&amp;gt;num_rx_queues; int txr_remaining = adapter-&amp;gt;num_tx_queues; int rxr_idx = 0, txr_idx = 0, v_idx = 0; int err; if (q_vectors &amp;gt;= (rxr_remaining + txr_remaining)) { for (; rxr_remaining; v_idx++) { err = igb_alloc_q_vector(adapter, q_vectors, v_idx, 0, 0, 1, rxr_idx); // q_vector if (err) goto err_out; /* update counts and index */ rxr_remaining--; rxr_idx++; } } for (; v_idx &amp;lt; q_vectors; v_idx++) { int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx); int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx); err = igb_alloc_q_vector(adapter, q_vectors, v_idx, tqpv, txr_idx, rqpv, rxr_idx); if (err) goto err_out; /* update counts and index */ rxr_remaining -= rqpv; txr_remaining -= tqpv; rxr_idx++; txr_idx++; } return 0; err_out: adapter-&amp;gt;num_tx_queues = 0; adapter-&amp;gt;num_rx_queues = 0; adapter-&amp;gt;num_q_vectors = 0; while (v_idx--) igb_free_q_vector(adapter, v_idx); return -ENOMEM; } /** * igb_alloc_q_vector - Allocate memory for a single interrupt vector * @adapter: board private structure to initialize * @v_count: q_vectors allocated on adapter, used for ring interleaving * @v_idx: index of vector in adapter struct * @txr_count: total number of Tx rings to allocate * @txr_idx: index of first Tx ring to allocate * @rxr_count: total number of Rx rings to allocate * @rxr_idx: index of first Rx ring to allocate * * We allocate one q_vector. If allocation fails we return -ENOMEM. **/ static int igb_alloc_q_vector(struct igb_adapter *adapter, int v_count, int v_idx, int txr_count, int txr_idx, int rxr_count, int rxr_idx) { // .. 
ring_count = txr_count + rxr_count; size = sizeof(struct igb_q_vector) + (sizeof(struct igb_ring) * ring_count); // q_vector 下面就是 igb_ring 数组 /* allocate q_vector and rings */ q_vector = kzalloc(size, GFP_KERNEL); // 为新的 q_vector 对象申请内存 if (!q_vector) return -ENOMEM; /* initialize NAPI */ netif_napi_add(adapter-&amp;gt;netdev, &amp;amp;q_vector-&amp;gt;napi, igb_poll, 64); // 注册了 igb_poll 函数。napi 也是每个中断一个(也就是每个 CPU 一个)。 /* tie q_vector and adapter together */ adapter-&amp;gt;q_vector[v_idx] = q_vector; // 放到 q_vector 数组中 q_vector-&amp;gt;adapter = adapter; // ... ring 初始化,具体存放 skb 的内存,不在这里分配 }</code></pre> <p>画个图:</p> <p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=9333fe445199454e68540d09e8491f95&amp;amp;file=file.png" alt="" /></p> <pre><code class="language-c">// file: net/core/dev.c void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&amp;amp;napi-&amp;gt;poll_list); napi-&amp;gt;gro_count = 0; napi-&amp;gt;gro_list = NULL; napi-&amp;gt;skb = NULL; napi-&amp;gt;poll = poll; // 这里设置 napi 的 poll 函数 if (weight &amp;gt; NAPI_POLL_WEIGHT) pr_err_once(&amp;quot;netif_napi_add() called with weight %d on device %s\n&amp;quot;, weight, dev-&amp;gt;name); napi-&amp;gt;weight = weight; list_add(&amp;amp;napi-&amp;gt;dev_list, &amp;amp;dev-&amp;gt;napi_list); // 有个 napi_list 保存所有 napi napi-&amp;gt;dev = dev; #ifdef CONFIG_NETPOLL spin_lock_init(&amp;amp;napi-&amp;gt;poll_lock); napi-&amp;gt;poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &amp;amp;napi-&amp;gt;state); } EXPORT_SYMBOL(netif_napi_add);</code></pre> <h2>启动网卡</h2> <p>启动网卡,调用 <code>igb_open</code> -&gt; <code>__igb_open</code>,主要完成以下工作: 1、分配发送、接收描述符数组,即环形缓冲区 2、注册中断处理函数,即 <code>igb_msix_ring</code> 3、启用 NAPI</p> <pre><code class="language-c">// file: igb_main.c /** * igb_open - Called when a network interface is made active * @netdev: network interface device structure * * Returns 0 on success, 
negative value on failure * * The open entry point is called when a network interface is made * active by the system (IFF_UP). At this point all resources needed * for transmit and receive operations are allocated, the interrupt * handler is registered with the OS, the watchdog timer is started, * and the stack is notified that the interface is ready. **/ static int __igb_open(struct net_device *netdev, bool resuming) { struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &amp;amp;adapter-&amp;gt;hw; struct pci_dev *pdev = adapter-&amp;gt;pdev; int err; int i; /* disallow open during test */ if (test_bit(__IGB_TESTING, &amp;amp;adapter-&amp;gt;state)) { WARN_ON(resuming); return -EBUSY; } if (!resuming) pm_runtime_get_sync(&amp;amp;pdev-&amp;gt;dev); netif_carrier_off(netdev); /* allocate transmit descriptors */ err = igb_setup_all_tx_resources(adapter); // 发送描述符数组 if (err) goto err_setup_tx; /* allocate receive descriptors */ err = igb_setup_all_rx_resources(adapter); // 接收描述符数组 if (err) goto err_setup_rx; igb_power_up_link(adapter); /* before we allocate an interrupt, we must be ready to handle it. * Setting DEBUG_SHIRQ in the kernel makes it fire an interrupt * as soon as we call pci_request_irq, so we have to setup our * clean_rx handler before we do so. */ igb_configure(adapter); err = igb_request_irq(adapter); // 注册中断处理函数 if (err) goto err_req_irq; /* Notify the stack of the actual queue counts. 
*/ err = netif_set_real_num_tx_queues(adapter-&amp;gt;netdev, adapter-&amp;gt;num_tx_queues); if (err) goto err_set_queues; err = netif_set_real_num_rx_queues(adapter-&amp;gt;netdev, adapter-&amp;gt;num_rx_queues); if (err) goto err_set_queues; /* From here on the code is the same as igb_up() */ clear_bit(__IGB_DOWN, &amp;amp;adapter-&amp;gt;state); for (i = 0; i &amp;lt; adapter-&amp;gt;num_q_vectors; i++) // 中断数量 napi_enable(&amp;amp;(adapter-&amp;gt;q_vector[i]-&amp;gt;napi)); // 启用 NAPI,里面只是设置一些标记而已。注意到每个 q_vector 中都有一个 napi 成员,具体见下面 q_vector 结构体定义 /* Clear any pending interrupts. */ rd32(E1000_ICR); igb_irq_enable(adapter); /* notify VFs that reset has been completed */ if (adapter-&amp;gt;vfs_allocated_count) { u32 reg_data = rd32(E1000_CTRL_EXT); reg_data |= E1000_CTRL_EXT_PFRSTD; wr32(E1000_CTRL_EXT, reg_data); } netif_tx_start_all_queues(netdev); if (!resuming) pm_runtime_put(&amp;amp;pdev-&amp;gt;dev); /* start the watchdog. */ hw-&amp;gt;mac.get_link_status = 1; schedule_work(&amp;amp;adapter-&amp;gt;watchdog_task); return 0; err_set_queues: igb_free_irq(adapter); err_req_irq: igb_release_hw_control(adapter); igb_power_down_link(adapter); igb_free_all_rx_resources(adapter); err_setup_rx: igb_free_all_tx_resources(adapter); err_setup_tx: igb_reset(adapter); if (!resuming) pm_runtime_put(&amp;amp;pdev-&amp;gt;dev); return err; }</code></pre> <p>分配描述符数组,即环形缓冲区,这里以接收为例: 1、环形缓冲区 rx_ring 有 2 部分内存:rx_ring-&gt;rx_buffer_info 和 rx_ring-&gt;desc(DMA) 前者是内核使用;后者是网卡硬件使用。</p> <pre><code class="language-c">// file: igb_main.c /** * igb_setup_all_rx_resources - wrapper to allocate Rx resources * (Descriptors) for all queues * @adapter: board private structure * * Return 0 on success, negative on failure **/ static int igb_setup_all_rx_resources(struct igb_adapter *adapter) { struct pci_dev *pdev = adapter-&amp;gt;pdev; int i, err = 0; for (i = 0; i &amp;lt; adapter-&amp;gt;num_rx_queues; i++) { err = igb_setup_rx_resources(adapter-&amp;gt;rx_ring[i]); // 具体为每个 ring 
分配可存放 skb 的内存。每个队列都设置 rx_ring(发送队列也是单独的)。是环形缓冲区? if (err) { dev_err(&amp;amp;pdev-&amp;gt;dev, &amp;quot;Allocation for Rx Queue %u failed\n&amp;quot;, i); for (i--; i &amp;gt;= 0; i--) igb_free_rx_resources(adapter-&amp;gt;rx_ring[i]); break; } } return err; } /** * igb_setup_rx_resources - allocate Rx resources (Descriptors) * @rx_ring: Rx descriptor ring (for a specific queue) to setup * * Returns 0 on success, negative on failure **/ int igb_setup_rx_resources(struct igb_ring *rx_ring) { struct device *dev = rx_ring-&amp;gt;dev; int size; size = sizeof(struct igb_rx_buffer) * rx_ring-&amp;gt;count; // 环形缓冲区大小 rx_ring-&amp;gt;rx_buffer_info = vzalloc(size); if (!rx_ring-&amp;gt;rx_buffer_info) goto err; /* Round up to nearest 4K */ rx_ring-&amp;gt;size = rx_ring-&amp;gt;count * sizeof(union e1000_adv_rx_desc); rx_ring-&amp;gt;size = ALIGN(rx_ring-&amp;gt;size, 4096); rx_ring-&amp;gt;desc = dma_alloc_coherent(dev, rx_ring-&amp;gt;size, &amp;amp;rx_ring-&amp;gt;dma, GFP_KERNEL); if (!rx_ring-&amp;gt;desc) goto err; rx_ring-&amp;gt;next_to_alloc = 0; rx_ring-&amp;gt;next_to_clean = 0; rx_ring-&amp;gt;next_to_use = 0; return 0; err: vfree(rx_ring-&amp;gt;rx_buffer_info); rx_ring-&amp;gt;rx_buffer_info = NULL; dev_err(dev, &amp;quot;Unable to allocate memory for the Rx descriptor ring\n&amp;quot;); return -ENOMEM; }</code></pre> <p>注册中断函数: 1、一共申请 adapter-&gt;num_q_vectors 个中断,和队列数量是否相等?(环形缓冲区数量和队列数量是一致的) 2、</p> <pre><code class="language-c">// file: igb_main.c /** * igb_request_irq - initialize interrupts * @adapter: board private structure to initialize * * Attempts to configure interrupts using the best available * capabilities of the hardware and kernel. 
**/ static int igb_request_irq(struct igb_adapter *adapter) { struct net_device *netdev = adapter-&amp;gt;netdev; struct pci_dev *pdev = adapter-&amp;gt;pdev; int err = 0; if (adapter-&amp;gt;msix_entries) { err = igb_request_msix(adapter); // 一般是这种方式 if (!err) goto request_done; /* fall back to MSI */ igb_free_all_tx_resources(adapter); igb_free_all_rx_resources(adapter); igb_clear_interrupt_scheme(adapter); err = igb_init_interrupt_scheme(adapter, false); if (err) goto request_done; igb_setup_all_tx_resources(adapter); igb_setup_all_rx_resources(adapter); igb_configure(adapter); } igb_assign_vector(adapter-&amp;gt;q_vector[0], 0); if (adapter-&amp;gt;flags &amp;amp; IGB_FLAG_HAS_MSI) { err = request_irq(pdev-&amp;gt;irq, igb_intr_msi, 0, netdev-&amp;gt;name, adapter); if (!err) goto request_done; /* fall back to legacy interrupts */ igb_reset_interrupt_capability(adapter); adapter-&amp;gt;flags &amp;amp;= ~IGB_FLAG_HAS_MSI; } err = request_irq(pdev-&amp;gt;irq, igb_intr, IRQF_SHARED, netdev-&amp;gt;name, adapter); if (err) dev_err(&amp;amp;pdev-&amp;gt;dev, &amp;quot;Error %d getting interrupt\n&amp;quot;, err); request_done: return err; } /** * igb_request_msix - Initialize MSI-X interrupts * @adapter: board private structure to initialize * * igb_request_msix allocates MSI-X vectors and requests interrupts from the * kernel. 
**/ static int igb_request_msix(struct igb_adapter *adapter) { struct net_device *netdev = adapter-&amp;gt;netdev; struct e1000_hw *hw = &amp;amp;adapter-&amp;gt;hw; int i, err = 0, vector = 0, free_vector = 0; err = request_irq(adapter-&amp;gt;msix_entries[vector].vector, igb_msix_other, 0, netdev-&amp;gt;name, adapter); // 不知道是什么 if (err) goto err_out; for (i = 0; i &amp;lt; adapter-&amp;gt;num_q_vectors; i++) { // 根据 num_q_vectors 中断数量 struct igb_q_vector *q_vector = adapter-&amp;gt;q_vector[i]; // 每个中断都有一个 q_vector 结构 vector++; q_vector-&amp;gt;itr_register = hw-&amp;gt;hw_addr + E1000_EITR(vector); if (q_vector-&amp;gt;rx.ring &amp;amp;&amp;amp; q_vector-&amp;gt;tx.ring) // 发送接收队列 sprintf(q_vector-&amp;gt;name, &amp;quot;%s-TxRx-%u&amp;quot;, netdev-&amp;gt;name, q_vector-&amp;gt;rx.ring-&amp;gt;queue_index); else if (q_vector-&amp;gt;tx.ring) // 发送队列 sprintf(q_vector-&amp;gt;name, &amp;quot;%s-tx-%u&amp;quot;, netdev-&amp;gt;name, q_vector-&amp;gt;tx.ring-&amp;gt;queue_index); else if (q_vector-&amp;gt;rx.ring) // 接收队列 sprintf(q_vector-&amp;gt;name, &amp;quot;%s-rx-%u&amp;quot;, netdev-&amp;gt;name, q_vector-&amp;gt;rx.ring-&amp;gt;queue_index); else sprintf(q_vector-&amp;gt;name, &amp;quot;%s-unused&amp;quot;, netdev-&amp;gt;name); err = request_irq(adapter-&amp;gt;msix_entries[vector].vector, igb_msix_ring, 0, q_vector-&amp;gt;name, q_vector); // 注册中断响应函数。这里的 vector 是 irq 的概念,并非中断向量。另外,这里 q_vector 是作为中断响应函数的回调参数 if (err) goto err_free; } igb_configure_msix(adapter); return 0; err_free: /* free already assigned IRQs */ free_irq(adapter-&amp;gt;msix_entries[free_vector++].vector, adapter); vector--; for (i = 0; i &amp;lt; vector; i++) { free_irq(adapter-&amp;gt;msix_entries[free_vector++].vector, adapter-&amp;gt;q_vector[i]); } err_out: return err; } // file: include/linux/interrupt.h struct igb_q_vector { struct igb_adapter *adapter; /* backlink */ int cpu; /* CPU for DCA */ u32 eims_value; /* EIMS mask value */ u16 itr_val; u8 set_itr; void __iomem 
*itr_register; struct igb_ring_container rx, tx; struct napi_struct napi; // 中断响应函数会用到 struct rcu_head rcu; /* to avoid race with update stats on free */ char name[IFNAMSIZ + 9]; /* for dynamic allocation of rings associated with this q_vector */ struct igb_ring ring[0] ____cacheline_internodealigned_in_smp; // 零长数组,注意存放的并非指针,而是结构体,但 ring 变量本身属于指针(数组名称) }; static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) // dev 是中断函数回调参数,不知道为什么要取 dev 这个名字 { return request_threaded_irq(irq, handler, NULL, flags, name, dev); } // file: igb_main.c static irqreturn_t igb_msix_ring(int irq, void *data) { struct igb_q_vector *q_vector = data; // 取出回调参数,即 q_vector /* Write the ITR value calculated from the previous interrupt. */ igb_write_itr(q_vector); napi_schedule(&amp;amp;q_vector-&amp;gt;napi); return IRQ_HANDLED; }</code></pre>

页面列表

ITEM_HTML