网卡初始化
<h2>简述</h2>
<p>主要分为 3 个阶段:
1、加载网卡驱动
2、探测到硬件后,调用 probe 函数
3、网卡 up 时,调用 open 函数</p>
<p>probe 函数:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=8bb3ea0000154cf11a813e10ac1878e5&amp;file=file.png" alt="" /></p>
<p>open 函数:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=006d288749ac0aecf96de834fe14e655&amp;file=file.png" alt="" /></p>
<h2>网卡驱动初始化</h2>
<p>网卡驱动在加载时进行的初始化,主要是注册一个 <code>igb_driver</code> :</p>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_init_module - Driver Registration Routine
*
* igb_init_module is the first routine called when the driver is
* loaded. All it does is register with the PCI subsystem.
**/
static int __init igb_init_module(void)
{
int ret;
pr_info(&quot;%s - version %s\n&quot;,
igb_driver_string, igb_driver_version);
pr_info(&quot;%s\n&quot;, igb_copyright);
#ifdef CONFIG_IGB_DCA
dca_register_notify(&amp;dca_notifier);
#endif
ret = pci_register_driver(&amp;igb_driver);
return ret;
}
module_init(igb_init_module);</code></pre>
<p><code>igb_driver</code> 里主要关注 <code>.probe</code> 函数,当网卡设备被识别以后,就会调用 <code>.probe</code> 即 <code>igb_probe</code> 函数,其定义如下:</p>
<pre><code class="language-c">// file: igb_main.c
static struct pci_driver igb_driver = {
.name = igb_driver_name,
.id_table = igb_pci_tbl,
.probe = igb_probe, // 主要关注此函数,会在检测硬件时执行
.remove = igb_remove,
#ifdef CONFIG_PM
.driver.pm = &amp;igb_pm_ops,
#endif
.shutdown = igb_shutdown,
.sriov_configure = igb_pci_sriov_configure,
.err_handler = &amp;igb_err_handler
};</code></pre>
<p><code>igb_probe</code> 函数里会完成以下工作:</p>
<ol>
<li>初始化 DMA</li>
<li>注册 ethtool 函数</li>
<li>注册 net_device_ops、netdev 等变量</li>
<li>初始化 q_vector,以及 q_vector->napi,即 NAPI 初始化,注册 poll 函数</li>
</ol>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_probe - Device Initialization Routine
* @pdev: PCI device information struct
* @ent: entry in igb_pci_tbl
*
* Returns 0 on success, negative on failure
*
* igb_probe initializes an adapter identified by a pci_dev structure.
* The OS initialization, configuring of the adapter private structure,
* and a hardware reset occur.
**/
static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
// ...
netdev-&gt;netdev_ops = &amp;igb_netdev_ops; // 关注 .ndo_open 即 igb_open 函数,会在网卡 up 的时候调用
igb_set_ethtool_ops(netdev);
// ..
/* setup the private structure */
err = igb_sw_init(adapter); // 会进一步调用 igb_init_interrupt_scheme 设置中断资源,q_vectors 会在这里申请
if (err)
goto err_sw_init;
// ..
}
static const struct net_device_ops igb_netdev_ops = {
.ndo_open = igb_open,
.ndo_stop = igb_close,
.ndo_start_xmit = igb_xmit_frame,
.ndo_get_stats64 = igb_get_stats64,
.ndo_set_rx_mode = igb_set_rx_mode,
.ndo_set_mac_address = igb_set_mac,
.ndo_change_mtu = igb_change_mtu,
.ndo_do_ioctl = igb_ioctl,
.ndo_tx_timeout = igb_tx_timeout,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_add_vid = igb_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = igb_vlan_rx_kill_vid,
.ndo_set_vf_mac = igb_ndo_set_vf_mac,
.ndo_set_vf_vlan = igb_ndo_set_vf_vlan,
.ndo_set_vf_tx_rate = igb_ndo_set_vf_bw,
.ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk,
.ndo_get_vf_config = igb_ndo_get_vf_config,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = igb_netpoll,
#endif
.ndo_fix_features = igb_fix_features,
.ndo_set_features = igb_set_features,
};</code></pre>
<p>在 <code>igb_sw_init</code> 中会进一步调用 <code>igb_init_interrupt_scheme</code> 初始化 <code>q_vector</code>,如下:</p>
<pre><code class="language-c">/**
* igb_sw_init - Initialize general software structures (struct igb_adapter)
* @adapter: board private structure to initialize
*
* igb_sw_init initializes the Adapter private data structure.
* Fields are initialized based on PCI device information and
* OS network device settings (MTU size).
**/
static int igb_sw_init(struct igb_adapter *adapter)
{
// ..
/* This call may decrease the number of queues */
if (igb_init_interrupt_scheme(adapter, true)) {
dev_err(&amp;pdev-&gt;dev, &quot;Unable to allocate memory for queues\n&quot;);
return -ENOMEM;
}
// ..
}
/**
* igb_init_interrupt_scheme - initialize interrupts, allocate queues/vectors
* @adapter: board private structure to initialize
* @msix: boolean value of MSIX capability
*
* This function initializes the interrupts and allocates all of the queues.
**/
static int igb_init_interrupt_scheme(struct igb_adapter *adapter, bool msix)
{
struct pci_dev *pdev = adapter-&gt;pdev;
int err;
igb_set_interrupt_capability(adapter, msix);
err = igb_alloc_q_vectors(adapter); // q_vectors
if (err) {
dev_err(&amp;pdev-&gt;dev, &quot;Unable to allocate memory for vectors\n&quot;);
goto err_alloc_q_vectors;
}
igb_cache_ring_register(adapter);
return 0;
err_alloc_q_vectors:
igb_reset_interrupt_capability(adapter);
return err;
}
/**
* igb_alloc_q_vectors - Allocate memory for interrupt vectors
* @adapter: board private structure to initialize
*
* We allocate one q_vector per queue interrupt. If allocation fails we
* return -ENOMEM.
**/
static int igb_alloc_q_vectors(struct igb_adapter *adapter)
{
int q_vectors = adapter-&gt;num_q_vectors;
int rxr_remaining = adapter-&gt;num_rx_queues;
int txr_remaining = adapter-&gt;num_tx_queues;
int rxr_idx = 0, txr_idx = 0, v_idx = 0;
int err;
if (q_vectors &gt;= (rxr_remaining + txr_remaining)) {
for (; rxr_remaining; v_idx++) {
err = igb_alloc_q_vector(adapter, q_vectors, v_idx,
0, 0, 1, rxr_idx); // q_vector
if (err)
goto err_out;
/* update counts and index */
rxr_remaining--;
rxr_idx++;
}
}
for (; v_idx &lt; q_vectors; v_idx++) {
int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx);
int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx);
err = igb_alloc_q_vector(adapter, q_vectors, v_idx,
tqpv, txr_idx, rqpv, rxr_idx);
if (err)
goto err_out;
/* update counts and index */
rxr_remaining -= rqpv;
txr_remaining -= tqpv;
rxr_idx++;
txr_idx++;
}
return 0;
err_out:
adapter-&gt;num_tx_queues = 0;
adapter-&gt;num_rx_queues = 0;
adapter-&gt;num_q_vectors = 0;
while (v_idx--)
igb_free_q_vector(adapter, v_idx);
return -ENOMEM;
}
/**
* igb_alloc_q_vector - Allocate memory for a single interrupt vector
* @adapter: board private structure to initialize
* @v_count: q_vectors allocated on adapter, used for ring interleaving
* @v_idx: index of vector in adapter struct
* @txr_count: total number of Tx rings to allocate
* @txr_idx: index of first Tx ring to allocate
* @rxr_count: total number of Rx rings to allocate
* @rxr_idx: index of first Rx ring to allocate
*
* We allocate one q_vector. If allocation fails we return -ENOMEM.
**/
static int igb_alloc_q_vector(struct igb_adapter *adapter,
int v_count, int v_idx,
int txr_count, int txr_idx,
int rxr_count, int rxr_idx)
{
// ..
ring_count = txr_count + rxr_count;
size = sizeof(struct igb_q_vector) +
(sizeof(struct igb_ring) * ring_count); // q_vector 下面就是 igb_ring 数组
/* allocate q_vector and rings */
q_vector = kzalloc(size, GFP_KERNEL); // 为新的 q_vector 对象申请内存
if (!q_vector)
return -ENOMEM;
/* initialize NAPI */
netif_napi_add(adapter-&gt;netdev, &amp;q_vector-&gt;napi,
igb_poll, 64); // 注册了 igb_poll 函数。napi 也是每个中断一个(也就是每个 CPU 一个)。
/* tie q_vector and adapter together */
adapter-&gt;q_vector[v_idx] = q_vector; // 放到 q_vector 数组中
q_vector-&gt;adapter = adapter;
// ... ring 初始化,具体存放 skb 的内存,不在这里分配
}</code></pre>
<p>画个图:</p>
<p><img src="https://www.showdoc.com.cn/server/api/attachment/visitFile?sign=9333fe445199454e68540d09e8491f95&amp;file=file.png" alt="" /></p>
<pre><code class="language-c">// file: net/core/dev.c
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&amp;napi-&gt;poll_list);
napi-&gt;gro_count = 0;
napi-&gt;gro_list = NULL;
napi-&gt;skb = NULL;
napi-&gt;poll = poll; // 这里设置 napi 的 poll 函数
if (weight &gt; NAPI_POLL_WEIGHT)
pr_err_once(&quot;netif_napi_add() called with weight %d on device %s\n&quot;,
weight, dev-&gt;name);
napi-&gt;weight = weight;
list_add(&amp;napi-&gt;dev_list, &amp;dev-&gt;napi_list); // 有个 napi_list 保存所有 napi
napi-&gt;dev = dev;
#ifdef CONFIG_NETPOLL
spin_lock_init(&amp;napi-&gt;poll_lock);
napi-&gt;poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &amp;napi-&gt;state);
}
EXPORT_SYMBOL(netif_napi_add);</code></pre>
<h2>启动网卡</h2>
<p>启动网卡,调用 <code>igb_open</code> -> <code>__igb_open</code>,主要完成以下工作:
1、分配发送、接收描述符数组,即环形缓冲区
2、注册中断处理函数,即 <code>igb_msix_ring</code>
3、启用 NAPI</p>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_open - Called when a network interface is made active
* @netdev: network interface device structure
*
* Returns 0 on success, negative value on failure
*
* The open entry point is called when a network interface is made
* active by the system (IFF_UP). At this point all resources needed
* for transmit and receive operations are allocated, the interrupt
* handler is registered with the OS, the watchdog timer is started,
* and the stack is notified that the interface is ready.
**/
static int __igb_open(struct net_device *netdev, bool resuming)
{
struct igb_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &amp;adapter-&gt;hw;
struct pci_dev *pdev = adapter-&gt;pdev;
int err;
int i;
/* disallow open during test */
if (test_bit(__IGB_TESTING, &amp;adapter-&gt;state)) {
WARN_ON(resuming);
return -EBUSY;
}
if (!resuming)
pm_runtime_get_sync(&amp;pdev-&gt;dev);
netif_carrier_off(netdev);
/* allocate transmit descriptors */
err = igb_setup_all_tx_resources(adapter); // 发送描述符数组
if (err)
goto err_setup_tx;
/* allocate receive descriptors */
err = igb_setup_all_rx_resources(adapter); // 接收描述符数组
if (err)
goto err_setup_rx;
igb_power_up_link(adapter);
/* before we allocate an interrupt, we must be ready to handle it.
* Setting DEBUG_SHIRQ in the kernel makes it fire an interrupt
* as soon as we call pci_request_irq, so we have to setup our
* clean_rx handler before we do so.
*/
igb_configure(adapter);
err = igb_request_irq(adapter); // 注册中断处理函数
if (err)
goto err_req_irq;
/* Notify the stack of the actual queue counts. */
err = netif_set_real_num_tx_queues(adapter-&gt;netdev,
adapter-&gt;num_tx_queues);
if (err)
goto err_set_queues;
err = netif_set_real_num_rx_queues(adapter-&gt;netdev,
adapter-&gt;num_rx_queues);
if (err)
goto err_set_queues;
/* From here on the code is the same as igb_up() */
clear_bit(__IGB_DOWN, &amp;adapter-&gt;state);
for (i = 0; i &lt; adapter-&gt;num_q_vectors; i++) // 中断数量
napi_enable(&amp;(adapter-&gt;q_vector[i]-&gt;napi)); // 启用 NAPI,里面只是设置一些标记而已。注意到每个 q_vector 中都有一个 napi 成员,具体见下面 q_vector 结构体定义
/* Clear any pending interrupts. */
rd32(E1000_ICR);
igb_irq_enable(adapter);
/* notify VFs that reset has been completed */
if (adapter-&gt;vfs_allocated_count) {
u32 reg_data = rd32(E1000_CTRL_EXT);
reg_data |= E1000_CTRL_EXT_PFRSTD;
wr32(E1000_CTRL_EXT, reg_data);
}
netif_tx_start_all_queues(netdev);
if (!resuming)
pm_runtime_put(&amp;pdev-&gt;dev);
/* start the watchdog. */
hw-&gt;mac.get_link_status = 1;
schedule_work(&amp;adapter-&gt;watchdog_task);
return 0;
err_set_queues:
igb_free_irq(adapter);
err_req_irq:
igb_release_hw_control(adapter);
igb_power_down_link(adapter);
igb_free_all_rx_resources(adapter);
err_setup_rx:
igb_free_all_tx_resources(adapter);
err_setup_tx:
igb_reset(adapter);
if (!resuming)
pm_runtime_put(&amp;pdev-&gt;dev);
return err;
}</code></pre>
<p>分配描述符数组,即环形缓冲区,这里以接收为例:
1、环形缓冲区 rx_ring 有 2 部分内存:rx_ring->rx_buffer_info 和 rx_ring->desc(DMA)
前者是内核使用;后者是网卡硬件使用。</p>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_setup_all_rx_resources - wrapper to allocate Rx resources
* (Descriptors) for all queues
* @adapter: board private structure
*
* Return 0 on success, negative on failure
**/
static int igb_setup_all_rx_resources(struct igb_adapter *adapter)
{
struct pci_dev *pdev = adapter-&gt;pdev;
int i, err = 0;
for (i = 0; i &lt; adapter-&gt;num_rx_queues; i++) {
err = igb_setup_rx_resources(adapter-&gt;rx_ring[i]); // 具体为每个 ring 分配可存放 skb 的内存。每个队列都设置 rx_ring(发送队列也是单独的)。是环形缓冲区?
if (err) {
dev_err(&amp;pdev-&gt;dev,
&quot;Allocation for Rx Queue %u failed\n&quot;, i);
for (i--; i &gt;= 0; i--)
igb_free_rx_resources(adapter-&gt;rx_ring[i]);
break;
}
}
return err;
}
/**
* igb_setup_rx_resources - allocate Rx resources (Descriptors)
* @rx_ring: Rx descriptor ring (for a specific queue) to setup
*
* Returns 0 on success, negative on failure
**/
int igb_setup_rx_resources(struct igb_ring *rx_ring)
{
struct device *dev = rx_ring-&gt;dev;
int size;
size = sizeof(struct igb_rx_buffer) * rx_ring-&gt;count; // 环形缓冲区大小
rx_ring-&gt;rx_buffer_info = vzalloc(size);
if (!rx_ring-&gt;rx_buffer_info)
goto err;
/* Round up to nearest 4K */
rx_ring-&gt;size = rx_ring-&gt;count * sizeof(union e1000_adv_rx_desc);
rx_ring-&gt;size = ALIGN(rx_ring-&gt;size, 4096);
rx_ring-&gt;desc = dma_alloc_coherent(dev, rx_ring-&gt;size,
&amp;rx_ring-&gt;dma, GFP_KERNEL);
if (!rx_ring-&gt;desc)
goto err;
rx_ring-&gt;next_to_alloc = 0;
rx_ring-&gt;next_to_clean = 0;
rx_ring-&gt;next_to_use = 0;
return 0;
err:
vfree(rx_ring-&gt;rx_buffer_info);
rx_ring-&gt;rx_buffer_info = NULL;
dev_err(dev, &quot;Unable to allocate memory for the Rx descriptor ring\n&quot;);
return -ENOMEM;
}</code></pre>
<p>注册中断函数:
1、一共申请 adapter->num_q_vectors 个中断,和队列数量是否相等?(环形缓冲区数量和队列数量是一致的)
2、</p>
<pre><code class="language-c">// file: igb_main.c
/**
* igb_request_irq - initialize interrupts
* @adapter: board private structure to initialize
*
* Attempts to configure interrupts using the best available
* capabilities of the hardware and kernel.
**/
static int igb_request_irq(struct igb_adapter *adapter)
{
struct net_device *netdev = adapter-&gt;netdev;
struct pci_dev *pdev = adapter-&gt;pdev;
int err = 0;
if (adapter-&gt;msix_entries) {
err = igb_request_msix(adapter); // 一般是这种方式
if (!err)
goto request_done;
/* fall back to MSI */
igb_free_all_tx_resources(adapter);
igb_free_all_rx_resources(adapter);
igb_clear_interrupt_scheme(adapter);
err = igb_init_interrupt_scheme(adapter, false);
if (err)
goto request_done;
igb_setup_all_tx_resources(adapter);
igb_setup_all_rx_resources(adapter);
igb_configure(adapter);
}
igb_assign_vector(adapter-&gt;q_vector[0], 0);
if (adapter-&gt;flags &amp; IGB_FLAG_HAS_MSI) {
err = request_irq(pdev-&gt;irq, igb_intr_msi, 0,
netdev-&gt;name, adapter);
if (!err)
goto request_done;
/* fall back to legacy interrupts */
igb_reset_interrupt_capability(adapter);
adapter-&gt;flags &amp;= ~IGB_FLAG_HAS_MSI;
}
err = request_irq(pdev-&gt;irq, igb_intr, IRQF_SHARED,
netdev-&gt;name, adapter);
if (err)
dev_err(&amp;pdev-&gt;dev, &quot;Error %d getting interrupt\n&quot;,
err);
request_done:
return err;
}
/**
* igb_request_msix - Initialize MSI-X interrupts
* @adapter: board private structure to initialize
*
* igb_request_msix allocates MSI-X vectors and requests interrupts from the
* kernel.
**/
static int igb_request_msix(struct igb_adapter *adapter)
{
struct net_device *netdev = adapter-&gt;netdev;
struct e1000_hw *hw = &amp;adapter-&gt;hw;
int i, err = 0, vector = 0, free_vector = 0;
err = request_irq(adapter-&gt;msix_entries[vector].vector,
igb_msix_other, 0, netdev-&gt;name, adapter); // 第 0 个 MSI-X 向量:处理非收发队列类的中断(如链路状态变化、VF 邮箱消息等)
if (err)
goto err_out;
for (i = 0; i &lt; adapter-&gt;num_q_vectors; i++) { // 根据 num_q_vectors 中断数量
struct igb_q_vector *q_vector = adapter-&gt;q_vector[i]; // 每个中断都有一个 q_vector 结构
vector++;
q_vector-&gt;itr_register = hw-&gt;hw_addr + E1000_EITR(vector);
if (q_vector-&gt;rx.ring &amp;&amp; q_vector-&gt;tx.ring) // 发送接收队列
sprintf(q_vector-&gt;name, &quot;%s-TxRx-%u&quot;, netdev-&gt;name,
q_vector-&gt;rx.ring-&gt;queue_index);
else if (q_vector-&gt;tx.ring) // 发送队列
sprintf(q_vector-&gt;name, &quot;%s-tx-%u&quot;, netdev-&gt;name,
q_vector-&gt;tx.ring-&gt;queue_index);
else if (q_vector-&gt;rx.ring) // 接收队列
sprintf(q_vector-&gt;name, &quot;%s-rx-%u&quot;, netdev-&gt;name,
q_vector-&gt;rx.ring-&gt;queue_index);
else
sprintf(q_vector-&gt;name, &quot;%s-unused&quot;, netdev-&gt;name);
err = request_irq(adapter-&gt;msix_entries[vector].vector,
igb_msix_ring, 0, q_vector-&gt;name,
q_vector); // 注册中断响应函数。这里的 vector 是 irq 的概念,并非中断向量。另外,这里 q_vector 是作为中断响应函数的回调参数
if (err)
goto err_free;
}
igb_configure_msix(adapter);
return 0;
err_free:
/* free already assigned IRQs */
free_irq(adapter-&gt;msix_entries[free_vector++].vector, adapter);
vector--;
for (i = 0; i &lt; vector; i++) {
free_irq(adapter-&gt;msix_entries[free_vector++].vector,
adapter-&gt;q_vector[i]);
}
err_out:
return err;
}
// file: igb.h(struct igb_q_vector 定义在驱动头文件中;下方的 request_irq 定义在 include/linux/interrupt.h)
struct igb_q_vector {
struct igb_adapter *adapter; /* backlink */
int cpu; /* CPU for DCA */
u32 eims_value; /* EIMS mask value */
u16 itr_val;
u8 set_itr;
void __iomem *itr_register;
struct igb_ring_container rx, tx;
struct napi_struct napi; // 中断响应函数会用到
struct rcu_head rcu; /* to avoid race with update stats on free */
char name[IFNAMSIZ + 9];
/* for dynamic allocation of rings associated with this q_vector */
struct igb_ring ring[0] ____cacheline_internodealigned_in_smp; // 零长数组,注意存放的并非指针,而是结构体,但 ring 变量本身属于指针(数组名称)
};
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev) // dev 是中断函数回调参数,不知道为什么要取 dev 这个名字
{
return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
// file: igb_main.c
static irqreturn_t igb_msix_ring(int irq, void *data)
{
struct igb_q_vector *q_vector = data; // 取出回调参数,即 q_vector
/* Write the ITR value calculated from the previous interrupt. */
igb_write_itr(q_vector);
napi_schedule(&amp;q_vector-&gt;napi);
return IRQ_HANDLED;
}</code></pre>