NIC Driver
<h2>概述</h2>
<p>Call tree:</p>
<pre><code class="language-c">igb_xmit_frame                // select which igb_ring (tx queue) to use
    igb_xmit_frame_ring       // grab the next available buffer slot for the skb; map the skb data into a DMA region the NIC can access</code></pre>
<h2>Analysis</h2>
<p>The previous section showed that for an igb NIC, packets sent from user space and packets retransmitted from the softirq path both end up calling <code>igb_xmit_frame</code>. Let's continue the analysis from there:</p>
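<p>For context, <code>igb_xmit_frame</code> is the driver's <code>ndo_start_xmit</code> hook, which the core stack invokes from <code>dev_hard_start_xmit</code>. A trimmed sketch of how the driver registers this entry point (abridged from <code>igb_main.c</code>; most members omitted):</p>
<pre><code class="language-c">// file: igb_main.c (abridged sketch; only a few members shown)
static const struct net_device_ops igb_netdev_ops = {
    .ndo_open       = igb_open,
    .ndo_stop       = igb_close,
    .ndo_start_xmit = igb_xmit_frame,   // the entry point analyzed below
    ...
};</code></pre>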
<pre><code class="language-c">// file: igb_main.c
static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
struct net_device *netdev)
{
struct igb_adapter *adapter = netdev_priv(netdev);
if (test_bit(__IGB_DOWN, &amp;adapter-&gt;state)) {
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
if (skb-&gt;len &lt;= 0) {
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/* The minimum packet size with TCTL.PSP set is 17 so pad the skb
* in order to meet this minimum size requirement.
*/
if (unlikely(skb-&gt;len &lt; 17)) {
if (skb_pad(skb, 17 - skb-&gt;len))
return NETDEV_TX_OK;
skb-&gt;len = 17;
skb_set_tail_pointer(skb, 17);
}
return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb)); // igb_tx_queue_mapping 会选择出 igb_ring
}
static inline struct igb_ring *igb_tx_queue_mapping(struct igb_adapter *adapter,
                                                    struct sk_buff *skb)
{
    unsigned int r_idx = skb-&gt;queue_mapping; // the basis for choosing an igb_ring; apparently set earlier around the qdisc layer?

    if (r_idx &gt;= adapter-&gt;num_tx_queues)
        r_idx = r_idx % adapter-&gt;num_tx_queues;

    return adapter-&gt;tx_ring[r_idx]; // pick the ring buffer
}
netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
                                struct igb_ring *tx_ring)
{
    struct igb_tx_buffer *first;
    int tso;
    u32 tx_flags = 0;
    u16 count = TXD_USE_COUNT(skb_headlen(skb));
    __be16 protocol = vlan_get_protocol(skb);
    u8 hdr_len = 0;

    /* need: 1 descriptor per page * PAGE_SIZE/IGB_MAX_DATA_PER_TXD,
     * + 1 desc for skb_headlen/IGB_MAX_DATA_PER_TXD,
     * + 2 desc gap to keep tail from touching head,
     * + 1 desc for context descriptor,
     * otherwise try next time
     */
    if (NETDEV_FRAG_PAGE_MAX_SIZE &gt; IGB_MAX_DATA_PER_TXD) {
        unsigned short f;
        for (f = 0; f &lt; skb_shinfo(skb)-&gt;nr_frags; f++)
            count += TXD_USE_COUNT(skb_shinfo(skb)-&gt;frags[f].size);
    } else {
        count += skb_shinfo(skb)-&gt;nr_frags;
    }
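    /* Aside (from the same driver): TXD_USE_COUNT(S) is
     * DIV_ROUND_UP((S), IGB_MAX_DATA_PER_TXD), i.e. the number of
     * descriptors a buffer of S bytes occupies when each descriptor
     * can carry at most IGB_MAX_DATA_PER_TXD bytes.
     */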
    if (igb_maybe_stop_tx(tx_ring, count + 3)) { // the minimum number of free descriptors this packet requires
        /* this is a hard error */
        return NETDEV_TX_BUSY;
    }

    /* record the location of the first descriptor for this packet */
    first = &amp;tx_ring-&gt;tx_buffer_info[tx_ring-&gt;next_to_use]; // grab the next available buffer slot
    first-&gt;skb = skb;
    first-&gt;bytecount = skb-&gt;len;
    first-&gt;gso_segs = 1;

    skb_tx_timestamp(skb);

    if (unlikely(skb_shinfo(skb)-&gt;tx_flags &amp; SKBTX_HW_TSTAMP)) {
        struct igb_adapter *adapter = netdev_priv(tx_ring-&gt;netdev);

        if (!(adapter-&gt;ptp_tx_skb)) {
            skb_shinfo(skb)-&gt;tx_flags |= SKBTX_IN_PROGRESS;
            tx_flags |= IGB_TX_FLAGS_TSTAMP;

            adapter-&gt;ptp_tx_skb = skb_get(skb);
            adapter-&gt;ptp_tx_start = jiffies;
            if (adapter-&gt;hw.mac.type == e1000_82576)
                schedule_work(&amp;adapter-&gt;ptp_tx_work);
        }
    }

    if (vlan_tx_tag_present(skb)) {
        tx_flags |= IGB_TX_FLAGS_VLAN;
        tx_flags |= (vlan_tx_tag_get(skb) &lt;&lt; IGB_TX_FLAGS_VLAN_SHIFT);
    }

    /* record initial flags and protocol */
    first-&gt;tx_flags = tx_flags;
    first-&gt;protocol = protocol;

    tso = igb_tso(tx_ring, first, &amp;hdr_len); // returns 1 if TSO is in use; internally it is mostly state setup
    if (tso &lt; 0)
        goto out_drop;
    else if (!tso)
        igb_tx_csum(tx_ring, first);

    igb_tx_map(tx_ring, first, hdr_len); // map the skb data into a DMA region the NIC can access

    /* Make sure there is space in the ring for the next send. */
    igb_maybe_stop_tx(tx_ring, DESC_NEEDED); // check remaining ring space; if it is low, stop this queue so the qdisc stops feeding it and the ring is not overrun (see the sketch after this listing)

    return NETDEV_TX_OK;

out_drop:
    igb_unmap_and_free_tx_resource(tx_ring, first);

    return NETDEV_TX_OK;
}
static void igb_tx_map(struct igb_ring *tx_ring,
                       struct igb_tx_buffer *first,
                       const u8 hdr_len)
{
    struct sk_buff *skb = first-&gt;skb;
    struct igb_tx_buffer *tx_buffer;
    union e1000_adv_tx_desc *tx_desc;
    struct skb_frag_struct *frag;
    dma_addr_t dma;
    unsigned int data_len, size;
    u32 tx_flags = first-&gt;tx_flags;
    u32 cmd_type = igb_tx_cmd_type(skb, tx_flags);
    u16 i = tx_ring-&gt;next_to_use;

    tx_desc = IGB_TX_DESC(tx_ring, i); // pointer to the next free descriptor in the DMA descriptor ring the hardware reads

    igb_tx_olinfo_status(tx_ring, tx_desc, tx_flags, skb-&gt;len - hdr_len);

    size = skb_headlen(skb); // i.e. skb-&gt;len - skb-&gt;data_len, the linear part
    data_len = skb-&gt;data_len;

    // create a DMA mapping for skb-&gt;data so the device can read the data from RAM
    dma = dma_map_single(tx_ring-&gt;dev, skb-&gt;data, size, DMA_TO_DEVICE); // skb-&gt;data is where the payload starts

    tx_buffer = first;
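    /* The loop below first consumes the linear part mapped just above, then
     * each paged fragment in turn: every pass writes descriptors for the
     * current buffer (splitting it whenever it exceeds IGB_MAX_DATA_PER_TXD),
     * and while data_len is nonzero it maps the next frag and goes around again.
     */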
    for (frag = &amp;skb_shinfo(skb)-&gt;frags[0];; frag++) {
        if (dma_mapping_error(tx_ring-&gt;dev, dma))
            goto dma_error;

        /* record length, and DMA address */
        dma_unmap_len_set(tx_buffer, len, size);
        dma_unmap_addr_set(tx_buffer, dma, dma);

        tx_desc-&gt;read.buffer_addr = cpu_to_le64(dma);

        while (unlikely(size &gt; IGB_MAX_DATA_PER_TXD)) {
            tx_desc-&gt;read.cmd_type_len =
                cpu_to_le32(cmd_type ^ IGB_MAX_DATA_PER_TXD);

            i++;
            tx_desc++;
            if (i == tx_ring-&gt;count) {
                tx_desc = IGB_TX_DESC(tx_ring, 0);
                i = 0;
            }
            tx_desc-&gt;read.olinfo_status = 0;

            dma += IGB_MAX_DATA_PER_TXD;
            size -= IGB_MAX_DATA_PER_TXD;

            tx_desc-&gt;read.buffer_addr = cpu_to_le64(dma);
        }

        if (likely(!data_len))
            break;

        tx_desc-&gt;read.cmd_type_len = cpu_to_le32(cmd_type ^ size);

        i++;
        tx_desc++;
        if (i == tx_ring-&gt;count) {
            tx_desc = IGB_TX_DESC(tx_ring, 0);
            i = 0;
        }
        tx_desc-&gt;read.olinfo_status = 0;

        size = skb_frag_size(frag);
        data_len -= size;

        dma = skb_frag_dma_map(tx_ring-&gt;dev, frag, 0,
                               size, DMA_TO_DEVICE);

        tx_buffer = &amp;tx_ring-&gt;tx_buffer_info[i];
    }
    /* write last descriptor with RS and EOP bits */
    cmd_type |= size | IGB_TXD_DCMD;
    tx_desc-&gt;read.cmd_type_len = cpu_to_le32(cmd_type);

    netdev_tx_sent_queue(txring_txq(tx_ring), first-&gt;bytecount);

    /* set the timestamp */
    first-&gt;time_stamp = jiffies;

    /* Force memory writes to complete before letting h/w know there
     * are new descriptors to fetch. (Only applicable for weak-ordered
     * memory model archs, such as IA-64).
     *
     * We also need this memory barrier to make certain all of the
     * status bits have been updated before next_to_watch is written.
     */
    wmb();

    /* set next_to_watch value indicating a packet is present */
    first-&gt;next_to_watch = tx_desc;

    i++;
    if (i == tx_ring-&gt;count)
        i = 0;

    tx_ring-&gt;next_to_use = i;
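    /* Doorbell: writing the new index into the ring's tail register is what
     * actually notifies the hardware that fresh descriptors are ready to fetch.
     */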
    writel(i, tx_ring-&gt;tail);

    /* we need this if more than one processor can write to our tail
     * at a time, it synchronizes IO on IA64/Altix systems
     */
    mmiowb();

    return;

dma_error:
    dev_err(tx_ring-&gt;dev, &quot;TX DMA map failed\n&quot;);

    /* clear dma mappings for failed tx_buffer_info map */
    for (;;) {
        tx_buffer = &amp;tx_ring-&gt;tx_buffer_info[i];
        igb_unmap_and_free_tx_resource(tx_ring, tx_buffer);
        if (tx_buffer == first)
            break;
        if (i == 0)
            i = tx_ring-&gt;count;
        i--;
    }

    tx_ring-&gt;next_to_use = i;
}
</code></pre>
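<p>The two <code>igb_maybe_stop_tx</code> calls above do what the inline comments guess: when free descriptors run low, the driver stops this tx subqueue so the qdisc layer stops dequeuing to it, and the completion path wakes the queue again once descriptors are reclaimed. A condensed sketch of the mechanism, based on the same driver (statistics bookkeeping omitted; details vary by kernel version):</p>
<pre><code class="language-c">/* Free descriptors: the gap between the cleanup index (next_to_clean)
 * and the submit index (next_to_use), minus one so tail never catches head.
 */
static inline int igb_desc_unused(struct igb_ring *ring)
{
    if (ring-&gt;next_to_clean &gt; ring-&gt;next_to_use)
        return ring-&gt;next_to_clean - ring-&gt;next_to_use - 1;

    return ring-&gt;count + ring-&gt;next_to_clean - ring-&gt;next_to_use - 1;
}

static int __igb_maybe_stop_tx(struct igb_ring *tx_ring, const u16 size)
{
    struct net_device *netdev = tx_ring-&gt;netdev;

    /* Stop this subqueue: the qdisc layer will no longer dequeue to it. */
    netif_stop_subqueue(netdev, tx_ring-&gt;queue_index);

    /* Re-check after the stop is globally visible, in case another CPU's
     * cleanup path has just freed descriptors.
     */
    smp_mb();

    if (igb_desc_unused(tx_ring) &lt; size)
        return -EBUSY;

    /* A reprieve! Space opened up after all; re-enable the queue. */
    netif_wake_subqueue(netdev, tx_ring-&gt;queue_index);
    return 0;
}

static inline int igb_maybe_stop_tx(struct igb_ring *tx_ring, const u16 size)
{
    /* Fast path: enough room, nothing to do. */
    if (igb_desc_unused(tx_ring) &gt;= size)
        return 0;

    return __igb_maybe_stop_tx(tx_ring, size);
}</code></pre>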
<p>Once all the required descriptors have been built and all of the skb's data has been mapped to DMA addresses, the driver takes its final step and triggers the actual transmission.</p>