veth
<h2>概述</h2>
<p>1、特点是必须成对出现
2、收发数据包流程和 lo 基本一致,只是在驱动层有一点点不同:将数据包发给 peer
3、两端可以“穿越”不同的网络命名空间进行通信</p>
<h2>分析</h2>
<p>初始化:</p>
<pre><code class="language-c">// file: drivers/net/veth.c
/*
* Module init entry point for the veth driver.
* Registers veth_link_ops with rtnetlink and returns that call's result
* unchanged (0 on success or a negative error code, per rtnl_link_register).
*/
static __init int veth_init(void)
{
return rtnl_link_register(&amp;veth_link_ops);
}
// file: net/core/rtnetlink.c
/**
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
*
* Locking wrapper around __rtnl_link_register(): takes the rtnl lock,
* performs the registration, releases the lock and hands back the result.
*
* Returns 0 on success or a negative error code.
*/
int rtnl_link_register(struct rtnl_link_ops *ops)
{
int err;
rtnl_lock(); // __rtnl_link_register() requires the caller to hold the rtnl lock
err = __rtnl_link_register(ops);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);
/**
* __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
*
* The caller must hold the rtnl_mutex. This function should be used
* by drivers that create devices during module initialization. It
* must be called before registering the devices.
*
* Returns 0 on success or a negative error code (-EEXIST when an ops
* with the same kind is already registered).
*/
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
if (rtnl_link_ops_get(ops-&gt;kind)) // Look up by kind, i.e. each kind can have only one ops registered. Note: kind is a string.
return -EEXIST;
// No dellink supplied by the driver: fall back to unregister_netdevice_queue.
if (!ops-&gt;dellink)
ops-&gt;dellink = unregister_netdevice_queue;
list_add_tail(&amp;ops-&gt;list, &amp;link_ops); // link_ops is a global list
return 0;
}
EXPORT_SYMBOL_GPL(__rtnl_link_register);
</code></pre>
<p>就是将 veth_link_ops 注册到 link_ops 中。再看看 ops:</p>
<pre><code class="language-c">// file: drivers/net/veth.c
/*
* rtnetlink link operations for the veth device type.
* This is the table that veth_init() registers via rtnl_link_register().
*/
static struct rtnl_link_ops veth_link_ops = {
.kind = DRV_NAME, // the string veth: the type name given on the command line
.priv_size = sizeof(struct veth_priv), // size of the per-device private area
.setup = veth_setup, // device setup callback
.validate = veth_validate,
.newlink = veth_newlink, // creates the device pair
.dellink = veth_dellink,
.policy = veth_policy,
.maxtype = VETH_INFO_MAX,
};</code></pre>
<p>看看创建函数:</p>
<pre><code class="language-c">// file: drivers/net/veth.c
/*
* veth_newlink - newlink handler of veth_link_ops: create both ends of a pair.
* @src_net: namespace handed to rtnl_link_get_net() to resolve the peer's net
* @dev: the device being created; it is registered after the peer
* @tb: attributes for dev (IFLA_ADDRESS and IFLA_IFNAME are consulted here)
* @data: veth-specific attributes; data[VETH_INFO_PEER], when present, holds
*        a struct ifinfomsg followed by the peer's own attributes
*
* Order of work: create and register the peer, register dev, then point the
* private data of each device at the other one.
*
* Returns 0 on success or a negative error code. If a step fails after the
* peer was registered, the peer is unregistered again before returning.
*/
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
int err;
struct net_device *peer;
struct veth_priv *priv;
char ifname[IFNAMSIZ];
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
struct ifinfomsg *ifmp;
struct net *net;
/*
* create and register peer first
*/
if (data != NULL &amp;&amp; data[VETH_INFO_PEER] != NULL) {
struct nlattr *nla_peer;
nla_peer = data[VETH_INFO_PEER];
// The attribute payload starts with a struct ifinfomsg ...
ifmp = nla_data(nla_peer);
// ... and the peer's nested attributes follow right after that header.
err = nla_parse(peer_tb, IFLA_MAX,
nla_data(nla_peer) + sizeof(struct ifinfomsg),
nla_len(nla_peer) - sizeof(struct ifinfomsg),
ifla_policy);
if (err &lt; 0)
return err;
err = veth_validate(peer_tb, NULL);
if (err &lt; 0)
return err;
tbp = peer_tb;
} else {
// No peer description given: the peer reuses dev's own attribute table.
ifmp = NULL;
tbp = tb;
}
// Peer name: the requested IFLA_IFNAME, otherwise a DRV_NAME plus %d template.
if (tbp[IFLA_IFNAME])
nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
else
snprintf(ifname, IFNAMSIZ, DRV_NAME &quot;%%d&quot;);
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
return PTR_ERR(net);
peer = rtnl_create_link(net, ifname, &amp;veth_link_ops, tbp); // create the peer
if (IS_ERR(peer)) {
put_net(net);
return PTR_ERR(peer);
}
// No hardware address requested for the peer: assign a random one.
if (tbp[IFLA_ADDRESS] == NULL)
eth_hw_addr_random(peer);
// Take the peer's ifindex from the request header when one was supplied and dev has a non-zero ifindex.
if (ifmp &amp;&amp; (dev-&gt;ifindex != 0))
peer-&gt;ifindex = ifmp-&gt;ifi_index;
err = register_netdevice(peer); // register the peer
// net is released here whether or not the registration succeeded.
put_net(net);
net = NULL;
if (err &lt; 0)
goto err_register_peer;
netif_carrier_off(peer);
err = rtnl_configure_link(peer, ifmp);
if (err &lt; 0)
goto err_configure_peer;
/*
* register dev last
*
* note, that since we've registered new device the dev's name
* should be re-allocated
*/
if (tb[IFLA_ADDRESS] == NULL)
eth_hw_addr_random(dev);
if (tb[IFLA_IFNAME])
nla_strlcpy(dev-&gt;name, tb[IFLA_IFNAME], IFNAMSIZ);
else
snprintf(dev-&gt;name, IFNAMSIZ, DRV_NAME &quot;%%d&quot;);
// A percent sign in the name means it is still a template: allocate a concrete name.
if (strchr(dev-&gt;name, '%')) {
err = dev_alloc_name(dev, dev-&gt;name);
if (err &lt; 0)
goto err_alloc_name;
}
err = register_netdevice(dev); // register dev itself
if (err &lt; 0)
goto err_register_dev;
netif_carrier_off(dev);
/*
* tie the devices together
*/
priv = netdev_priv(dev); // fetch the private data; see netdev_priv: it is the data placed right after dev
rcu_assign_pointer(priv-&gt;peer, peer); // dev's peer is peer
priv = netdev_priv(peer);
rcu_assign_pointer(priv-&gt;peer, dev); // peer's peer is dev
return 0;
// The three labels below fall through to the same cleanup: the peer is
// already registered at those points, so it has to be unregistered.
err_register_dev:
/* nothing to do */
err_alloc_name:
err_configure_peer:
unregister_netdevice(peer);
return err;
// Registering the peer itself failed: it is released with free_netdev instead.
err_register_peer:
free_netdev(peer);
return err;
}
/*
* Per-device private data of a veth device; its size is what
* veth_link_ops.priv_size reports.
*/
struct veth_priv {
struct net_device __rcu *peer; // the paired device (set with rcu_assign_pointer in veth_newlink)
atomic64_t dropped; // incremented by veth_xmit when a packet is not delivered to the peer
};</code></pre>
<p>再看设备初始化(setup)过程:</p>
<pre><code class="language-c">// file: drivers/net/veth.c
/*
* Setup callback installed as veth_link_ops.setup.
* Applies ether_setup() to dev, then adjusts private flags and installs
* the veth ops tables, feature bits and destructor.
*/
static void veth_setup(struct net_device *dev)
{
ether_setup(dev);
dev-&gt;priv_flags &amp;= ~IFF_TX_SKB_SHARING; // clear the TX skb sharing flag
dev-&gt;priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev-&gt;netdev_ops = &amp;veth_netdev_ops; // network device ops
dev-&gt;ethtool_ops = &amp;veth_ethtool_ops; // ethtool ops, an old acquaintance by now
dev-&gt;features |= NETIF_F_LLTX;
dev-&gt;features |= VETH_FEATURES;
dev-&gt;destructor = veth_dev_free;
dev-&gt;hw_features = VETH_FEATURES;
}
/* Network device operations that veth_setup() installs on every veth device. */
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
.ndo_stop = veth_close,
.ndo_start_xmit = veth_xmit, // transmit entry point
.ndo_change_mtu = veth_change_mtu,
.ndo_get_stats64 = veth_get_stats64,
.ndo_set_mac_address = eth_mac_addr,
};
/* ethtool operations that veth_setup() installs on every veth device. */
static const struct ethtool_ops veth_ethtool_ops = {
.get_settings = veth_get_settings,
.get_drvinfo = veth_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_strings = veth_get_strings,
.get_sset_count = veth_get_sset_count,
.get_ethtool_stats = veth_get_ethtool_stats,
};
</code></pre>
<h2>发包</h2>
<p>上层的逻辑和 lo 一样,只是在驱动层接口不一样:</p>
<pre><code class="language-c">// file: net/core/dev.c
/*
* Abridged excerpt: only the call into the driver's transmit hook is shown,
* the rest of the function is elided.
*/
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
// ...
rc = ops-&gt;ndo_start_xmit(skb, dev); // i.e. veth_xmit for a veth device
// ...
}
// file: drivers/net/veth.c
/*
* veth_xmit - transmit hook of a veth device (veth_netdev_ops.ndo_start_xmit).
* @skb: packet to send
* @dev: the veth device the packet is sent on
*
* Forwards skb to the peer device with dev_forward_skb(). On success the
* per-CPU byte and packet counters are updated; if there is no peer or the
* forward does not succeed, the dropped counter is incremented instead.
*
* Always returns NETDEV_TX_OK.
*/
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
struct net_device *rcv;
int length = skb-&gt;len; // captured before skb is handed to dev_forward_skb
rcu_read_lock();
rcv = rcu_dereference(priv-&gt;peer); // the peer device
if (unlikely(!rcv)) {
// No peer: free the skb here and account it as dropped.
kfree_skb(skb);
goto drop;
}
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { // hand the skb to the peer; continued below
struct pcpu_vstats *stats = this_cpu_ptr(dev-&gt;vstats);
u64_stats_update_begin(&amp;stats-&gt;syncp);
stats-&gt;bytes += length;
stats-&gt;packets++;
u64_stats_update_end(&amp;stats-&gt;syncp);
} else {
// The drop label sits inside the else branch, so the no-peer path and a
// failed forward share the same accounting.
drop:
atomic64_inc(&amp;priv-&gt;dropped);
}
rcu_read_unlock();
return NETDEV_TX_OK;
}
// file: net/core/dev.c
/**
* dev_forward_skb - loopback an skb to another netif
*
* @dev: destination network device
* @skb: buffer to forward
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped, but freed)
*
* dev_forward_skb can be used for injecting an skb from the
* start_xmit function of one device into the receive queue
* of another device.
*
* The receiving device may be in another namespace, so
* we have to clear all information in the skb that could
* impact namespace isolation.
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
// skbs flagged SKBTX_DEV_ZEROCOPY go through skb_copy_ubufs first; a non-zero
// result is treated as failure: count the drop on dev and free the skb.
if (skb_shinfo(skb)-&gt;tx_flags &amp; SKBTX_DEV_ZEROCOPY) {
if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
atomic_long_inc(&amp;dev-&gt;rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
}
skb_orphan(skb);
// Packets that dev cannot accept are counted as dropped and freed.
if (unlikely(!is_skb_forwardable(dev, skb))) {
atomic_long_inc(&amp;dev-&gt;rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
// Reset per-packet state and retarget the skb at the receiving device
// (the clearing described in the header comment).
skb-&gt;skb_iif = 0;
skb-&gt;dev = dev;
skb_dst_drop(skb);
skb-&gt;tstamp.tv64 = 0;
skb-&gt;pkt_type = PACKET_HOST;
skb-&gt;protocol = eth_type_trans(skb, dev);
skb-&gt;mark = 0;
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
return netif_rx(skb); // netif_rx, an old acquaintance by now
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
</code></pre>
<p><code>netif_rx</code> 继续调用 <code>enqueue_to_backlog</code> 将 skb 放到 <code>sd-&gt;input_pkt_queue</code> 中,并调用 <code>____napi_schedule(sd, &amp;sd-&gt;backlog)</code> 将 <code>sd-&gt;backlog</code> 加入到 <code>sd-&gt;poll_list</code> 中。待下次响应处理 NET_RX_SOFTIRQ 时,就会调用 <code>sd-&gt;backlog.poll</code> 函数进行处理。这一套逻辑是和 lo 一样的。</p>
<h2>收包</h2>
<p>和 lo 完全一样,都是由 <code>sd-&gt;backlog.poll</code> 函数进行处理。</p>