虚拟bridge
<h2>概述</h2>
<h2>创建 bridge</h2>
<pre><code class="language-c">// file: net/bridge/br_if.c
/*
 * br_add_bridge - create and register a bridge net_device called @name
 * in network namespace @net.
 *
 * Returns -ENOMEM when alloc_netdev() yields no device; otherwise returns
 * the result of register_netdev(). A non-zero result is treated as failure
 * and the freshly allocated device is freed before returning it.
 */
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int res;
dev = alloc_netdev(sizeof(struct net_bridge), name,
br_dev_setup); // Note: arg 1 is the private-data size; arg 3 is the setup (init) callback
if (!dev)
return -ENOMEM;
/* Move the device from the default namespace into the requested one. */
dev_net_set(dev, net);
dev-&gt;rtnl_link_ops = &amp;br_link_ops;
res = register_netdev(dev);
if (res)
free_netdev(dev);
return res;
}
// file: include/linux/netdevice.h
/* Convenience wrapper: forwards to alloc_netdev_mqs() with txqs = 1 and rxqs = 1. */
#define alloc_netdev(sizeof_priv, name, setup) \
alloc_netdev_mqs(sizeof_priv, name, setup, 1, 1)
// file: net/core/dev.c
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
* @txqs: the number of TX subqueues to allocate
* @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*
* Returns the new device, or NULL when @txqs is zero, when @rxqs is zero
* (checked only with CONFIG_RPS), or when any allocation or init step fails.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
/* The name is strcpy()'d into the fixed-size name field below, so it must fit. */
BUG_ON(strlen(name) &gt;= sizeof(dev-&gt;name));
if (txqs &lt; 1) {
pr_err(&quot;alloc_netdev: Unable to allocate device with zero queues\n&quot;);
return NULL;
}
#ifdef CONFIG_RPS
if (rxqs &lt; 1) {
pr_err(&quot;alloc_netdev: Unable to allocate device with zero RX queues\n&quot;);
return NULL;
}
#endif
alloc_size = sizeof(struct net_device);
if (sizeof_priv) { // if private data was requested, grow the allocation
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
p = kzalloc(alloc_size, GFP_KERNEL);
if (!p)
return NULL;
/* dev is the aligned start inside the raw allocation p. */
dev = PTR_ALIGN(p, NETDEV_ALIGN);
/* Record how far dev was shifted from p by the alignment. */
dev-&gt;padded = (char *)dev - (char *)p;
dev-&gt;pcpu_refcnt = alloc_percpu(int);
if (!dev-&gt;pcpu_refcnt)
goto free_p;
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
/* Default namespace; callers such as br_add_bridge() may override it afterwards. */
dev_net_set(dev, &amp;init_net);
dev-&gt;gso_max_size = GSO_MAX_SIZE;
dev-&gt;gso_max_segs = GSO_MAX_SEGS;
INIT_LIST_HEAD(&amp;dev-&gt;napi_list);
INIT_LIST_HEAD(&amp;dev-&gt;unreg_list);
INIT_LIST_HEAD(&amp;dev-&gt;link_watch_list);
INIT_LIST_HEAD(&amp;dev-&gt;upper_dev_list);
dev-&gt;priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev); // the callback passed in as the @setup argument
/* Queue counts are written after setup(), so they always reflect txqs/rxqs. */
dev-&gt;num_tx_queues = txqs;
dev-&gt;real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
goto free_all;
#ifdef CONFIG_RPS
dev-&gt;num_rx_queues = rxqs;
dev-&gt;real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
#endif
strcpy(dev-&gt;name, name);
dev-&gt;group = INIT_NETDEV_GROUP;
/* Install the default ethtool ops only if setup() did not provide any. */
if (!dev-&gt;ethtool_ops)
dev-&gt;ethtool_ops = &amp;default_ethtool_ops;
return dev;
/* Failure after setup() ran: hand the whole device to free_netdev(). */
free_all:
free_netdev(dev);
return NULL;
/* Failure before setup(): release the pieces by hand, then the raw block. */
free_pcpu:
free_percpu(dev-&gt;pcpu_refcnt);
kfree(dev-&gt;_tx);
#ifdef CONFIG_RPS
kfree(dev-&gt;_rx);
#endif
free_p:
kfree(p);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);</code></pre>
<p>看看 <code>br_dev_setup</code>:</p>
<pre><code class="language-c">// file: net/bridge/br_device.c
/*
 * br_dev_setup - setup callback passed to alloc_netdev() by br_add_bridge().
 *
 * Configures @dev as an Ethernet-style bridge device (ops tables, flags,
 * feature bits) and initializes the struct net_bridge stored in the
 * device's private area (locks, port list, bridge id, STP defaults).
 */
void br_dev_setup(struct net_device *dev)
{
/* Private area reserved by alloc_netdev(sizeof(struct net_bridge), ...). */
struct net_bridge *br = netdev_priv(dev);
eth_hw_addr_random(dev);
ether_setup(dev);
dev-&gt;netdev_ops = &amp;br_netdev_ops; // the bridge's net_device_ops table
dev-&gt;destructor = br_dev_free;
SET_ETHTOOL_OPS(dev, &amp;br_ethtool_ops);
SET_NETDEV_DEVTYPE(dev, &amp;br_type);
dev-&gt;tx_queue_len = 0;
/* Plain assignment: any priv_flags bits set earlier are replaced. */
dev-&gt;priv_flags = IFF_EBRIDGE;
dev-&gt;features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX |
NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX;
/* Same set as features minus NETIF_F_LLTX and NETIF_F_NETNS_LOCAL. */
dev-&gt;hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
NETIF_F_HW_VLAN_CTAG_TX;
br-&gt;dev = dev;
spin_lock_init(&amp;br-&gt;lock);
INIT_LIST_HEAD(&amp;br-&gt;port_list);
spin_lock_init(&amp;br-&gt;hash_lock);
/* Default bridge priority bytes: 0x80, 0x00. */
br-&gt;bridge_id.prio[0] = 0x80;
br-&gt;bridge_id.prio[1] = 0x00;
memcpy(br-&gt;group_addr, eth_reserved_addr_base, ETH_ALEN);
br-&gt;stp_enabled = BR_NO_STP;
br-&gt;group_fwd_mask = BR_GROUPFWD_DEFAULT;
/* A new bridge starts out considering itself the root. */
br-&gt;designated_root = br-&gt;bridge_id;
/* Timer defaults, expressed as multiples of HZ. */
br-&gt;bridge_max_age = br-&gt;max_age = 20 * HZ;
br-&gt;bridge_hello_time = br-&gt;hello_time = 2 * HZ;
br-&gt;bridge_forward_delay = br-&gt;forward_delay = 15 * HZ;
br-&gt;ageing_time = 300 * HZ;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
br_multicast_init(br);
}
/*
 * net_device_ops table installed on every bridge device by br_dev_setup().
 * br_add_if() compares a candidate port's ndo_start_xmit against
 * br_dev_xmit to recognize (and refuse) another bridge.
 */
static const struct net_device_ops br_netdev_ops = {
.ndo_open = br_dev_open,
.ndo_stop = br_dev_stop,
.ndo_init = br_dev_init,
.ndo_start_xmit = br_dev_xmit, // driver transmit function
.ndo_get_stats64 = br_get_stats64,
.ndo_set_mac_address = br_set_mac_address,
.ndo_set_rx_mode = br_dev_set_multicast_list,
.ndo_change_mtu = br_change_mtu,
.ndo_do_ioctl = br_dev_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_netpoll_setup = br_netpoll_setup,
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
.ndo_add_slave = br_add_slave,
.ndo_del_slave = br_del_slave,
.ndo_fix_features = br_fix_features,
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
};</code></pre>
<h2>添加设备</h2>
<pre><code class="language-c">// file: net/bridge/br_if.c
/*
 * br_add_if - attach net_device @dev to bridge @br as a new port.
 * Called with RTNL held.
 *
 * Returns 0 on success. Rejections made before any state is changed:
 *   -EINVAL      loopback, non-Ethernet type or address length, invalid MAC
 *   -ELOOP       @dev is itself a bridge (its xmit op is br_dev_xmit)
 *   -EBUSY       @dev is already a bridge port
 *   -EOPNOTSUPP  @dev has IFF_DONT_BRIDGE set
 * A new_nbp() failure is returned via PTR_ERR(). Any later failure unwinds
 * through the err5..put_back labels and returns the failing step's error.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices */
if ((dev-&gt;flags &amp; IFF_LOOPBACK) ||
dev-&gt;type != ARPHRD_ETHER || dev-&gt;addr_len != ETH_ALEN ||
!is_valid_ether_addr(dev-&gt;dev_addr))
return -EINVAL;
/* No bridging of bridges */
if (dev-&gt;netdev_ops-&gt;ndo_start_xmit == br_dev_xmit) // a bridge cannot be enslaved to another bridge
return -ELOOP;
/* Device is already being bridged */
if (br_port_exists(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
if (dev-&gt;priv_flags &amp; IFF_DONT_BRIDGE)
return -EOPNOTSUPP;
p = new_nbp(br, dev); // allocate the net_bridge_port
if (IS_ERR(p))
return PTR_ERR(p);
call_netdevice_notifiers(NETDEV_JOIN, dev);
err = dev_set_promiscuity(dev, 1);
if (err)
goto put_back;
err = kobject_init_and_add(&amp;p-&gt;kobj, &amp;brport_ktype, &amp;(dev-&gt;dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err1;
err = br_sysfs_addif(p);
if (err)
goto err2;
if (br_netpoll_info(br) &amp;&amp; ((err = br_netpoll_enable(p, GFP_KERNEL))))
goto err3;
err = netdev_master_upper_dev_link(dev, br-&gt;dev);
if (err)
goto err4;
// whichever device is being added gets its own rx_handler set here
err = netdev_rx_handler_register(dev, br_handle_frame, p); // install the frame receive hook; once set, frames on this device are handed to the bridge before ptype_base protocol dispatch, since a bridge port does not need protocol processing
if (err)
goto err5;
dev-&gt;priv_flags |= IFF_BRIDGE_PORT; // marks the device as belonging to a bridge
dev_disable_lro(dev);
list_add_rcu(&amp;p-&gt;list, &amp;br-&gt;port_list); // link into the bridge's list of ports in use
netdev_update_features(br-&gt;dev);
/* STP state is updated under the bridge lock. */
spin_lock_bh(&amp;br-&gt;lock);
changed_addr = br_stp_recalculate_bridge_id(br);
if (netif_running(dev) &amp;&amp; netif_oper_up(dev) &amp;&amp;
(br-&gt;dev-&gt;flags &amp; IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&amp;br-&gt;lock);
br_ifinfo_notify(RTM_NEWLINK, p);
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br-&gt;dev);
dev_set_mtu(br-&gt;dev, br_min_mtu(br));
/* Failure to insert the port's own address is logged but not fatal. */
if (br_fdb_insert(br, p, dev-&gt;dev_addr, 0))
netdev_err(dev, &quot;failed insert local address bridge forwarding table\n&quot;);
kobject_uevent(&amp;p-&gt;kobj, KOBJ_ADD);
return 0;
/* Unwind ladder: each label undoes the step before the one that failed. */
err5:
netdev_upper_dev_unlink(dev, br-&gt;dev);
err4:
br_netpoll_disable(p);
err3:
sysfs_remove_link(br-&gt;ifobj, p-&gt;dev-&gt;name);
err2:
kobject_put(&amp;p-&gt;kobj);
/* Cleared so the kfree(p) at put_back becomes a no-op. */
p = NULL; /* kobject_put frees */
err1:
dev_set_promiscuity(dev, -1);
put_back:
/* Drops the reference taken by dev_hold() in new_nbp(). */
dev_put(dev);
kfree(p);
return err;
}
/*
 * new_nbp - allocate and initialize a net_bridge_port binding @dev to @br.
 * called with RTNL but without bridge lock
 *
 * Returns the new port, ERR_PTR(index) when find_portno() reports a
 * negative value, or ERR_PTR(-ENOMEM) when the allocation fails.
 * On success a reference on @dev has been taken with dev_hold().
 */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
int index;
struct net_bridge_port *p;
index = find_portno(br); // pick a free port number based on br-&gt;port_list
if (index &lt; 0)
return ERR_PTR(index);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return ERR_PTR(-ENOMEM);
p-&gt;br = br;
/* The port keeps a pointer to dev, so hold a reference on it. */
dev_hold(dev);
p-&gt;dev = dev;
p-&gt;path_cost = port_cost(dev);
p-&gt;priority = 0x8000 &gt;&gt; BR_PORT_BITS;
p-&gt;port_no = index;
p-&gt;flags = 0;
br_init_port(p);
/* State is set to DISABLED after br_init_port(); br_add_if() may enable the port later. */
p-&gt;state = BR_STATE_DISABLED;
br_stp_port_timer_init(p);
br_multicast_add_port(p);
return p;
}
// file: net/core/dev.c
/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
* Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
* The caller must hold the rtnl_mutex.
*
* Returns 0 on success, or -EBUSY if @dev already has an rx_handler
* (only one handler per device is accepted).
*
* For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
ASSERT_RTNL();
if (dev-&gt;rx_handler)
return -EBUSY;
/* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev-&gt;rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev-&gt;rx_handler, rx_handler); // br_handle_frame when the caller is br_add_if()
return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
</code></pre>
<h2>收包</h2>
<p>和以往分析的不太一样,bridge 端口收到的包通常不需要再走协议栈,由 rx_handler 直接转发即可(目的地是 bridge 本机的帧仍会通过 <code>br_pass_frame_up</code> 上送):</p>
<pre><code class="language-c">// file: net/core/dev.c
/*
 * Excerpt of the core receive path (elided parts are marked "// ...").
 * Shown here: delivery to the ptype_all taps, then the per-device
 * rx_handler hook, which runs before the elided remainder of the function.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
// ...
/*
 * Delivery is deferred by one entry: each matching ptype is remembered in
 * pt_prev and only delivered once the next match (or a later stage) is found.
 */
list_for_each_entry_rcu(ptype, &amp;ptype_all, list) {
if (!ptype-&gt;dev || ptype-&gt;dev == skb-&gt;dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
// ...
rx_handler = rcu_dereference(skb-&gt;dev-&gt;rx_handler); // check rx_handler first, before ptype_base dispatch
if (rx_handler) {
/* Flush the pending tap delivery before the handler may take the skb. */
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&amp;skb)) {
case RX_HANDLER_CONSUMED:
/* Handler took ownership of the skb: stop processing here. */
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
/* Restart processing at the another_round label (not shown in this excerpt). */
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
/* no break: EXACT continues like PASS, with deliver_exact set */
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
// ...
}</code></pre>
<p><code>skb-&gt;dev-&gt;rx_handler</code> 即上文的 <code>br_handle_frame</code>:</p>
<pre><code class="language-c">// file: net/bridge/br_input.c
/*
 * br_handle_frame - rx_handler that br_add_if() registers on each port device.
 *
 * Returns RX_HANDLER_PASS when the frame should continue through normal
 * receive processing: PACKET_LOOPBACK frames, link-local frames for which
 * the NF_BR_LOCAL_IN hook returns 0, and frames claimed by
 * br_should_route_hook. Returns RX_HANDLER_CONSUMED in every other case,
 * including frames that are dropped here.
 * note: already called with rcu_read_lock
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)-&gt;h_dest;
br_should_route_hook_t *rhook;
if (unlikely(skb-&gt;pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
/* Jumps to the drop label inside the default arm of the switch at the end. */
if (!is_valid_ether_addr(eth_hdr(skb)-&gt;h_source))
goto drop;
/* skb may be replaced here; a NULL result means there is nothing left to handle. */
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return RX_HANDLER_CONSUMED;
p = br_port_get_rcu(skb-&gt;dev); // look up the net_bridge_port object for the receiving device
if (unlikely(is_link_local_ether_addr(dest))) {
/*
* See IEEE 802.1D Table 7-10 Reserved addresses
*
* Assignment Value
* Bridge Group Address 01-80-C2-00-00-00
* (MAC Control) 802.3 01-80-C2-00-00-01
* (Link Aggregation) 802.3 01-80-C2-00-00-02
* 802.1X PAE address 01-80-C2-00-00-03
*
* 802.1AB LLDP 01-80-C2-00-00-0E
*
* Others reserved for future standardization
*/
/* The last address byte selects which reserved address this is. */
switch (dest[5]) {
case 0x00: /* Bridge Group Address */
/* If STP is turned off,
then must forward to keep loop detection */
if (p-&gt;br-&gt;stp_enabled == BR_NO_STP)
goto forward;
break;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
default:
/* Allow selective forwarding for most other protocols */
if (p-&gt;br-&gt;group_fwd_mask &amp; (1u &lt;&lt; dest[5]))
goto forward;
}
/* Deliver packet to local host only */
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb-&gt;dev,
NULL, br_handle_local_finish)) {
return RX_HANDLER_CONSUMED; /* consumed by filter */
} else {
/* Publish the (possibly replaced) skb back to the caller. */
*pskb = skb;
return RX_HANDLER_PASS; /* continue processing */
}
}
forward:
switch (p-&gt;state) {
case BR_STATE_FORWARDING:
rhook = rcu_dereference(br_should_route_hook);
if (rhook) {
if ((*rhook)(skb)) {
*pskb = skb;
return RX_HANDLER_PASS;
}
/* Re-read the destination after the hook ran. */
dest = eth_hdr(skb)-&gt;h_dest;
}
/* fall through */
case BR_STATE_LEARNING:
/* Frames addressed to the bridge device's own MAC are marked PACKET_HOST. */
if (ether_addr_equal(p-&gt;br-&gt;dev-&gt;dev_addr, dest))
skb-&gt;pkt_type = PACKET_HOST;
NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb-&gt;dev, NULL,
br_handle_frame_finish); // run the netfilter hook, continuing in br_handle_frame_finish
break;
default:
/* Any other port state, and every explicit goto drop, ends here. */
drop:
kfree_skb(skb);
}
return RX_HANDLER_CONSUMED;
}
/*
 * br_handle_frame_finish - continuation passed to the NF_BR_PRE_ROUTING hook
 * by br_handle_frame().
 *
 * Learns the source address, then decides where the frame goes: forwarded
 * to one port (br_forward), flooded (br_flood_forward), handed to the
 * multicast code, and/or passed up to the local host (br_pass_frame_up).
 * Returns the result of br_pass_frame_up() when the frame goes to the local
 * host, otherwise 0.
 * note: already called with rcu_read_lock
 */
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)-&gt;h_dest;
struct net_bridge_port *p = br_port_get_rcu(skb-&gt;dev); // look up the port for the receiving device
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
struct sk_buff *skb2;
u16 vid = 0;
if (!p || p-&gt;state == BR_STATE_DISABLED)
goto drop;
/* NOTE(review): this path skips kfree_skb(); confirm br_allowed_ingress() releases skb on rejection. */
if (!br_allowed_ingress(p-&gt;br, nbp_get_vlan_info(p), skb, &amp;vid))
goto out;
/* insert into forwarding database after filtering to avoid spoofing */
br = p-&gt;br;
br_fdb_update(br, p, eth_hdr(skb)-&gt;h_source, vid); // update the forwarding table with the source address
if (!is_broadcast_ether_addr(dest) &amp;&amp; is_multicast_ether_addr(dest) &amp;&amp;
br_multicast_rcv(br, p, skb))
goto drop;
/* A LEARNING port only learns (done above); it does not forward. */
if (p-&gt;state == BR_STATE_LEARNING)
goto drop;
BR_INPUT_SKB_CB(skb)-&gt;brdev = br-&gt;dev;
/* The packet skb2 goes to the local host (NULL to skip). */
skb2 = NULL;
if (br-&gt;dev-&gt;flags &amp; IFF_PROMISC)
skb2 = skb;
dst = NULL;
if (is_broadcast_ether_addr(dest))
skb2 = skb;
else if (is_multicast_ether_addr(dest)) {
mdst = br_mdb_get(br, skb, vid);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
if ((mdst &amp;&amp; mdst-&gt;mglist) ||
br_multicast_is_router(br))
skb2 = skb;
br_multicast_forward(mdst, skb, skb2);
/* Forwarding was handled by the multicast code; skip the unicast/flood step. */
skb = NULL;
if (!skb2)
goto out;
} else
skb2 = skb;
br-&gt;dev-&gt;stats.multicast++;
} else if ((dst = __br_fdb_get(br, dest, vid)) &amp;&amp;
dst-&gt;is_local) { // look up the destination in the forwarding table
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;
}
if (skb) {
if (dst) {
/* Known destination: refresh the entry's last-used time and forward to its port. */
dst-&gt;used = jiffies;
br_forward(dst-&gt;dst, skb, skb2); // forward to the port recorded in the entry
} else
/* No forwarding entry for this destination: flood. */
br_flood_forward(br, skb, skb2);
}
if (skb2)
return br_pass_frame_up(skb2);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
// file: net/bridge/br_forward.c
/*
 * br_forward - forward @skb to port @to if should_deliver() allows it.
 * @skb0: non-NULL when the caller still needs the skb afterwards; in that
 *        case a clone is sent and @skb is never freed here.
 * called with rcu_read_lock
 */
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
{
if (should_deliver(to, skb)) {
if (skb0)
deliver_clone(to, skb, __br_forward);
else
__br_forward(to, skb); // continue in __br_forward with the original skb
return;
}
/* Not deliverable: free the skb only when nobody else is going to use it. */
if (!skb0)
kfree_skb(skb);
}
/*
 * __br_forward - retarget @skb at the egress port @to and push it through
 * the NF_BR_FORWARD hook, continuing in br_forward_finish().
 * The skb is freed here if skb_warn_if_lro() flags it.
 */
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
if (skb_warn_if_lro(skb)) {
kfree_skb(skb);
return;
}
/* A NULL result means there is no skb left to forward. */
skb = br_handle_vlan(to-&gt;br, nbp_get_vlan_info(to), skb);
if (!skb)
return;
/* Keep the ingress device for the hook; skb-&gt;dev becomes the egress device. */
indev = skb-&gt;dev;
skb-&gt;dev = to-&gt;dev; // switch the skb to the outgoing port's device
skb_forward_csum(skb);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb-&gt;dev,
br_forward_finish); // run the netfilter hook, continuing in br_forward_finish
}
/*
 * br_forward_finish - continuation of the NF_BR_FORWARD hook: runs the
 * NF_BR_POST_ROUTING hook with skb-&gt;dev as the output device and
 * br_dev_queue_push_xmit() as the next step. Returns the NF_HOOK result.
 */
int br_forward_finish(struct sk_buff *skb)
{
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb-&gt;dev,
br_dev_queue_push_xmit);
}
/*
 * br_dev_queue_push_xmit - last bridge step: queue @skb on the egress device.
 *
 * The skb is dropped when nf_bridge_maybe_copy_header() returns non-zero, or
 * when it is longer than the device MTU and is not GSO. Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
/* ip_fragment doesn't copy the MAC header */
if (nf_bridge_maybe_copy_header(skb) ||
(packet_length(skb) &gt; skb-&gt;dev-&gt;mtu &amp;&amp; !skb_is_gso(skb))) {
kfree_skb(skb);
} else {
/* Make room for ETH_HLEN bytes of Ethernet header at the front again. */
skb_push(skb, ETH_HLEN);
br_drop_fake_rtable(skb);
dev_queue_xmit(skb); // from here on it is the ordinary device transmit path
}
return 0;
}</code></pre>
<p>对于 veth 而言,后面的调用栈是:<code>dev_queue_xmit -&gt; dev_hard_start_xmit -&gt; veth_xmit</code></p>