公开学习文档

公开学习文档


虚拟 bridge

<h2>概述</h2> <h2>创建 bridge</h2> <pre><code class="language-c">// file: net/bridge/br_if.c int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; int res; dev = alloc_netdev(sizeof(struct net_bridge), name, br_dev_setup); // 注意:第 1 个参数是私有数据大小;第 3 个参数是初始化函数 if (!dev) return -ENOMEM; dev_net_set(dev, net); dev-&gt;rtnl_link_ops = &amp;br_link_ops; res = register_netdev(dev); if (res) free_netdev(dev); return res; } // file: include/linux/netdevice.h #define alloc_netdev(sizeof_priv, name, setup) \ alloc_netdev_mqs(sizeof_priv, name, setup, 1, 1) // file: net/core/dev.c /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t alloc_size; struct net_device *p; BUG_ON(strlen(name) &gt;= sizeof(dev-&gt;name)); if (txqs &lt; 1) { pr_err(&quot;alloc_netdev: Unable to allocate device with zero queues\n&quot;); return NULL; } #ifdef CONFIG_RPS if (rxqs &lt; 1) { pr_err(&quot;alloc_netdev: Unable to allocate device with zero RX queues\n&quot;); return NULL; } #endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { // 如果有私有数据,则增加大小 /* ensure 32-byte alignment of private area */ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); if (!p) return NULL; dev = PTR_ALIGN(p, NETDEV_ALIGN); dev-&gt;padded = (char *)dev - (char *)p; dev-&gt;pcpu_refcnt = alloc_percpu(int); if (!dev-&gt;pcpu_refcnt) goto free_p; if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &amp;init_net); dev-&gt;gso_max_size = GSO_MAX_SIZE; dev-&gt;gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&amp;dev-&gt;napi_list); INIT_LIST_HEAD(&amp;dev-&gt;unreg_list); INIT_LIST_HEAD(&amp;dev-&gt;link_watch_list); INIT_LIST_HEAD(&amp;dev-&gt;upper_dev_list); dev-&gt;priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); // 即传进来的参数 dev-&gt;num_tx_queues = txqs; dev-&gt;real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; #ifdef CONFIG_RPS dev-&gt;num_rx_queues = rxqs; dev-&gt;real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; #endif strcpy(dev-&gt;name, name); dev-&gt;group = INIT_NETDEV_GROUP; if (!dev-&gt;ethtool_ops) dev-&gt;ethtool_ops = &amp;default_ethtool_ops; return dev; free_all: free_netdev(dev); return 
NULL; free_pcpu: free_percpu(dev-&gt;pcpu_refcnt); kfree(dev-&gt;_tx); #ifdef CONFIG_RPS kfree(dev-&gt;_rx); #endif free_p: kfree(p); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs);</code></pre> <p>看看 <code>br_dev_setup</code>:</p> <pre><code class="language-c">// file: net/bridge/br_device.c void br_dev_setup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); eth_hw_addr_random(dev); ether_setup(dev); dev-&gt;netdev_ops = &amp;br_netdev_ops; // ops dev-&gt;destructor = br_dev_free; SET_ETHTOOL_OPS(dev, &amp;br_ethtool_ops); SET_NETDEV_DEVTYPE(dev, &amp;br_type); dev-&gt;tx_queue_len = 0; dev-&gt;priv_flags = IFF_EBRIDGE; dev-&gt;features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX; dev-&gt;hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX; br-&gt;dev = dev; spin_lock_init(&amp;br-&gt;lock); INIT_LIST_HEAD(&amp;br-&gt;port_list); spin_lock_init(&amp;br-&gt;hash_lock); br-&gt;bridge_id.prio[0] = 0x80; br-&gt;bridge_id.prio[1] = 0x00; memcpy(br-&gt;group_addr, eth_reserved_addr_base, ETH_ALEN); br-&gt;stp_enabled = BR_NO_STP; br-&gt;group_fwd_mask = BR_GROUPFWD_DEFAULT; br-&gt;designated_root = br-&gt;bridge_id; br-&gt;bridge_max_age = br-&gt;max_age = 20 * HZ; br-&gt;bridge_hello_time = br-&gt;hello_time = 2 * HZ; br-&gt;bridge_forward_delay = br-&gt;forward_delay = 15 * HZ; br-&gt;ageing_time = 300 * HZ; br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); } static const struct net_device_ops br_netdev_ops = { .ndo_open = br_dev_open, .ndo_stop = br_dev_stop, .ndo_init = br_dev_init, .ndo_start_xmit = br_dev_xmit, // 驱动发包函数 .ndo_get_stats64 = br_get_stats64, .ndo_set_mac_address = br_set_mac_address, .ndo_set_rx_mode = 
br_dev_set_multicast_list, .ndo_change_mtu = br_change_mtu, .ndo_do_ioctl = br_dev_ioctl, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = br_netpoll_setup, .ndo_netpoll_cleanup = br_netpoll_cleanup, .ndo_poll_controller = br_poll_controller, #endif .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, };</code></pre> <h2>添加设备</h2> <pre><code class="language-c">// file: net/bridge/br_if.c /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; bool changed_addr; /* Don't allow bridging non-ethernet like devices */ if ((dev-&gt;flags &amp; IFF_LOOPBACK) || dev-&gt;type != ARPHRD_ETHER || dev-&gt;addr_len != ETH_ALEN || !is_valid_ether_addr(dev-&gt;dev_addr)) return -EINVAL; /* No bridging of bridges */ if (dev-&gt;netdev_ops-&gt;ndo_start_xmit == br_dev_xmit) // 不能再接 bridge return -ELOOP; /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. 
wireless) */ if (dev-&gt;priv_flags &amp; IFF_DONT_BRIDGE) return -EOPNOTSUPP; p = new_nbp(br, dev); // 申请 net_bridge_port if (IS_ERR(p)) return PTR_ERR(p); call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_promiscuity(dev, 1); if (err) goto put_back; err = kobject_init_and_add(&amp;p-&gt;kobj, &amp;brport_ktype, &amp;(dev-&gt;dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err1; err = br_sysfs_addif(p); if (err) goto err2; if (br_netpoll_info(br) &amp;&amp; ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_master_upper_dev_link(dev, br-&gt;dev); if (err) goto err4; // 添加哪个设备,就将哪个设备的 dev-&gt;rx_handler 设置上 err = netdev_rx_handler_register(dev, br_handle_frame, p); // 设置设备帧接收函数,这会导致不会根据 ptype_base 进行分发。作为 bridge 也不需要进行协议处理。 if (err) goto err5; dev-&gt;priv_flags |= IFF_BRIDGE_PORT; // 表示已经添加到 bridge 中 dev_disable_lro(dev); list_add_rcu(&amp;p-&gt;list, &amp;br-&gt;port_list); // 添加到已用端口列表中 netdev_update_features(br-&gt;dev); spin_lock_bh(&amp;br-&gt;lock); changed_addr = br_stp_recalculate_bridge_id(br); if (netif_running(dev) &amp;&amp; netif_oper_up(dev) &amp;&amp; (br-&gt;dev-&gt;flags &amp; IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&amp;br-&gt;lock); br_ifinfo_notify(RTM_NEWLINK, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br-&gt;dev); dev_set_mtu(br-&gt;dev, br_min_mtu(br)); if (br_fdb_insert(br, p, dev-&gt;dev_addr, 0)) netdev_err(dev, &quot;failed insert local address bridge forwarding table\n&quot;); kobject_uevent(&amp;p-&gt;kobj, KOBJ_ADD); return 0; err5: netdev_upper_dev_unlink(dev, br-&gt;dev); err4: br_netpoll_disable(p); err3: sysfs_remove_link(br-&gt;ifobj, p-&gt;dev-&gt;name); err2: kobject_put(&amp;p-&gt;kobj); p = NULL; /* kobject_put frees */ err1: dev_set_promiscuity(dev, -1); put_back: dev_put(dev); kfree(p); return err; } /* 
called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { int index; struct net_bridge_port *p; index = find_portno(br); // 根据 br-&gt;port_list 找到一个可用的端口序号 if (index &lt; 0) return ERR_PTR(index); p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return ERR_PTR(-ENOMEM); p-&gt;br = br; dev_hold(dev); p-&gt;dev = dev; p-&gt;path_cost = port_cost(dev); p-&gt;priority = 0x8000 &gt;&gt; BR_PORT_BITS; p-&gt;port_no = index; p-&gt;flags = 0; br_init_port(p); p-&gt;state = BR_STATE_DISABLED; br_stp_port_timer_init(p); br_multicast_add_port(p); return p; } // file: net/core/dev.c /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { ASSERT_RTNL(); if (dev-&gt;rx_handler) return -EBUSY; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev-&gt;rx_handler_data, rx_handler_data); rcu_assign_pointer(dev-&gt;rx_handler, rx_handler); // 即上文的 br_handle_frame return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); </code></pre> <h2>收包</h2> <p>和以往分析的不太一样,bridge 收包后并不需要处理协议栈,只需要转发即可:</p> <pre><code class="language-c">// file: net/core/dev.c static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { // ... 
list_for_each_entry_rcu(ptype, &amp;ptype_all, list) { if (!ptype-&gt;dev || ptype-&gt;dev == skb-&gt;dev) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } // ... rx_handler = rcu_dereference(skb-&gt;dev-&gt;rx_handler); // 在 ptype_base 分发之前,先检查 rx_handler if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&amp;skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; case RX_HANDLER_PASS: break; default: BUG(); } } // ... }</code></pre> <p>skb-&gt;dev-&gt;rx_handler 即上文的 <code>br_handle_frame</code>:</p> <pre><code class="language-c">// file: net/bridge/br_input.c /* * Return NULL if skb is handled * note: already called with rcu_read_lock */ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)-&gt;h_dest; br_should_route_hook_t *rhook; if (unlikely(skb-&gt;pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)-&gt;h_source)) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; p = br_port_get_rcu(skb-&gt;dev); // 获取 net_bridge_port 对象 if (unlikely(is_link_local_ether_addr(dest))) { /* * See IEEE 802.1D Table 7-10 Reserved addresses * * Assignment Value * Bridge Group Address 01-80-C2-00-00-00 * (MAC Control) 802.3 01-80-C2-00-00-01 * (Link Aggregation) 802.3 01-80-C2-00-00-02 * 802.1X PAE address 01-80-C2-00-00-03 * * 802.1AB LLDP 01-80-C2-00-00-0E * * Others reserved for future standardization */ switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, then must forward to keep loop detection */ if (p-&gt;br-&gt;stp_enabled == BR_NO_STP) goto forward; break; case 0x01: /* IEEE MAC (Pause) */ goto drop; default: /* Allow 
selective forwarding for most other protocols */ if (p-&gt;br-&gt;group_fwd_mask &amp; (1u &lt;&lt; dest[5])) goto forward; } /* Deliver packet to local host only */ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb-&gt;dev, NULL, br_handle_local_finish)) { return RX_HANDLER_CONSUMED; /* consumed by filter */ } else { *pskb = skb; return RX_HANDLER_PASS; /* continue processing */ } } forward: switch (p-&gt;state) { case BR_STATE_FORWARDING: rhook = rcu_dereference(br_should_route_hook); if (rhook) { if ((*rhook)(skb)) { *pskb = skb; return RX_HANDLER_PASS; } dest = eth_hdr(skb)-&gt;h_dest; } /* fall through */ case BR_STATE_LEARNING: if (ether_addr_equal(p-&gt;br-&gt;dev-&gt;dev_addr, dest)) skb-&gt;pkt_type = PACKET_HOST; NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb-&gt;dev, NULL, br_handle_frame_finish); // 走钩子 break; default: drop: kfree_skb(skb); } return RX_HANDLER_CONSUMED; } /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct sk_buff *skb) { const unsigned char *dest = eth_hdr(skb)-&gt;h_dest; struct net_bridge_port *p = br_port_get_rcu(skb-&gt;dev); // 获取 struct net_bridge *br; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct sk_buff *skb2; u16 vid = 0; if (!p || p-&gt;state == BR_STATE_DISABLED) goto drop; if (!br_allowed_ingress(p-&gt;br, nbp_get_vlan_info(p), skb, &amp;vid)) goto out; /* insert into forwarding database after filtering to avoid spoofing */ br = p-&gt;br; br_fdb_update(br, p, eth_hdr(skb)-&gt;h_source, vid); // 更新转发表 if (!is_broadcast_ether_addr(dest) &amp;&amp; is_multicast_ether_addr(dest) &amp;&amp; br_multicast_rcv(br, p, skb)) goto drop; if (p-&gt;state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)-&gt;brdev = br-&gt;dev; /* The packet skb2 goes to the local host (NULL to skip). 
*/ skb2 = NULL; if (br-&gt;dev-&gt;flags &amp; IFF_PROMISC) skb2 = skb; dst = NULL; if (is_broadcast_ether_addr(dest)) skb2 = skb; else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { if ((mdst &amp;&amp; mdst-&gt;mglist) || br_multicast_is_router(br)) skb2 = skb; br_multicast_forward(mdst, skb, skb2); skb = NULL; if (!skb2) goto out; } else skb2 = skb; br-&gt;dev-&gt;stats.multicast++; } else if ((dst = __br_fdb_get(br, dest, vid)) &amp;&amp; dst-&gt;is_local) { // 查找转发表 skb2 = skb; /* Do not forward the packet since it's local. */ skb = NULL; } if (skb) { if (dst) { dst-&gt;used = jiffies; br_forward(dst-&gt;dst, skb, skb2); // 转发 } else br_flood_forward(br, skb, skb2); } if (skb2) return br_pass_frame_up(skb2); out: return 0; drop: kfree_skb(skb); goto out; } // file: net/bridge/br_forward.c /* called with rcu_read_lock */ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0) { if (should_deliver(to, skb)) { if (skb0) deliver_clone(to, skb, __br_forward); else __br_forward(to, skb); // 继续 return; } if (!skb0) kfree_skb(skb); } static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) { struct net_device *indev; if (skb_warn_if_lro(skb)) { kfree_skb(skb); return; } skb = br_handle_vlan(to-&gt;br, nbp_get_vlan_info(to), skb); if (!skb) return; indev = skb-&gt;dev; skb-&gt;dev = to-&gt;dev; // 更新 skb_forward_csum(skb); NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb-&gt;dev, br_forward_finish); // 走钩子 } int br_forward_finish(struct sk_buff *skb) { return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb-&gt;dev, br_dev_queue_push_xmit); } int br_dev_queue_push_xmit(struct sk_buff *skb) { /* ip_fragment doesn't copy the MAC header */ if (nf_bridge_maybe_copy_header(skb) || (packet_length(skb) &gt; skb-&gt;dev-&gt;mtu 
&amp;&amp; !skb_is_gso(skb))) { kfree_skb(skb); } else { skb_push(skb, ETH_HLEN); br_drop_fake_rtable(skb); dev_queue_xmit(skb); // 熟悉的味道 } return 0; }</code></pre> <p>对于 veth 而言,后面的调用栈是:dev_queue_xmit -&gt; dev_hard_start_xmit -&gt; veth_xmit</p>

页面列表

ITEM_HTML