Sending packets to the local host
<h2>Overview</h2>
<p>How this differs from transmitting through a regular NIC:
1. The routing differs -- a different route is selected, but a route lookup still takes place.
2. Transmission does not need to pick a TX queue or go through qdisc queueing.
3. The driver-level transmit routine differs; the lo transmit logic is very simple.</p>
<p>The lo driver's transmit logic:
1. Try to place the skb on sd->input_pkt_queue.
2. If the queue was empty before the skb was enqueued, raise the NET_RX_SOFTIRQ softirq; the softirq later calls the poll function of sd->backlog to process the queue.</p>
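<p>To keep the discussion concrete, the sketch below (not part of the kernel source; the destination port 12345 is an arbitrary assumption) sends one UDP datagram to 127.0.0.1 from userspace. From the kernel's point of view this send takes exactly the path analysed in the rest of this article: protocol stack, route lookup, then the loopback driver.</p>
<pre><code class="language-c">/* Minimal userspace sketch: send one UDP datagram over loopback.
 * Port 12345 is an arbitrary assumption; nothing needs to be listening
 * for the transmit path discussed here to be exercised.
 */
#include &lt;arpa/inet.h&gt;
#include &lt;stdio.h&gt;
#include &lt;string.h&gt;
#include &lt;sys/socket.h&gt;
#include &lt;unistd.h&gt;

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd &lt; 0) {
        perror(&quot;socket&quot;);
        return 1;
    }

    struct sockaddr_in dst;
    memset(&amp;dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(12345);                      /* assumed port */
    inet_pton(AF_INET, &quot;127.0.0.1&quot;, &amp;dst.sin_addr);   /* loopback address */

    const char msg[] = &quot;hello, loopback&quot;;
    if (sendto(fd, msg, sizeof(msg), 0,
               (struct sockaddr *)&amp;dst, sizeof(dst)) &lt; 0)
        perror(&quot;sendto&quot;);

    close(fd);
    return 0;
}
</code></pre>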
<h2>Analysis</h2>
<p>Sending data to the local host still goes through system calls such as send and traverses the same protocol stack. The point where the paths diverge is the route lookup, which happens at the network layer in ip_queue_xmit.</p>
<pre><code class="language-c">// file: net/ipv4/ip_output.c
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
struct sock *sk = skb-&gt;sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet-&gt;inet_opt);
fl4 = &amp;fl-&gt;u.ip4;
rt = skb_rtable(skb);
if (rt != NULL)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (rt == NULL) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet-&gt;inet_daddr;
if (inet_opt &amp;&amp; inet_opt-&gt;opt.srr)
daddr = inet_opt-&gt;opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
rt = ip_route_output_ports(sock_net(sk), fl4, sk,
daddr, inet-&gt;inet_saddr,
inet-&gt;inet_dport,
inet-&gt;inet_sport,
sk-&gt;sk_protocol,
RT_CONN_FLAGS(sk),
sk-&gt;sk_bound_dev_if); // route lookup
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &amp;rt-&gt;dst); // cache the route in the socket
}
skb_dst_set_noref(skb, &amp;rt-&gt;dst);
packet_routed:
if (inet_opt &amp;&amp; inet_opt-&gt;opt.is_strictroute &amp;&amp; rt-&gt;rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt-&gt;opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 &lt;&lt; 12) | (5 &lt;&lt; 8) | (inet-&gt;tos &amp; 0xff));
if (ip_dont_fragment(sk, &amp;rt-&gt;dst) &amp;&amp; !skb-&gt;local_df)
iph-&gt;frag_off = htons(IP_DF);
else
iph-&gt;frag_off = 0;
iph-&gt;ttl = ip_select_ttl(inet, &amp;rt-&gt;dst);
iph-&gt;protocol = sk-&gt;sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb-&gt;h.foo itself. */
if (inet_opt &amp;&amp; inet_opt-&gt;opt.optlen) {
iph-&gt;ihl += inet_opt-&gt;opt.optlen &gt;&gt; 2;
ip_options_build(skb, &amp;inet_opt-&gt;opt, inet-&gt;inet_daddr, rt, 0);
}
ip_select_ident_segs(skb, sk, skb_shinfo(skb)-&gt;gso_segs ?: 1);
skb-&gt;priority = sk-&gt;sk_priority;
skb-&gt;mark = sk-&gt;sk_mark;
res = ip_local_out(skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
// file: include/net/route.h
static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
struct sock *sk,
__be32 daddr, __be32 saddr,
__be16 dport, __be16 sport,
__u8 proto, __u8 tos, int oif)
{
flowi4_init_output(fl4, oif, sk ? sk-&gt;sk_mark : 0, tos,
RT_SCOPE_UNIVERSE, proto,
sk ? inet_sk_flowi_flags(sk) : 0,
daddr, saddr, dport, sport); // fill in fl4 so the route lookup below can use it
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk); // 继续
}
// file: include/net/flow.h
static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
__u32 mark, __u8 tos, __u8 scope,
__u8 proto, __u8 flags,
__be32 daddr, __be32 saddr,
__be16 dport, __be16 sport)
{
fl4-&gt;flowi4_oif = oif;
fl4-&gt;flowi4_iif = 0;
fl4-&gt;flowi4_mark = mark;
fl4-&gt;flowi4_tos = tos;
fl4-&gt;flowi4_scope = scope;
fl4-&gt;flowi4_proto = proto; // together with the addresses and ports, the classic 5-tuple
fl4-&gt;flowi4_flags = flags;
fl4-&gt;flowi4_secid = 0;
fl4-&gt;daddr = daddr;
fl4-&gt;saddr = saddr;
fl4-&gt;fl4_dport = dport;
fl4-&gt;fl4_sport = sport;
}
// file: net/ipv4/route.c
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
struct sock *sk)
{
struct rtable *rt = __ip_route_output_key(net, flp4); // continue here
if (IS_ERR(rt))
return rt;
if (flp4-&gt;flowi4_proto)
rt = (struct rtable *) xfrm_lookup(net, &amp;rt-&gt;dst,
flowi4_to_flowi(flp4),
sk, 0); // IPsec (xfrm) policy lookup
return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
/*
* Major route resolver routine.
*/
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
int orig_oif;
res.tclassid = 0;
res.fi = NULL;
res.table = NULL;
orig_oif = fl4-&gt;flowi4_oif;
fl4-&gt;flowi4_iif = LOOPBACK_IFINDEX;
fl4-&gt;flowi4_tos = tos &amp; IPTOS_RT_MASK;
fl4-&gt;flowi4_scope = ((tos &amp; RTO_ONLINK) ?
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
rcu_read_lock();
if (fl4-&gt;saddr) {
rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4-&gt;saddr) ||
ipv4_is_lbcast(fl4-&gt;saddr) ||
ipv4_is_zeronet(fl4-&gt;saddr))
goto out;
/* I removed check for oif == dev_out-&gt;oif here.
It was wrong for two reasons:
1. ip_dev_find(net, saddr) can return wrong iface, if saddr
is assigned to multiple interfaces.
2. Moreover, we are allowed to send packets with saddr
of another iface. --ANK
*/
if (fl4-&gt;flowi4_oif == 0 &amp;&amp;
(ipv4_is_multicast(fl4-&gt;daddr) ||
ipv4_is_lbcast(fl4-&gt;daddr))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = __ip_dev_find(net, fl4-&gt;saddr, false);
if (dev_out == NULL)
goto out;
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
This hack is not just for fun, it allows
vic,vat and friends to work.
They bind socket to loopback, set ttl to zero
and expect that it will work.
From the viewpoint of routing cache they are broken,
because we are not allowed to build multicast path
with loopback source addr (look, routing cache
cannot know, that ttl is zero, so that packet
will not leave this host and route is valid).
Luckily, this hack is good workaround.
*/
fl4-&gt;flowi4_oif = dev_out-&gt;ifindex;
goto make_route;
}
if (!(fl4-&gt;flowi4_flags &amp; FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
if (!__ip_dev_find(net, fl4-&gt;saddr, false))
goto out;
}
}
if (fl4-&gt;flowi4_oif) {
dev_out = dev_get_by_index_rcu(net, fl4-&gt;flowi4_oif);
rth = ERR_PTR(-ENODEV);
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
if (!(dev_out-&gt;flags &amp; IFF_UP) || !__in_dev_get_rcu(dev_out)) {
rth = ERR_PTR(-ENETUNREACH);
goto out;
}
if (ipv4_is_local_multicast(fl4-&gt;daddr) ||
ipv4_is_lbcast(fl4-&gt;daddr)) {
if (!fl4-&gt;saddr)
fl4-&gt;saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
if (!fl4-&gt;saddr) {
if (ipv4_is_multicast(fl4-&gt;daddr))
fl4-&gt;saddr = inet_select_addr(dev_out, 0,
fl4-&gt;flowi4_scope);
else if (!fl4-&gt;daddr)
fl4-&gt;saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
}
if (!fl4-&gt;daddr) {
fl4-&gt;daddr = fl4-&gt;saddr;
if (!fl4-&gt;daddr)
fl4-&gt;daddr = fl4-&gt;saddr = htonl(INADDR_LOOPBACK);
dev_out = net-&gt;loopback_dev;
fl4-&gt;flowi4_oif = LOOPBACK_IFINDEX;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
if (fib_lookup(net, fl4, &amp;res)) { // FIB route lookup, continue here
res.fi = NULL;
res.table = NULL;
if (fl4-&gt;flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
WHY? DW.
Because we are allowed to send to iface
even if it has NO routes and NO assigned
addresses. When oif is specified, routing
tables are looked up with only one purpose:
to catch if destination is gatewayed, rather than
direct. Moreover, if MSG_DONTROUTE is set,
we send packet, ignoring both routing tables
and ifaddr state. --ANK
We could make it even if oif is unknown,
likely IPv6, but we do not.
*/
if (fl4-&gt;saddr == 0)
fl4-&gt;saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
res.type = RTN_UNICAST;
goto make_route;
}
rth = ERR_PTR(-ENETUNREACH);
goto out;
}
if (res.type == RTN_LOCAL) { // destined for the local host
if (!fl4-&gt;saddr) {
if (res.fi-&gt;fib_prefsrc)
fl4-&gt;saddr = res.fi-&gt;fib_prefsrc;
else
fl4-&gt;saddr = fl4-&gt;daddr;
}
dev_out = net-&gt;loopback_dev; // loopback
fl4-&gt;flowi4_oif = dev_out-&gt;ifindex;
flags |= RTCF_LOCAL;
goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi-&gt;fib_nhs &gt; 1 &amp;&amp; fl4-&gt;flowi4_oif == 0)
fib_select_multipath(&amp;res);
else
#endif
if (!res.prefixlen &amp;&amp;
res.table-&gt;tb_num_default &gt; 1 &amp;&amp;
res.type == RTN_UNICAST &amp;&amp; !fl4-&gt;flowi4_oif)
fib_select_default(&amp;res);
if (!fl4-&gt;saddr)
fl4-&gt;saddr = FIB_RES_PREFSRC(net, res);
dev_out = FIB_RES_DEV(res);
fl4-&gt;flowi4_oif = dev_out-&gt;ifindex;
make_route:
rth = __mkroute_output(&amp;res, fl4, orig_oif, dev_out, flags); // route resolved, continue here
out:
rcu_read_unlock();
return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
// file: include/net/ip_fib.h
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res)
{
struct fib_table *table;
table = fib_get_table(net, RT_TABLE_LOCAL);
if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF)) // the actual trie search is left to fib_table_lookup
return 0;
table = fib_get_table(net, RT_TABLE_MAIN);
if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
return 0;
return -ENETUNREACH;
}
// file: net/ipv4/route.c
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
const struct flowi4 *fl4, int orig_oif,
struct net_device *dev_out,
unsigned int flags)
{
struct fib_info *fi = res-&gt;fi;
struct fib_nh_exception *fnhe;
struct in_device *in_dev;
u16 type = res-&gt;type;
struct rtable *rth;
bool do_cache;
in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
return ERR_PTR(-EINVAL);
if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
if (ipv4_is_loopback(fl4-&gt;saddr) &amp;&amp; !(dev_out-&gt;flags &amp; IFF_LOOPBACK))
return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl4-&gt;daddr))
type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl4-&gt;daddr))
type = RTN_MULTICAST;
else if (ipv4_is_zeronet(fl4-&gt;daddr))
return ERR_PTR(-EINVAL);
if (dev_out-&gt;flags &amp; IFF_LOOPBACK)
flags |= RTCF_LOCAL;
do_cache = true;
if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
fi = NULL;
} else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc_rcu(in_dev, fl4-&gt;daddr, fl4-&gt;saddr,
fl4-&gt;flowi4_proto))
flags &amp;= ~RTCF_LOCAL;
else
do_cache = false;
/* If multicast route do not exist use
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
if (fi &amp;&amp; res-&gt;prefixlen &lt; 4)
fi = NULL;
} else if ((type == RTN_LOCAL) &amp;&amp; (orig_oif != 0) &amp;&amp;
(orig_oif != dev_out-&gt;ifindex)) {
/* For local routes that require a particular output interface
* we do not want to cache the result. Caching the result
* causes incorrect behaviour when there are multiple source
* addresses on the interface, the end result being that if the
* intended recipient is waiting on that interface for the
* packet he won't receive it because it will be delivered on
* the loopback interface and the IP_PKTINFO ipi_ifindex will
* be set to the loopback interface as well.
*/
fi = NULL;
}
fnhe = NULL;
do_cache &amp;= fi != NULL;
if (do_cache) {
struct rtable __rcu **prth;
struct fib_nh *nh = &amp;FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4-&gt;daddr);
if (fnhe)
prth = &amp;fnhe-&gt;fnhe_rth;
else {
if (unlikely(fl4-&gt;flowi4_flags &amp;
FLOWI_FLAG_KNOWN_NH &amp;&amp;
!(nh-&gt;nh_gw &amp;&amp;
nh-&gt;nh_scope == RT_SCOPE_LINK))) {
do_cache = false;
goto add;
}
prth = __this_cpu_ptr(nh-&gt;nh_pcpu_rth_output);
}
rth = rcu_dereference(*prth);
if (rt_cache_valid(rth)) {
dst_hold(&amp;rth-&gt;dst);
return rth;
}
}
add:
rth = rt_dst_alloc(dev_out,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
do_cache);
if (!rth)
return ERR_PTR(-ENOBUFS);
rth-&gt;dst.output = ip_output;
rth-&gt;rt_genid = rt_genid(dev_net(dev_out));
rth-&gt;rt_flags = flags;
rth-&gt;rt_type = type;
rth-&gt;rt_is_input = 0;
rth-&gt;rt_iif = orig_oif ? : 0;
rth-&gt;rt_pmtu = 0;
rth-&gt;rt_gateway = 0;
rth-&gt;rt_uses_gateway = 0;
INIT_LIST_HEAD(&amp;rth-&gt;rt_uncached);
RT_CACHE_STAT_INC(out_slow_tot);
if (flags &amp; RTCF_LOCAL)
rth-&gt;dst.input = ip_local_deliver;
if (flags &amp; (RTCF_BROADCAST | RTCF_MULTICAST)) {
if (flags &amp; RTCF_LOCAL &amp;&amp;
!(dev_out-&gt;flags &amp; IFF_LOOPBACK)) {
rth-&gt;dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &amp;&amp;
!ipv4_is_local_multicast(fl4-&gt;daddr)) {
rth-&gt;dst.input = ip_mr_input;
rth-&gt;dst.output = ip_mc_output;
}
}
#endif
}
rt_set_nexthop(rth, fl4-&gt;daddr, res, fnhe, fi, type, 0);
return rth;
}
</code></pre>
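<p>Note that this route lookup also selects the packet's source address: the RTN_LOCAL branch above fills fl4-&gt;saddr from fib_prefsrc, or simply reuses the destination address. One cheap way to observe that decision from userspace is to connect() a UDP socket -- for IPv4 this ends up in ip_route_connect / ip_route_output_flow without sending anything -- and then read back the chosen local address with getsockname(). A sketch follows (the default destination 127.0.0.1 and the discard port 9 are arbitrary assumptions):</p>
<pre><code class="language-c">/* Userspace sketch: let the kernel run its output route lookup via a
 * connected UDP socket and print the source address it selected.
 * No packet is sent; connect() on a UDP socket only resolves the route
 * and binds the local address/port.
 */
#include &lt;arpa/inet.h&gt;
#include &lt;stdio.h&gt;
#include &lt;string.h&gt;
#include &lt;sys/socket.h&gt;
#include &lt;unistd.h&gt;

int main(int argc, char **argv)
{
    const char *dst_ip = argc &gt; 1 ? argv[1] : &quot;127.0.0.1&quot;;

    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd &lt; 0) {
        perror(&quot;socket&quot;);
        return 1;
    }

    struct sockaddr_in dst;
    memset(&amp;dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(9);      /* discard port, arbitrary */
    if (inet_pton(AF_INET, dst_ip, &amp;dst.sin_addr) != 1) {
        fprintf(stderr, &quot;bad address: %s\n&quot;, dst_ip);
        return 1;
    }

    /* Triggers the output route lookup and source address selection. */
    if (connect(fd, (struct sockaddr *)&amp;dst, sizeof(dst)) &lt; 0) {
        perror(&quot;connect&quot;);
        return 1;
    }

    struct sockaddr_in local;
    socklen_t len = sizeof(local);
    if (getsockname(fd, (struct sockaddr *)&amp;local, &amp;len) == 0) {
        char buf[INET_ADDRSTRLEN];
        printf(&quot;destination %s -&gt; source %s\n&quot;, dst_ip,
               inet_ntop(AF_INET, &amp;local.sin_addr, buf, sizeof(buf)));
    }

    close(fd);
    return 0;
}
</code></pre>
<p>For a destination that is one of the host's own addresses, the printed source equals the destination, matching the RTN_LOCAL branch shown above.</p>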
<p>From here, execution likewise reaches ip_finish_output and then enters the neighbour subsystem through its entry function dst_neigh_output.</p>
<p>It then continues into dev_queue_xmit, the entry function of the network device subsystem:</p>
<pre><code class="language-c">// file: net/core/dev.c
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb-&gt;dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
skb_reset_mac_header(skb);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
skb_update_prio(skb);
txq = netdev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq-&gt;qdisc);
#ifdef CONFIG_NET_CLS_ACT
skb-&gt;tc_verd = SET_TC_AT(skb-&gt;tc_verd, AT_EGRESS);
#endif
trace_net_dev_queue(skb);
if (q-&gt;enqueue) { // NULL for the loopback device, so this branch is skipped
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev-&gt;flags &amp; IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq-&gt;xmit_lock_owner != cpu) {
if (__this_cpu_read(xmit_recursion) &gt; RECURSION_LIMIT)
goto recursion_alert;
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq); // continue here
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
net_crit_ratelimited(&quot;Virtual device %s asks to queue packet!\n&quot;,
dev-&gt;name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
net_crit_ratelimited(&quot;Dead loop on virtual device %s, fix it urgently!\n&quot;,
dev-&gt;name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
const struct net_device_ops *ops = dev-&gt;netdev_ops;
int rc = NETDEV_TX_OK;
unsigned int skb_len;
if (likely(!skb-&gt;next)) {
netdev_features_t features;
/*
* If device doesn't need skb-&gt;dst, release it right now while
* its hot in this cpu cache
*/
if (dev-&gt;priv_flags &amp; IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &amp;&amp;
!vlan_hw_offload_capable(features, skb-&gt;vlan_proto)) {
skb = __vlan_put_tag(skb, skb-&gt;vlan_proto,
vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
skb-&gt;vlan_tci = 0;
}
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
* features for the netdev
*/
if (skb-&gt;encapsulation)
features &amp;= dev-&gt;hw_enc_features;
if (netif_needs_gso(skb, features)) {
if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb-&gt;next)
goto gso;
} else {
if (skb_needs_linearize(skb, features) &amp;&amp;
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb-&gt;ip_summed == CHECKSUM_PARTIAL) {
if (skb-&gt;encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (!(features &amp; NETIF_F_ALL_CSUM) &amp;&amp;
skb_checksum_help(skb))
goto out_kfree_skb;
}
}
if (!list_empty(&amp;ptype_all))
dev_queue_xmit_nit(skb, dev);
skb_len = skb-&gt;len;
rc = ops-&gt;ndo_start_xmit(skb, dev); // call the driver's transmit function
trace_net_dev_xmit(skb, rc, dev, skb_len);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
}
gso:
do {
struct sk_buff *nskb = skb-&gt;next;
skb-&gt;next = nskb-&gt;next;
nskb-&gt;next = NULL;
if (!list_empty(&amp;ptype_all))
dev_queue_xmit_nit(nskb, dev);
skb_len = nskb-&gt;len;
rc = ops-&gt;ndo_start_xmit(nskb, dev);
trace_net_dev_xmit(nskb, rc, dev, skb_len);
if (unlikely(rc != NETDEV_TX_OK)) {
if (rc &amp; ~NETDEV_TX_MASK)
goto out_kfree_gso_skb;
nskb-&gt;next = skb-&gt;next;
skb-&gt;next = nskb;
return rc;
}
txq_trans_update(txq);
if (unlikely(netif_xmit_stopped(txq) &amp;&amp; skb-&gt;next))
return NETDEV_TX_BUSY;
} while (skb-&gt;next);
out_kfree_gso_skb:
if (likely(skb-&gt;next == NULL)) {
skb-&gt;destructor = DEV_GSO_CB(skb)-&gt;destructor;
consume_skb(skb);
return rc;
}
out_kfree_skb:
kfree_skb(skb);
out:
return rc;
}
// file: drivers/net/loopback.c
static const struct net_device_ops loopback_ops = {
.ndo_init = loopback_dev_init,
.ndo_start_xmit= loopback_xmit,
.ndo_get_stats64 = loopback_get_stats64,
};
/*
* The higher levels take care of making this non-reentrant (it's
* called with bh's disabled).
*/
static netdev_tx_t loopback_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct pcpu_lstats *lb_stats;
int len;
skb_orphan(skb); // detach the skb from its original socket
/* Before queueing this packet to netif_rx(),
* make sure dst is refcounted.
*/
skb_dst_force(skb);
skb-&gt;protocol = eth_type_trans(skb, dev);
/* it's OK to use per_cpu_ptr() because BHs are off */
lb_stats = this_cpu_ptr(dev-&gt;lstats);
len = skb-&gt;len;
if (likely(netif_rx(skb) == NET_RX_SUCCESS)) { // continue here
u64_stats_update_begin(&amp;lb_stats-&gt;syncp);
lb_stats-&gt;bytes += len;
lb_stats-&gt;packets++;
u64_stats_update_end(&amp;lb_stats-&gt;syncp);
}
return NETDEV_TX_OK;
}
// file: net/core/dev.c
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
int ret;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
if (static_key_false(&amp;rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &amp;voidflow;
int cpu;
preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb-&gt;dev, skb, &amp;rflow);
if (cpu &lt; 0)
cpu = smp_processor_id();
ret = enqueue_to_backlog(skb, cpu, &amp;rflow-&gt;last_qtail);
rcu_read_unlock();
preempt_enable();
} else
#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &amp;qtail); // continue here
put_cpu();
}
return ret;
}
EXPORT_SYMBOL(netif_rx);
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
sd = &amp;per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
if (skb_queue_len(&amp;sd-&gt;input_pkt_queue) &lt;= netdev_max_backlog) { // try to enqueue onto sd-&gt;input_pkt_queue
if (skb_queue_len(&amp;sd-&gt;input_pkt_queue)) {
enqueue:
__skb_queue_tail(&amp;sd-&gt;input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &amp;sd-&gt;backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &amp;sd-&gt;backlog); // raise NET_RX_SOFTIRQ; the softirq runs the poll function of every entry on poll_list. Note that sd-&gt;backlog itself is added to sd-&gt;poll_list here
}
goto enqueue;
}
sd-&gt;dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&amp;skb-&gt;dev-&gt;rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&amp;napi-&gt;poll_list, &amp;sd-&gt;poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
</code></pre>
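<p>____napi_schedule only queues sd-&gt;backlog on the per-CPU poll_list and raises NET_RX_SOFTIRQ. When the softirq runs, net_rx_action calls the backlog's poll function (process_backlog), which drains sd-&gt;input_pkt_queue and hands each skb to __netif_receive_skb -- from that point on the packet is received exactly as if it had arrived on a physical NIC. The per-CPU counters touched in enqueue_to_backlog, including sd-&gt;dropped when netdev_max_backlog is exceeded, are exported through /proc/net/softnet_stat; a small reader sketch follows (the column layout below matches softnet_seq_show in 3.x kernels, but treat it as an assumption to verify against your kernel version):</p>
<pre><code class="language-c">/* Sketch: print the first three columns of /proc/net/softnet_stat per CPU.
 * Assumed layout (softnet_seq_show, 3.x kernels): column 1 = packets
 * processed, column 2 = sd-&gt;dropped, column 3 = time_squeeze.
 * Values in the file are hexadecimal.
 */
#include &lt;stdio.h&gt;

int main(void)
{
    FILE *fp = fopen(&quot;/proc/net/softnet_stat&quot;, &quot;r&quot;);
    if (!fp) {
        perror(&quot;fopen&quot;);
        return 1;
    }

    char line[512];
    int cpu = 0;
    while (fgets(line, sizeof(line), fp)) {
        unsigned int processed, dropped, squeezed;
        /* Each line corresponds to one CPU's softnet_data. */
        if (sscanf(line, &quot;%x %x %x&quot;, &amp;processed, &amp;dropped, &amp;squeezed) == 3)
            printf(&quot;cpu%-3d processed=%u dropped=%u time_squeeze=%u\n&quot;,
                   cpu, processed, dropped, squeezed);
        cpu++;
    }

    fclose(fp);
    return 0;
}
</code></pre>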