softirq
<h2>Overview</h2>
<p>ksoftirqd is a set of kernel threads (one per CPU) that loop continuously; whenever pending softirqs are detected (via each CPU's pending bitmap), the corresponding softirq handlers are executed.
For networking, the softirqs involved are:
1. NET_TX_SOFTIRQ -> net_tx_action (registered by net/core/dev.c: net_dev_init())
2. NET_RX_SOFTIRQ -> net_rx_action (registered in the same place)</p>
<p>> Some references: <a href="https://zhuanlan.zhihu.com/p/80680484">https://zhuanlan.zhihu.com/p/80680484</a>
> <a href="https://zhuanlan.zhihu.com/p/88883239">https://zhuanlan.zhihu.com/p/88883239</a></p>
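<p>For context, that registration is one line per softirq: <code>net_dev_init()</code> calls <code>open_softirq</code>, which just stores the handler into the <code>softirq_vec</code> array. Abridged from kernels of this era (exact code varies by version):</p>
<pre><code class="language-c">// file: net/core/dev.c (abridged)
static int __init net_dev_init(void)
{
	/* ... per-CPU softnet_data setup elided ... */
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
	/* ... */
	return 0;
}

// file: kernel/softirq.c
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
	softirq_vec[nr].action = action; // record the handler in the per-softirq vector
}
</code></pre>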
<h2>Creating ksoftirqd</h2>
<p>At boot, <code>spawn_ksoftirqd</code> is called to create one ksoftirqd thread per CPU (ncpu threads in total):</p>
<pre><code class="language-c">// file: kernel/softirq.c
static struct smp_hotplug_thread softirq_threads = {
	.store			= &amp;ksoftirqd,
	.thread_should_run	= ksoftirqd_should_run,
	.thread_fn		= run_ksoftirqd, // the main work function
	.thread_comm		= &quot;ksoftirqd/%u&quot;,
};

static __init int spawn_ksoftirqd(void)
{
	register_cpu_notifier(&amp;cpu_nfb);

	BUG_ON(smpboot_register_percpu_thread(&amp;softirq_threads)); // threads are created here

	return 0;
}
early_initcall(spawn_ksoftirqd);
</code></pre>
<p>Call chain: <code>smpboot_register_percpu_thread -&gt; __smpboot_create_thread</code>. The function the created thread actually runs is <code>smpboot_thread_fn</code>, which is a loop that repeatedly decides whether to execute the real work.</p>
<pre><code class="language-c">// file: kernel/smpboot.c
static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht-&gt;store, cpu);
	struct smpboot_thread_data *td;

	if (tsk)
		return 0;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
	if (!td)
		return -ENOMEM;
	td-&gt;cpu = cpu;
	td-&gt;ht = ht;

	tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
				    ht-&gt;thread_comm); // the thread's main function is smpboot_thread_fn, which loops
	if (IS_ERR(tsk)) {
		kfree(td);
		return PTR_ERR(tsk);
	}
	get_task_struct(tsk);
	*per_cpu_ptr(ht-&gt;store, cpu) = tsk;
	if (ht-&gt;create) {
		/*
		 * Make sure that the task has actually scheduled out
		 * into park position, before calling the create
		 * callback. At least the migration thread callback
		 * requires that the task is off the runqueue.
		 */
		if (!wait_task_inactive(tsk, TASK_PARKED))
			WARN_ON(1);
		else
			ht-&gt;create(cpu);
	}
	return 0;
}

/**
 * smpboot_thread_fn - percpu hotplug thread loop function
 * @data: thread data pointer
 *
 * Checks for thread stop and park conditions. Calls the necessary
 * setup, cleanup, park and unpark functions for the registered
 * thread.
 *
 * Returns 1 when the thread should exit, 0 otherwise.
 */
static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td-&gt;ht;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		preempt_disable();
		if (kthread_should_stop()) {
			set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht-&gt;cleanup)
				ht-&gt;cleanup(td-&gt;cpu, cpu_online(td-&gt;cpu));
			kfree(td);
			return 0;
		}

		if (kthread_should_park()) {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht-&gt;park &amp;&amp; td-&gt;status == HP_THREAD_ACTIVE) {
				BUG_ON(td-&gt;cpu != smp_processor_id());
				ht-&gt;park(td-&gt;cpu);
				td-&gt;status = HP_THREAD_PARKED;
			}
			kthread_parkme();
			/* We might have been woken for stop */
			continue;
		}

		BUG_ON(td-&gt;cpu != smp_processor_id());

		/* Check for state change setup */
		switch (td-&gt;status) {
		case HP_THREAD_NONE:
			preempt_enable();
			if (ht-&gt;setup)
				ht-&gt;setup(td-&gt;cpu);
			td-&gt;status = HP_THREAD_ACTIVE;
			preempt_disable();
			break;
		case HP_THREAD_PARKED:
			preempt_enable();
			if (ht-&gt;unpark)
				ht-&gt;unpark(td-&gt;cpu);
			td-&gt;status = HP_THREAD_ACTIVE;
			preempt_disable();
			break;
		}

		// the core logic is here
		if (!ht-&gt;thread_should_run(td-&gt;cpu)) { // the thread_should_run callback
			preempt_enable();
			schedule();
		} else {
			set_current_state(TASK_RUNNING);
			preempt_enable();
			ht-&gt;thread_fn(td-&gt;cpu); // the thread_fn callback
		}
	}
}</code></pre>
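<p>Note the <code>.store</code> field used above: for softirq it points at the per-CPU <code>ksoftirqd</code> task pointer, so the assignment <code>*per_cpu_ptr(ht-&gt;store, cpu) = tsk</code> is what makes each thread findable later (e.g. by <code>wakeup_softirqd</code>). In kernels of this era it is declared as:</p>
<pre><code class="language-c">// file: kernel/softirq.c
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
</code></pre>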
<h2>Run condition</h2>
<p>ksoftirqd runs when the per-CPU <code>__softirq_pending</code> bitmap is non-zero:</p>
<pre><code class="language-c">// file: kernel/softirq.c
static int ksoftirqd_should_run(unsigned int cpu)
{
	return local_softirq_pending();
}
</code></pre>
<pre><code class="language-c">// file: include/linux/irq_cpustat.h
/*
 * Simple wrappers reducing source bloat. Define all irq_stat fields
 * here, even ones that are arch dependent. That way we get common
 * definitions instead of differing sets for each arch.
 */
#ifndef __ARCH_IRQ_STAT
extern irq_cpustat_t irq_stat[];		/* defined in asm/hardirq.h */
#define __IRQ_STAT(cpu, member)	(irq_stat[cpu].member) // the per-CPU __softirq_pending field lives here
#endif

/* arch independent irq_stat fields */
#define local_softirq_pending() \
	__IRQ_STAT(smp_processor_id(), __softirq_pending)</code></pre>
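<p>Each bit of the bitmap corresponds to one softirq number. For reference, the softirq numbers in kernels of this era (the exact list varies by version):</p>
<pre><code class="language-c">// file: include/linux/interrupt.h
enum
{
	HI_SOFTIRQ=0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,		// bit 2: network transmit
	NET_RX_SOFTIRQ,		// bit 3: network receive
	BLOCK_SOFTIRQ,
	BLOCK_IOPOLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ,
	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */

	NR_SOFTIRQS
};
</code></pre>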
<p>So when is the pending bitmap set? One of the entry points is:</p>
<pre><code class="language-c">// file: kernel/softirq.c
void raise_softirq(unsigned int nr)
{
	unsigned long flags;

	local_irq_save(flags);
	raise_softirq_irqoff(nr);
	local_irq_restore(flags);
}

/*
 * This function must run with irqs disabled!
 */
inline void raise_softirq_irqoff(unsigned int nr)
{
	__raise_softirq_irqoff(nr);

	/*
	 * If we're in an interrupt or softirq, we're done
	 * (this also catches softirq-disabled code). We will
	 * actually run the softirq once we return from
	 * the irq or softirq.
	 *
	 * Otherwise we wake up ksoftirqd to make sure we
	 * schedule the softirq soon.
	 */
	if (!in_interrupt())
		wakeup_softirqd();
}

void __raise_softirq_irqoff(unsigned int nr)
{
	trace_softirq_raise(nr);
	or_softirq_pending(1UL &lt;&lt; nr); // set the bit in the pending bitmap
}
</code></pre>
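<p><code>wakeup_softirqd</code> does exactly what its name says: it wakes the per-CPU ksoftirqd task created earlier (again quoted from kernels of this era):</p>
<pre><code class="language-c">// file: kernel/softirq.c
static void wakeup_softirqd(void)
{
	/* Interrupts are disabled: no need to stop preemption */
	struct task_struct *tsk = __this_cpu_read(ksoftirqd);

	if (tsk &amp;&amp; tsk-&gt;state != TASK_RUNNING)
		wake_up_process(tsk);
}
</code></pre>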
<h2>Main work</h2>
<p>The actual work done by ksoftirqd is as follows:</p>
<pre><code class="language-c">// file: kernel/softirq.c
static void run_ksoftirqd(unsigned int cpu)
{
	local_irq_disable();
	if (local_softirq_pending()) { // the run condition
		__do_softirq(); // the actual work
		local_irq_enable();
		cond_resched();

		preempt_disable();
		rcu_note_context_switch(cpu);
		preempt_enable();

		return;
	}
	local_irq_enable();
}

/*
 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
 * but break the loop if need_resched() is set or after 2 ms.
 * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
 * certain cases, such as stop_machine(), jiffies may cease to
 * increment and so we need the MAX_SOFTIRQ_RESTART limit as
 * well to make sure we eventually return from this method.
 *
 * These limits have been established via experimentation.
 * The two things to balance is latency against fairness -
 * we want to handle softirqs as soon as possible, but they
 * should not be able to lock up the box.
 */
#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10

// Note: this does not necessarily run inside ksoftirqd (see the irq_exit() sketch below)
asmlinkage void __do_softirq(void)
{
	struct softirq_action *h;
	__u32 pending;
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	int cpu;
	unsigned long old_flags = current-&gt;flags;
	int max_restart = MAX_SOFTIRQ_RESTART;

	/*
	 * Mask out PF_MEMALLOC as current task context is borrowed for the
	 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
	 * again if the socket is related to swap
	 */
	current-&gt;flags &amp;= ~PF_MEMALLOC;

	pending = local_softirq_pending();
	account_irq_enter_time(current);

	__local_bh_disable((unsigned long)__builtin_return_address(0),
			   SOFTIRQ_OFFSET);
	lockdep_softirq_enter();

	cpu = smp_processor_id();
restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable();

	h = softirq_vec; // the array of registered softirq handlers

	do {
		if (pending &amp; 1) {
			unsigned int vec_nr = h - softirq_vec;
			int prev_count = preempt_count();

			kstat_incr_softirqs_this_cpu(vec_nr);

			trace_softirq_entry(vec_nr);
			h-&gt;action(h); // invoke the handler
			trace_softirq_exit(vec_nr);
			if (unlikely(prev_count != preempt_count())) {
				printk(KERN_ERR &quot;huh, entered softirq %u %s %p&quot;
				       &quot;with preempt_count %08x,&quot;
				       &quot; exited with %08x?\n&quot;, vec_nr,
				       softirq_to_name[vec_nr], h-&gt;action,
				       prev_count, preempt_count());
				preempt_count() = prev_count;
			}

			rcu_bh_qs(cpu);
		}
		h++;
		pending &gt;&gt;= 1; // shift to the next softirq bit
	} while (pending);

	local_irq_disable();

	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) &amp;&amp; !need_resched() &amp;&amp;
		    --max_restart)
			goto restart;

		wakeup_softirqd();
	}

	lockdep_softirq_exit();

	account_irq_exit_time(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}</code></pre>
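<p>As the comment on <code>__do_softirq</code> notes, it does not only run inside ksoftirqd: the other major entry point is hard-interrupt exit. Abridged from kernels of this era (details vary by version and architecture):</p>
<pre><code class="language-c">// file: kernel/softirq.c (abridged)
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
	account_irq_exit_time(current);
	trace_hardirq_exit();
	sub_preempt_count(HARDIRQ_OFFSET);
	if (!in_interrupt() &amp;&amp; local_softirq_pending())
		invoke_softirq(); // ends up in __do_softirq(), still in hardirq-exit context

	/* ... tick / RCU bookkeeping elided ... */
}
</code></pre>
<p>So a softirq raised from a hard interrupt handler is normally processed right at <code>irq_exit()</code>; ksoftirqd is the fallback for softirqs raised from process context, and for the case where <code>__do_softirq</code> gives up after <code>MAX_SOFTIRQ_TIME</code> / <code>MAX_SOFTIRQ_RESTART</code> and calls <code>wakeup_softirqd()</code>.</p>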