<h1>Kernel panic handling</h1>
<p>The walkthrough below uses Linux 4.19 on the x86 architecture as its example.</p>
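<p>To observe this path first-hand, the simplest trigger is a throwaway module that calls panic() directly. A minimal sketch follows; the module name panic_demo is made up, and it should only ever be loaded in a disposable VM:</p>
<pre><code class="language-c">// file: panic_demo.c -- hypothetical out-of-tree test module (NOT kernel source)
#include &lt;linux/init.h&gt;
#include &lt;linux/kernel.h&gt;
#include &lt;linux/module.h&gt;

static int __init panic_demo_init(void)
{
	/* panic() never returns; the machine dumps state and halts or reboots */
	panic(&quot;panic_demo: deliberate test panic&quot;);
	return 0; /* never reached */
}
module_init(panic_demo_init);

MODULE_LICENSE(&quot;GPL&quot;);</code></pre>
<p>With a crash kernel loaded (kexec -p), insmod on this module drops straight into the panic() flow analyzed below. Alternatively, with panic_on_oops set, echo c &gt; /proc/sysrq-trigger forces a crash that ends in the same path.</p>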
<h2>The panic() flow</h2>
<p>It starts with a call to panic():</p>
<pre><code class="language-c">// file: kernel/panic.c
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; // kernel boot parameter, normally false; check with: cat /sys/module/kernel/parameters/crash_kexec_post_notifiers
if (panic_on_warn) {
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
* system on this thread. Other threads are blocked by the
* panic_mutex in panic().
*/
panic_on_warn = 0;
}
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*
* Only one CPU is allowed to execute the panic code from here. For
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
* `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
* comes here, so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
this_cpu = raw_smp_processor_id();
old_cpu = atomic_cmpxchg(&amp;panic_cpu, PANIC_CPU_INVALID, this_cpu); // atomically claim panic_cpu for this CPU
if (old_cpu != PANIC_CPU_INVALID &amp;&amp; old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
pr_emerg(&quot;Kernel panic - not syncing: %s\n&quot;, buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
if (!test_taint(TAINT_DIE) &amp;&amp; oops_in_progress &lt;= 1)
dump_stack();
#endif
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the &quot;crash_kexec_post_notifiers&quot; option to the kernel.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) { // taken in practice, since the parameter defaults to false
printk_safe_flush_on_panic();
__crash_kexec(NULL); // analyzed below
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a
* panic situation.
*/
smp_send_stop();
} else {
/*
* If we want to do crash dump after notifier calls and
* kmsg_dump, we will need architecture dependent extra
* works in addition to stopping other CPUs.
*/
crash_smp_send_stop(); // yes: stop the other CPUs here, so kdump can still run after the notifiers
}
/*
* Run any panic handlers, including those that might need to
* add information to the kmsg dump output.
*/
atomic_notifier_call_chain(&amp;panic_notifier_list, 0, buf);
/* Call flush even twice. It tries harder with a single online CPU */
printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
* If you doubt kdump always works fine in any situation,
* &quot;crash_kexec_post_notifiers&quot; offers you a chance to run
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
#ifdef CONFIG_VT
unblank_screen();
#endif
console_unblank();
/*
* We may have ended up stopping the CPU holding the lock (in
* smp_send_stop()) while still having some valuable data in the console
* buffer. Try to acquire the lock then release it regardless of the
* result. The release will also print the buffers out. Locks debug
* should be disabled to avoid reporting bad unlock balance when
* panic() is not being called from OOPS.
*/
debug_locks_off();
console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
if (panic_timeout &gt; 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the &quot;normal&quot; timers since we just panicked.
*/
pr_emerg(&quot;Rebooting in %d seconds..\n&quot;, panic_timeout);
for (i = 0; i &lt; panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i &gt;= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
pr_emerg(&quot;Press Stop-A (L1-A) from sun keyboard or send break\n&quot;
&quot;twice on console to return to the boot prom\n&quot;);
}
#endif
#if defined(CONFIG_S390)
{
unsigned long caller;
caller = (unsigned long)__builtin_return_address(0);
disabled_wait(caller);
}
#endif
pr_emerg(&quot;---[ end Kernel panic - not syncing: %s ]---\n&quot;, buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
if (i &gt;= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
EXPORT_SYMBOL(panic);</code></pre>
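<p>Two helpers referenced in the comments above are worth seeing. nmi_panic() is the NMI-context wrapper that claims panic_cpu (which explains the old_cpu == this_cpu case), and panic_smp_self_stop() is the weak default the losing CPUs use to park themselves. A sketch of both, paraphrased from the same kernel/panic.c (verify against your exact tree):</p>
<pre><code class="language-c">// file: kernel/panic.c (sketch, paraphrased)
/*
 * Variant of panic() usable from NMI context: claim panic_cpu first.
 * If we win the cmpxchg we run the full panic(); if another CPU
 * already won, park this CPU instead.
 */
void nmi_panic(struct pt_regs *regs, const char *msg)
{
	int old_cpu, cpu;

	cpu = raw_smp_processor_id();
	old_cpu = atomic_cmpxchg(&amp;panic_cpu, PANIC_CPU_INVALID, cpu);

	if (old_cpu == PANIC_CPU_INVALID)
		panic(&quot;%s&quot;, msg);          /* this CPU is the 1st to panic */
	else if (old_cpu != cpu)
		nmi_panic_self_stop(regs); /* another CPU panics; stop here */
}

/*
 * Default way for a losing CPU to stop itself on the panic path;
 * architectures may override this weak symbol.
 */
void __weak panic_smp_self_stop(void)
{
	while (1)
		cpu_relax();
}</code></pre>
<p>Back on the winning CPU, __crash_kexec() hands control to the loaded crash kernel:</p>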
<pre><code class="language-c">// file: kernel/kexec_core.c
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
* which processes crash_kexec routines.
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
* If the crash kernel was not located in a fixed area
* of memory the xchg(&amp;kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
if (mutex_trylock(&amp;kexec_mutex)) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
crash_setup_regs(&amp;fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&amp;fixed_regs); // analyzed next
machine_kexec(kexec_crash_image);
}
mutex_unlock(&amp;kexec_mutex);
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
</code></pre>
<p>On x86:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
}
#endif</code></pre>
<p>The assignment happens here; crash_shutdown corresponds to native_machine_crash_shutdown:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};</code></pre>
<p>Continuing:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
void native_machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
* has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
* In practice this means shooting down the other cpus in
* an SMP system.
*/
/* The kernel is broken so disable interrupts */
local_irq_disable();
crash_smp_send_stop(); // send an NMI IPI to stop the other CPUs
/*
* VMCLEAR VMCSs loaded on this cpu if needed.
*/
cpu_crash_vmclear_loaded_vmcss();
cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
clear_IO_APIC();
#endif
lapic_shutdown();
restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
/* Override the weak function in kernel/panic.c */
void crash_smp_send_stop(void)
{
static int cpus_stopped;
if (cpus_stopped)
return;
if (smp_ops.crash_stop_other_cpus) // architecture-specific hook
smp_ops.crash_stop_other_cpus();
else
smp_send_stop();
cpus_stopped = 1;
}</code></pre>
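<p>The x86 version above overrides a weak default. For comparison, the generic fallback in kernel/panic.c simply reuses smp_send_stop() (sketch, paraphrased from the same kernel generation):</p>
<pre><code class="language-c">// file: kernel/panic.c (sketch, paraphrased)
void __weak crash_smp_send_stop(void)
{
	static int cpus_stopped;

	/* May be called twice on the panic path; only act once. */
	if (cpus_stopped)
		return;

	/*
	 * smp_send_stop() is the usual SMP shutdown function, which
	 * unfortunately means it may not be hardened to work in a
	 * panic situation.
	 */
	smp_send_stop();
	cpus_stopped = 1;
}</code></pre>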
<p>Where is this hook assigned? In the x86 smp_ops definition:</p>
<pre><code class="language-c">// file: arch/x86/kernel/smp.c
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
#if defined(CONFIG_KEXEC_CORE)
.crash_stop_other_cpus = kdump_nmi_shootdown_cpus, // the hook taken on crash
#endif
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
.cpu_die = native_cpu_die,
.cpu_disable = native_cpu_disable,
.play_dead = native_play_dead,
.send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);</code></pre>
<p>Continuing:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
/**
* nmi_shootdown_cpus - Stop other CPUs via NMI
* @callback: Optional callback to be invoked from the NMI handler
*
* The NMI handler on the remote CPUs invokes @callback, if not
* NULL, first and then disables virtualization to ensure that
* INIT is recognized during reboot.
*
* nmi_shootdown_cpus() can only be invoked once. After the first
* invocation all other CPUs are stuck in crash_nmi_callback() and
* cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
local_irq_disable();
/*
* Avoid certain doom if a shootdown already occurred; re-registering
* the NMI handler will cause list corruption, modifying the callback
* will do who knows what, etc...
*/
if (WARN_ON_ONCE(crash_ipi_issued))
return;
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
shootdown_callback = callback; // consumed by the NMI handler below
atomic_set(&amp;waiting_for_crash_ipi, num_online_cpus() - 1);
/* Would it be better to replace the trap vector here? */
if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
NMI_FLAG_FIRST, &quot;crash&quot;)) // register crash_nmi_callback as the first NMI handler (see sketch below)
return; /* Return what? */
/*
* Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
smp_send_nmi_allbutself(); // send an NMI to every CPU except this one
/* Kick CPUs looping in NMI context. */
WRITE_ONCE(crash_ipi_issued, 1);
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&amp;waiting_for_crash_ipi) &gt; 0) &amp;&amp; msecs) {
mdelay(1);
msecs--;
}
/*
* Leave the nmi callback set, shootdown is a one-time thing. Clearing
* the callback could result in a NULL pointer dereference if a CPU
* (finally) responds after the timeout expires.
*/
}
static void smp_send_nmi_allbutself(void)
{
apic-&gt;send_IPI_allbutself(NMI_VECTOR);
}
</code></pre>
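<p>To resolve the question in the comment above: register_nmi_handler() does register an NMI handler. It is a macro in arch/x86/include/asm/nmi.h that wraps the callback in a static struct nmiaction and chains it into the NMI_LOCAL handler list; NMI_FLAG_FIRST puts it at the head of the list so it runs before any other NMI consumer. Roughly (sketch, paraphrased from memory):</p>
<pre><code class="language-c">// file: arch/x86/include/asm/nmi.h (sketch, paraphrased)
#define register_nmi_handler(t, fn, fg, n, init...)	\
({							\
	static struct nmiaction init fn##_na = {	\
		.handler = (fn),			\
		.name = (n),				\
		.flags = (fg),				\
	};						\
	__register_nmi_handler((t), &amp;(fn##_na));	\
})</code></pre>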
<p>The NMI handler registered above:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu;
cpu = raw_smp_processor_id();
/*
* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
return NMI_HANDLED;
local_irq_disable();
if (shootdown_callback) // set above to kdump_nmi_callback
shootdown_callback(cpu, regs);
/*
* Prepare the CPU for reboot _after_ invoking the callback so that the
* callback can safely use virtualization instructions, e.g. VMCLEAR.
*/
cpu_emergency_disable_virtualization();
atomic_dec(&amp;waiting_for_crash_ipi);
/* Assume hlt works */
halt();
for (;;)
cpu_relax(); // spin forever if hlt ever returns (see note below)
return NMI_HANDLED;
}</code></pre>
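<p>On the "infinite loop?" question: halt() executes the hlt instruction, which puts the CPU to sleep until the next interrupt or NMI. With interrupts disabled the CPU normally never wakes, but a later NMI can still resume execution after hlt, so the for(;;) cpu_relax() loop guarantees the CPU never returns from the handler. For reference, a sketch of the underlying primitive (paraphrased; some trees add side-channel mitigations here):</p>
<pre><code class="language-c">// file: arch/x86/include/asm/irqflags.h (sketch, paraphrased)
static inline void native_halt(void)
{
	asm volatile(&quot;hlt&quot; : : : &quot;memory&quot;);
}</code></pre>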
<p>Finally, the shootdown callback invoked from that handler:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
if (!user_mode(regs)) {
crash_fixup_ss_esp(&amp;fixed_regs, regs);
regs = &amp;fixed_regs;
}
#endif
crash_save_cpu(regs, cpu); // save this CPU's registers; see the sketch after this block
/*
* VMCLEAR VMCSs loaded on all cpus if needed.
*/
cpu_crash_vmclear_loaded_vmcss();
/*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
disable_local_APIC();
}</code></pre>
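<p>crash_save_cpu(), deferred above, stores the CPU's register state as an ELF NT_PRSTATUS note in the per-CPU crash_notes buffer; the capture kernel later assembles these notes into /proc/vmcore. A sketch, paraphrased from kernel/kexec_core.c (field and helper names as remembered, verify against your tree):</p>
<pre><code class="language-c">// file: kernel/kexec_core.c (sketch, paraphrased)
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu &lt; 0) || (cpu &gt;= nr_cpu_ids))
		return;

	/*
	 * crash_notes is a per-CPU buffer whose physical address is
	 * exported via sysfs, so the capture kernel can locate it.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&amp;prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current-&gt;pid;
	elf_core_copy_kernel_regs(&amp;prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &amp;prstatus, sizeof(prstatus));
	final_note(buf);
}</code></pre>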