<h1>Kernel panic handling</h1>
<p>The walkthrough below uses Linux 4.19 on the x86 architecture as its example.</p>
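<p>To observe this path first-hand, the simplest trigger is a throwaway module that calls panic() directly. A minimal sketch follows; the module name panic_demo is made up, and it should only ever be loaded in a disposable VM:</p>
<pre><code class="language-c">// file: panic_demo.c -- hypothetical out-of-tree test module (NOT kernel source)
#include &lt;linux/init.h&gt;
#include &lt;linux/kernel.h&gt;
#include &lt;linux/module.h&gt;

static int __init panic_demo_init(void)
{
	/* panic() never returns; the machine dumps state and halts or reboots */
	panic(&quot;panic_demo: deliberate test panic&quot;);
	return 0; /* never reached */
}
module_init(panic_demo_init);

MODULE_LICENSE(&quot;GPL&quot;);</code></pre>
<p>With a crash kernel loaded (kexec -p), insmod on this module drops straight into the panic() flow analyzed below. Alternatively, with panic_on_oops set, echo c &gt; /proc/sysrq-trigger forces a crash that ends in the same path.</p>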
<h2>The panic() flow</h2>
<p>It starts with a call to panic():</p>
<pre><code class="language-c">// file: kernel/panic.c
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
void panic(const char *fmt, ...)
{
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; // kernel boot parameter, normally false; check with: cat /sys/module/kernel/parameters/crash_kexec_post_notifiers
if (panic_on_warn) {
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
* system on this thread. Other threads are blocked by the
* panic_mutex in panic().
*/
panic_on_warn = 0;
}
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
* after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*
* Only one CPU is allowed to execute the panic code from here. For
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
* `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
* comes here, so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
this_cpu = raw_smp_processor_id();
old_cpu = atomic_cmpxchg(&amp;panic_cpu, PANIC_CPU_INVALID, this_cpu); // atomically claim panic_cpu for this CPU
if (old_cpu != PANIC_CPU_INVALID &amp;&amp; old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
pr_emerg(&quot;Kernel panic - not syncing: %s\n&quot;, buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
if (!test_taint(TAINT_DIE) &amp;&amp; oops_in_progress &lt;= 1)
dump_stack();
#endif
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the &quot;crash_kexec_post_notifiers&quot; option to the kernel.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) { // taken in practice, since the parameter defaults to false
printk_safe_flush_on_panic();
__crash_kexec(NULL); // analyzed below
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a
* panic situation.
*/
smp_send_stop();
} else {
/*
* If we want to do crash dump after notifier calls and
* kmsg_dump, we will need architecture dependent extra
* works in addition to stopping other CPUs.
*/
crash_smp_send_stop(); // yes: stop the other CPUs here, so kdump can still run after the notifiers
}
/*
* Run any panic handlers, including those that might need to
* add information to the kmsg dump output.
*/
atomic_notifier_call_chain(&amp;panic_notifier_list, 0, buf);
/* Call flush even twice. It tries harder with a single online CPU */
printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
* If you doubt kdump always works fine in any situation,
* &quot;crash_kexec_post_notifiers&quot; offers you a chance to run
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
#ifdef CONFIG_VT
unblank_screen();
#endif
console_unblank();
/*
* We may have ended up stopping the CPU holding the lock (in
* smp_send_stop()) while still having some valuable data in the console
* buffer. Try to acquire the lock then release it regardless of the
* result. The release will also print the buffers out. Locks debug
* should be disabled to avoid reporting bad unlock balance when
* panic() is not being called from OOPS.
*/
debug_locks_off();
console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
if (panic_timeout &gt; 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the &quot;normal&quot; timers since we just panicked.
*/
pr_emerg(&quot;Rebooting in %d seconds..\n&quot;, panic_timeout);
for (i = 0; i &lt; panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
if (i &gt;= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
pr_emerg(&quot;Press Stop-A (L1-A) from sun keyboard or send break\n&quot;
&quot;twice on console to return to the boot prom\n&quot;);
}
#endif
#if defined(CONFIG_S390)
{
unsigned long caller;
caller = (unsigned long)__builtin_return_address(0);
disabled_wait(caller);
}
#endif
pr_emerg(&quot;---[ end Kernel panic - not syncing: %s ]---\n&quot;, buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
if (i &gt;= i_next) {
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
EXPORT_SYMBOL(panic);</code></pre>
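<p>Two helpers referenced in the comments above are worth seeing. nmi_panic() is the NMI-context wrapper that claims panic_cpu (which explains the old_cpu == this_cpu case), and panic_smp_self_stop() is the weak default the losing CPUs use to park themselves. A sketch of both, paraphrased from the same kernel/panic.c (verify against your exact tree):</p>
<pre><code class="language-c">// file: kernel/panic.c (sketch, paraphrased)
/*
 * Variant of panic() usable from NMI context: claim panic_cpu first.
 * If we win the cmpxchg we run the full panic(); if another CPU
 * already won, park this CPU instead.
 */
void nmi_panic(struct pt_regs *regs, const char *msg)
{
	int old_cpu, cpu;

	cpu = raw_smp_processor_id();
	old_cpu = atomic_cmpxchg(&amp;panic_cpu, PANIC_CPU_INVALID, cpu);

	if (old_cpu == PANIC_CPU_INVALID)
		panic(&quot;%s&quot;, msg);          /* this CPU is the 1st to panic */
	else if (old_cpu != cpu)
		nmi_panic_self_stop(regs); /* another CPU panics; stop here */
}

/*
 * Default way for a losing CPU to stop itself on the panic path;
 * architectures may override this weak symbol.
 */
void __weak panic_smp_self_stop(void)
{
	while (1)
		cpu_relax();
}</code></pre>
<p>Back on the winning CPU, __crash_kexec() hands control to the loaded crash kernel:</p>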
<pre><code class="language-c">// file: kernel/kexec_core.c
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
* which processes crash_kexec routines.
*/
void __noclone __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
* If the crash kernel was not located in a fixed area
* of memory the xchg(&amp;kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
if (mutex_trylock(&amp;kexec_mutex)) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
crash_setup_regs(&amp;fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&amp;fixed_regs); // analyzed next
machine_kexec(kexec_crash_image);
}
mutex_unlock(&amp;kexec_mutex);
}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
</code></pre>
<p>On x86:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
}
#endif</code></pre>
<p>The assignment happens here; crash_shutdown corresponds to native_machine_crash_shutdown:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};</code></pre>
<p>Continuing:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
void native_machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
* has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
* In practice this means shooting down the other cpus in
* an SMP system.
*/
/* The kernel is broken so disable interrupts */
local_irq_disable();
crash_smp_send_stop(); // send an NMI IPI to stop the other CPUs
/*
* VMCLEAR VMCSs loaded on this cpu if needed.
*/
cpu_crash_vmclear_loaded_vmcss();
cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
clear_IO_APIC();
#endif
lapic_shutdown();
restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
/* Override the weak function in kernel/panic.c */
void crash_smp_send_stop(void)
{
static int cpus_stopped;
if (cpus_stopped)
return;
if (smp_ops.crash_stop_other_cpus) // architecture-specific hook
smp_ops.crash_stop_other_cpus();
else
smp_send_stop();
cpus_stopped = 1;
}</code></pre>
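<p>The x86 version above overrides a weak default. For comparison, the generic fallback in kernel/panic.c simply reuses smp_send_stop() (sketch, paraphrased from the same kernel generation):</p>
<pre><code class="language-c">// file: kernel/panic.c (sketch, paraphrased)
void __weak crash_smp_send_stop(void)
{
	static int cpus_stopped;

	/* May be called twice on the panic path; only act once. */
	if (cpus_stopped)
		return;

	/*
	 * smp_send_stop() is the usual SMP shutdown function, which
	 * unfortunately means it may not be hardened to work in a
	 * panic situation.
	 */
	smp_send_stop();
	cpus_stopped = 1;
}</code></pre>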
<p>Where is this hook assigned? In the x86 smp_ops definition:</p>
<pre><code class="language-c">// file: arch/x86/kernel/smp.c
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
#if defined(CONFIG_KEXEC_CORE)
.crash_stop_other_cpus = kdump_nmi_shootdown_cpus, // the hook taken on crash
#endif
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
.cpu_die = native_cpu_die,
.cpu_disable = native_cpu_disable,
.play_dead = native_play_dead,
.send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);</code></pre>
<p>Continuing:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
/**
* nmi_shootdown_cpus - Stop other CPUs via NMI
* @callback: Optional callback to be invoked from the NMI handler
*
* The NMI handler on the remote CPUs invokes @callback, if not
* NULL, first and then disables virtualization to ensure that
* INIT is recognized during reboot.
*
* nmi_shootdown_cpus() can only be invoked once. After the first
* invocation all other CPUs are stuck in crash_nmi_callback() and
* cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
local_irq_disable();
/*
* Avoid certain doom if a shootdown already occurred; re-registering
* the NMI handler will cause list corruption, modifying the callback
* will do who knows what, etc...
*/
if (WARN_ON_ONCE(crash_ipi_issued))
return;
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
shootdown_callback = callback; // consumed by the NMI handler below
atomic_set(&amp;waiting_for_crash_ipi, num_online_cpus() - 1);
/* Would it be better to replace the trap vector here? */
if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
NMI_FLAG_FIRST, &quot;crash&quot;)) // register crash_nmi_callback as the first NMI handler (see sketch below)
return; /* Return what? */
/*
* Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
smp_send_nmi_allbutself(); // send an NMI to every CPU except this one
/* Kick CPUs looping in NMI context. */
WRITE_ONCE(crash_ipi_issued, 1);
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&amp;waiting_for_crash_ipi) &gt; 0) &amp;&amp; msecs) {
mdelay(1);
msecs--;
}
/*
* Leave the nmi callback set, shootdown is a one-time thing. Clearing
* the callback could result in a NULL pointer dereference if a CPU
* (finally) responds after the timeout expires.
*/
}
static void smp_send_nmi_allbutself(void)
{
apic-&gt;send_IPI_allbutself(NMI_VECTOR);
}
</code></pre>
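<p>To resolve the question in the comment above: register_nmi_handler() does register an NMI handler. It is a macro in arch/x86/include/asm/nmi.h that wraps the callback in a static struct nmiaction and chains it into the NMI_LOCAL handler list; NMI_FLAG_FIRST puts it at the head of the list so it runs before any other NMI consumer. Roughly (sketch, paraphrased from memory):</p>
<pre><code class="language-c">// file: arch/x86/include/asm/nmi.h (sketch, paraphrased)
#define register_nmi_handler(t, fn, fg, n, init...)	\
({							\
	static struct nmiaction init fn##_na = {	\
		.handler = (fn),			\
		.name = (n),				\
		.flags = (fg),				\
	};						\
	__register_nmi_handler((t), &amp;(fn##_na));	\
})</code></pre>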
<p>The NMI handler registered above:</p>
<pre><code class="language-c">// file: arch/x86/kernel/reboot.c
static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu;
cpu = raw_smp_processor_id();
/*
* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
return NMI_HANDLED;
local_irq_disable();
if (shootdown_callback) // set above to kdump_nmi_callback
shootdown_callback(cpu, regs);
/*
* Prepare the CPU for reboot _after_ invoking the callback so that the
* callback can safely use virtualization instructions, e.g. VMCLEAR.
*/
cpu_emergency_disable_virtualization();
atomic_dec(&amp;waiting_for_crash_ipi);
/* Assume hlt works */
halt();
for (;;)
cpu_relax(); // spin forever if hlt ever returns (see note below)
return NMI_HANDLED;
}</code></pre>
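<p>On the "infinite loop?" question: halt() executes the hlt instruction, which puts the CPU to sleep until the next interrupt or NMI. With interrupts disabled the CPU normally never wakes, but a later NMI can still resume execution after hlt, so the for(;;) cpu_relax() loop guarantees the CPU never returns from the handler. For reference, a sketch of the underlying primitive (paraphrased; some trees add side-channel mitigations here):</p>
<pre><code class="language-c">// file: arch/x86/include/asm/irqflags.h (sketch, paraphrased)
static inline void native_halt(void)
{
	asm volatile(&quot;hlt&quot; : : : &quot;memory&quot;);
}</code></pre>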
<p>Finally, the shootdown callback invoked from that handler:</p>
<pre><code class="language-c">// file: arch/x86/kernel/crash.c
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
if (!user_mode(regs)) {
crash_fixup_ss_esp(&amp;fixed_regs, regs);
regs = &amp;fixed_regs;
}
#endif
crash_save_cpu(regs, cpu); // save this CPU's registers; see the sketch after this block
/*
* VMCLEAR VMCSs loaded on all cpus if needed.
*/
cpu_crash_vmclear_loaded_vmcss();
/*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
disable_local_APIC();
}</code></pre>
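<p>crash_save_cpu(), deferred above, stores the CPU's register state as an ELF NT_PRSTATUS note in the per-CPU crash_notes buffer; the capture kernel later assembles these notes into /proc/vmcore. A sketch, paraphrased from kernel/kexec_core.c (field and helper names as remembered, verify against your tree):</p>
<pre><code class="language-c">// file: kernel/kexec_core.c (sketch, paraphrased)
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu &lt; 0) || (cpu &gt;= nr_cpu_ids))
		return;

	/*
	 * crash_notes is a per-CPU buffer whose physical address is
	 * exported via sysfs, so the capture kernel can locate it.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&amp;prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current-&gt;pid;
	elf_core_copy_kernel_regs(&amp;prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &amp;prstatus, sizeof(prstatus));
	final_note(buf);
}</code></pre>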