负载
<h2>概述</h2>
<p>应用层看到的 CPU 使用情况是从 <code>/proc/stat</code> 中读取并统计出来的;其中的数据是在时钟 TICK 中断中更新的,本质上属于高频抽样。</p>
<p><code>nice</code> 本质上是属于 <code>user</code> 的,只是单独将 nice &gt; 0 的情况拿出来统计,可以通过 top 来查看;
<code>iowait</code> 本质上是属于 <code>idle</code> 的,只是单独将有 iowait 的情况拿出来统计;
<code>sys</code> 就是内核态中除去硬中断和软中断之后的部分,如系统调用、内核线程等。</p>
<h2>分析(基于4.19)</h2>
<pre><code class="language-c">// file: fs/proc/stat.c
/*
 * Init-time hook (marked __init) that creates the proc entry named stat,
 * with mode 0 and a NULL parent, using proc_stat_operations as its file
 * operations. The result of proc_create() is not checked here; the function
 * always returns 0.
 */
static int __init proc_stat_init(void)
{
proc_create(&quot;stat&quot;, 0, NULL, &amp;proc_stat_operations);
return 0;
}
/*
 * File operations registered for the stat proc entry. Only open is specific
 * to this file; read, llseek and release are the generic seq_read,
 * seq_lseek and single_release helpers.
 */
static const struct file_operations proc_stat_operations = {
.open = stat_open, /* sizes the output buffer and binds show_stat */
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
 * open handler for the stat entry: estimates an output buffer size and
 * passes it, together with show_stat as the show callback, to
 * single_open_size(), whose result is returned unchanged.
 *
 * The estimate is 1024 bytes, plus 128 bytes per online CPU, plus 2 bytes
 * per interrupt number (nr_irqs).
 */
static int stat_open(struct inode *inode, struct file *file)
{
unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
size += 2 * nr_irqs;
return single_open_size(file, show_stat, NULL, size); // show_stat is the callback that generates the file content
}
/*
 * seq_file show callback that emits the whole content of the stat entry.
 *
 * Output, in order:
 *   - one aggregate cpu line: each field is the sum of that counter over
 *     all possible CPUs;
 *   - one cpuN line per online CPU with that CPU's own counters;
 *   - an intr line: the grand total of interrupts followed by one value per
 *     interrupt number from kstat_irqs_usr();
 *   - ctxt, btime, processes, procs_running and procs_blocked lines;
 *   - a softirq line: the grand total followed by one value per softirq type.
 *
 * Every CPU time value is passed through nsec_to_clock_t() before printing.
 * The v argument is unused. Always returns 0.
 */
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
u64 user, nice, system, idle, iowait, irq, softirq, steal;
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec64 boottime;
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
guest = guest_nice = 0;
getboottime64(&amp;boottime);
/* Pass 1: accumulate totals across every possible CPU. */
for_each_possible_cpu(i) {
user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; // read the counters stored in kcpustat_cpu(i).cpustat
nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
/* idle and iowait come from helpers instead of a direct cpustat read */
idle += get_idle_time(i);
iowait += get_iowait_time(i);
irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
/* per-type softirq counts, plus their grand total */
for (j = 0; j &lt; NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
per_softirq_sums[j] += softirq_stat;
sum_softirq += softirq_stat;
}
}
sum += arch_irq_stat();
seq_put_decimal_ull(p, &quot;cpu &quot;, nsec_to_clock_t(user)); // aggregate usage of all CPUs comes first; each value is converted to clock ticks and printed
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(nice));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(system));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(idle));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(iowait));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(irq));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(softirq));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(steal));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(guest));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(guest_nice));
seq_putc(p, '\n');
/* Pass 2: one line per online CPU, reusing the same local variables. */
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(i);
iowait = get_iowait_time(i);
irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, &quot;cpu%d&quot;, i);
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(user));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(nice));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(system));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(idle));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(iowait));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(irq));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(softirq));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(steal));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(guest));
seq_put_decimal_ull(p, &quot; &quot;, nsec_to_clock_t(guest_nice));
seq_putc(p, '\n');
}
/* intr line: grand total first, then one value per interrupt number */
seq_put_decimal_ull(p, &quot;intr &quot;, (unsigned long long)sum);
/* sum again ? it could be updated? */
for_each_irq_nr(j)
seq_put_decimal_ull(p, &quot; &quot;, kstat_irqs_usr(j));
/* The leading newline in the format string terminates the intr line. */
seq_printf(p,
&quot;\nctxt %llu\n&quot;
&quot;btime %llu\n&quot;
&quot;processes %lu\n&quot;
&quot;procs_running %lu\n&quot;
&quot;procs_blocked %lu\n&quot;,
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
nr_iowait());
/* softirq line: grand total first, then one value per softirq type */
seq_put_decimal_ull(p, &quot;softirq &quot;, (unsigned long long)sum_softirq);
for (i = 0; i &lt; NR_SOFTIRQS; i++)
seq_put_decimal_ull(p, &quot; &quot;, per_softirq_sums[i]);
seq_putc(p, '\n');
return 0;
}</code></pre>
<p>那么 <code>kcpustat_cpu(i).cpustat</code> 里的值是在什么时候更新的呢?</p>
<h2>kcpustat_cpu(i).cpustat 更新</h2>
<p>每次时钟 TICK 中断到来时,都会调用 <code>update_process_times</code>。</p>
<pre><code class="language-c">// file: kernel/time/timer.c
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 *
 * Besides the CPU time accounting, the same call also runs the local
 * timers, the RCU callback check, the irq_work tick (only when
 * CONFIG_IRQ_WORK is set and in_irq() is true), the scheduler tick and,
 * when CONFIG_POSIX_TIMERS is enabled, the POSIX CPU timers.
 */
void update_process_times(int user_tick)
{
struct task_struct *p = current;
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick); // continue here: this call charges the tick to the cpustat counters
run_local_timers();
rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
#endif
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers(p);
}
// file: kernel/sched/cputime.c
/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 *
 * The tick is classified as user, system or idle time and handed to the
 * matching account_*_time() helper. Nothing is accounted here when
 * vtime_accounting_cpu_enabled() is true, and the whole job is delegated to
 * irqtime_account_process_tick() when sched_clock_irqtime is set.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
struct rq *rq = this_rq();
/* NOTE(review): presumably vtime accounting charges time elsewhere - confirm */
if (vtime_accounting_cpu_enabled())
return;
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
cputime = TICK_NSEC; // duration of one tick, in nanoseconds
steal = steal_account_process_time(ULONG_MAX); // NOTE(review): presumably the time stolen by the hypervisor, capped at ULONG_MAX - confirm in steal_account_process_time()
/* The steal time covers the whole tick: nothing is left to charge. */
if (steal &gt;= cputime)
return;
cputime -= steal;
if (user_tick)
account_user_time(p, cputime); // 1. user mode
/*
 * Otherwise the tick is system time, unless p is this runqueue's idle task
 * and irq_count() is exactly HARDIRQ_OFFSET. NOTE(review): that value
 * presumably means the timer interrupt is the only interrupt context
 * active - confirm.
 */
else if ((p != rq-&gt;idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime); // 2. kernel mode. Macro definition: #define HARDIRQ_OFFSET (1UL &lt;&lt; HARDIRQ_SHIFT)
else
account_idle_time(cputime); // 3. idle time
}
/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 *
 * The time is added to p-&gt;utime and to the group accounting, and then to
 * either the CPUTIME_NICE or the CPUTIME_USER cpustat field depending on
 * the task's nice value.
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
int index;
/* Add user time to process. */
p-&gt;utime += cputime;
account_group_user_time(p, cputime);
index = (task_nice(p) &gt; 0) ? CPUTIME_NICE : CPUTIME_USER; // user time is split into these two fields. Note: the larger the nice value, the lower the priority
/* Add user time to cpustat. */
task_group_account_field(p, index, cputime); // this is where the time lands in kernel_cpustat.cpustat
/* Account for user time used */
acct_account_cputime(p);
}
/*
 * Add tmp to the cpustat field selected by index: first in this CPU's
 * kernel_cpustat, then through cgroup_account_cputime_field() for task p.
 */
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
/*
 * Since all updates are sure to touch the root cgroup, we
 * get ourselves ahead and touch it first. If the root cgroup
 * is the only cgroup, then nothing else should be necessary.
 *
 */
__this_cpu_add(kernel_cpustat.cpustat[index], tmp); // accumulate into kernel_cpustat.cpustat
cgroup_account_cputime_field(p, index, tmp);
}
/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 *
 * The time goes to guest accounting when p has PF_VCPU set and no interrupt
 * context remains after subtracting hardirq_offset. Otherwise it is charged
 * to CPUTIME_IRQ, CPUTIME_SOFTIRQ or CPUTIME_SYSTEM, checked in that order.
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
int index;
if ((p-&gt;flags &amp; PF_VCPU) &amp;&amp; (irq_count() - hardirq_offset == 0)) { // per the original annotation, irq_count() is (preempt_count() &amp; (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))
account_guest_time(p, cputime);
return;
}
if (hardirq_count() - hardirq_offset) // per the original annotation, hardirq_count() is (preempt_count() &amp; HARDIRQ_MASK). NOTE(review): open question from the author - can hardirqs nest here?
index = CPUTIME_IRQ;
else if (in_serving_softirq()) // per the original annotation, this is (softirq_count() &amp; SOFTIRQ_OFFSET)
index = CPUTIME_SOFTIRQ;
else
index = CPUTIME_SYSTEM; // sys: kernel mode minus hardirq minus softirq, e.g. system calls
account_system_index_time(p, cputime, index);
}
/*
 * Account system CPU time to a process and desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: the cpustat field (an enum cpu_usage_stat value, not a pointer)
 *         that has to be updated
 */
void account_system_index_time(struct task_struct *p,
u64 cputime, enum cpu_usage_stat index)
{
/* Add system time to process. */
p-&gt;stime += cputime;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
task_group_account_field(p, index, cputime); // add cputime to the cpustat field selected by index
/* Account for system time used */
acct_account_cputime(p);
}
/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 *
 * The time is charged to this CPU's CPUTIME_IOWAIT field when the
 * runqueue's nr_iowait counter is above zero, and to CPUTIME_IDLE
 * otherwise.
 */
void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu-&gt;cpustat;
struct rq *rq = this_rq();
if (atomic_read(&amp;rq-&gt;nr_iowait) &gt; 0)
cpustat[CPUTIME_IOWAIT] += cputime; // iowait: idle while nr_iowait on this runqueue is non-zero
else
cpustat[CPUTIME_IDLE] += cputime; // idle
}</code></pre>