Linux中softlockup和hardlockup检测机制

Question

Linux中softlockup和hardlockup检测机制

Opened this issue 3 years ago · 0 comments

什么是softlockup

softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up

什么是hardlockup

hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up

softlockup检测原理

watchdog线程初始化

/*
 * Set up the lockup watchdog: arm this CPU's sampling hrtimer, initialize
 * the touch timestamp, enable the NMI watchdog when NMI_WATCHDOG_ENABLED is
 * set, and request SCHED_FIFO at MAX_RT_PRIO - 1 via watchdog_set_prio().
 *
 * @cpu: CPU number, passed through to watchdog_nmi_enable().
 */
static void watchdog_enable(unsigned int cpu)
{
	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

	/*
	 * Start the timer first to prevent the NMI watchdog triggering
	 * before the timer has a chance to fire.
	 */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;   // hrtimer expiry callback (not a thread function)
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),   // 4 s by default, per the accompanying text
		      HRTIMER_MODE_REL_PINNED);

	/* Initialize timestamp */
	__touch_watchdog();
	/* Enable the perf event */
	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
		watchdog_nmi_enable(cpu);

	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}

hrtimer默认每4秒触发一次（sample_period = softlockup阈值20秒 / 5），到期时执行回调函数watchdog_timer_fn：

/*
 * hrtimer expiry callback (excerpt; "..." marks code omitted from this
 * listing). On each expiry it kicks the hardlockup detector, wakes the task
 * stored in the per-CPU softlockup_watchdog pointer, and reports a soft
 * lockup when is_softlockup() returns a non-zero duration for the touch
 * timestamp read at entry.
 *
 * NOTE(review): duration and regs are used below but not declared in the
 * visible lines; presumably they are declared in the omitted code.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
    /* Snapshot of this CPU's last touch timestamp, taken before the wake-up below. */
    unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
    ...
    ...
            
	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
	wake_up_process(__this_cpu_read(softlockup_watchdog));
	...
    /* Non-zero means touch_ts is older than the soft-lockup threshold. */
    duration = is_softlockup(touch_ts);
    
	if(unlikely(duration)) {
        ...
        pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                 smp_processor_id(), duration,
                 current->comm, task_pid_nr(current));
        __this_cpu_write(softlockup_task_ptr_saved, current);
        print_modules();
        print_irqtrace_events(current);
        /* Show the saved registers when available, otherwise dump the current stack. */
        if (regs)
            show_regs(regs);
        else
            dump_stack();

    }
}

如何判断是否是softlockup

/*
 * Check whether the soft-lockup threshold has been exceeded.
 *
 * @touch_ts: timestamp of the last watchdog touch; compared against the
 *            current get_timestamp() value.
 *
 * Returns the time elapsed since @touch_ts when the soft watchdog is
 * enabled, watchdog_thresh is non-zero and more than
 * get_softlockup_thresh() has passed since @touch_ts; returns 0 otherwise.
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();

	/* Detector switched off or threshold set to zero: never report. */
	if (!(watchdog_enabled & SOFT_WATCHDOG_ENABLED) || !watchdog_thresh)
		return 0;

	/* Still within the allowed window since the last touch. */
	if (!time_after(now, touch_ts + get_softlockup_thresh()))
		return 0;

	/* Warn about unreasonable delays. */
	return now - touch_ts;
}

hrtimer回调中如果发现watchdog_touch_ts时间戳已超过20秒没有更新，说明watchdog线程在这20秒内虽然被唤醒，却一直没有得到调度执行，此时hrtimer回调会打印发生softlockup的CPU的堆栈；如果设置了softlockup_panic，还会同时触发panic。
hrtimer回调每次都会唤醒watchdog线程，线程一旦得到执行，就会更新时间戳：

/*
 * Body of the per-CPU watchdog thread: record the current hrtimer interrupt
 * count in soft_lockup_hrtimer_cnt and refresh the touch timestamp, which is
 * what is_softlockup() later compares against.
 *
 * @cpu: unused in the lines shown here.
 */
static void watchdog(unsigned int cpu)
{
	/* Copy this CPU's hrtimer_interrupts value into soft_lockup_hrtimer_cnt. */
	__this_cpu_write(soft_lockup_hrtimer_cnt,
			 __this_cpu_read(hrtimer_interrupts));
	/* Running at all proves the thread got scheduled, so update the timestamp. */
	__touch_watchdog();
}

在4.18内核上查看watchdog线程调度类

crash> ps | grep watchdog
...
    541      2  88  ffffa05fc421ed00  IN   0.0       0      0  [watchdog/88]
    547      2  89  ffffa05fc421fe00  IN   0.0       0      0  [watchdog/89]
    553      2  90  ffffa05fc4204400  IN   0.0       0      0  [watchdog/90]
    559      2  91  ffffa05fc422fd00  IN   0.0       0      0  [watchdog/91]
    565      2  92  ffffa05fc4224200  IN   0.0       0      0  [watchdog/92]
    571      2  93  ffffa05fc4203300  IN   0.0       0      0  [watchdog/93]
    577      2  94  ffffa05fc4234100  IN   0.0       0      0  [watchdog/94]
    583      2  95  ffffa05fc420dd00  IN   0.0       0      0  [watchdog/95]
    
crash> task_struct.sched_class ffffa05fc420dd00
  sched_class = 0xffff0000109d54f0 <rt_sched_class>

可以看到watchdog线程的调度类是rt.

查看migration线程调度类

crash> ps | grep migration
    536      2  87  ffffa05fc4202200  IN   0.0       0      0  [migration/87]
    542      2  88  ffffa05fc4235200  IN   0.0       0      0  [migration/88]
    548      2  89  ffffa05fc4227500  IN   0.0       0      0  [migration/89]
    554      2  90  ffffa05fc4214300  IN   0.0       0      0  [migration/90]
    560      2  91  ffffa05fc4238500  IN   0.0       0      0  [migration/91]
    566      2  92  ffffa05fc4213200  IN   0.0       0      0  [migration/92]
    572      2  93  ffffa05fc422a800  IN   0.0       0      0  [migration/93]
    578      2  94  ffffa05fc420ee00  IN   0.0       0      0  [migration/94]
    584      2  95  ffffa05fc421cb00  IN   0.0       0      0  [migration/95]
    
crash> task_struct.sched_class ffffa05fc421cb00
  sched_class = 0xffff0000109d5690 <stop_sched_class>

可以看到migration线程的调度类是stop。
而stop调度类的优先级最高。原因可以从schedule的实现中看到：pick_next_task从sched_class_highest开始，沿着next指针依次遍历各个调度类。

/* Call-chain sketch (not compilable): schedule -> pick_next_task asks each class in turn for a task. */
schedule
	pick_next_task
		for_each_class(class) {
			p = class->pick_next_task(rq, prev, rf);
		}

/* Walk the scheduling classes: start at sched_class_highest and follow ->next until it is NULL. */
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
   
/* Head of the class list: the stop class when CONFIG_SMP is set, otherwise the deadline class. */
#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif

stop调度类

/* Stop class (excerpt; "..." marks omitted fields): its ->next link points at the deadline class. */
const struct sched_class stop_sched_class = {
	.next			= &dl_sched_class,
	...
};

dl调度类

/* Deadline class (excerpt; "..." marks omitted fields): its ->next link points at the RT class. */
const struct sched_class dl_sched_class = {
	.next			= &rt_sched_class,
	...
};

rt调度类

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	。。。
}

可以看到调度类优先级从高到低依次是stop、dl、rt、cfs，因此watchdog线程（rt调度类）的优先级低于migration线程（stop调度类）的优先级。
同时可以看到dl调度类的优先级比rt高，因此存在这样一种可能：当dl任务周期性执行并持续占用CPU时，rt调度类的watchdog线程可能得不到调度，从而产生softlockup误报。
查看上游代码，发现有个patch似乎解决了上面这个问题，commit id为9cf57731b63e37ed995b46690adc604891a9a28f。
可以看到，该patch把更新时间戳的动作放到migration线程（stop调度类）里来做，这样就可以抢占dl调度类的线程，避免误报。
内核相关配置

/proc/sys/kernel/soft_watchdog:1
/proc/sys/kernel/softlockup_all_cpu_backtrace:0
/proc/sys/kernel/softlockup_panic:0

migration 线程创建流程

/*
 * Per-CPU hotplug thread descriptor for the CPU stopper ("migration/N")
 * threads; registered through smpboot_register_percpu_thread().
 */
static struct smp_hotplug_thread cpu_stop_threads = {
	.store			= &cpu_stopper.thread,
	/* Polled by smpboot_thread_fn() to decide whether to call thread_fn. */
	.thread_should_run	= cpu_stop_should_run,
	/* Invoked by smpboot_thread_fn() as ht->thread_fn(td->cpu). */
	.thread_fn		= cpu_stopper_thread,
	/* Thread name format; %u is filled with the CPU number (e.g. "migration/87"). */
	.thread_comm		= "migration/%u",
	.create			= cpu_stop_create,
	.park			= cpu_stop_park,
	.selfparking		= true,
};

// kernel/stop_machine.c
early_initcall(cpu_stop_init);
	smpboot_register_percpu_thread(&cpu_stop_threads)
			return smpboot_register_percpu_thread_cpumask(plug_thread, cpu_possible_mask);
					for_each_online_cpu(cpu) {
						__smpboot_create_thread(plug_thread, cpu);
					}
							kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm);
/*
 * Main loop shared by the per-CPU hotplug threads: while the descriptor's
 * thread_should_run() reports no work, yield via schedule(); otherwise mark
 * the task runnable and call the descriptor's thread_fn() for this CPU.
 *
 * @data: struct smpboot_thread_data for this thread (gives ->ht and ->cpu).
 *
 * NOTE(review): this listing appears abridged - preempt_enable*() is called
 * with no matching preempt_disable() visible, and the loop has no exit path.
 */
static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td->ht;

	while (1) {

		if (!ht->thread_should_run(td->cpu)) {
			/* Nothing to do: give up the CPU until woken again. */
			preempt_enable_no_resched();
			schedule();
		} else {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			ht->thread_fn(td->cpu); /* for cpu_stop_threads this is cpu_stopper_thread */
		}
	}
}

cpu_stopper_thread流程比较简单：从stopper的works队列里取出第一个work，然后执行它的回调函数

/*
 * Drain the stopper work list of @cpu: repeatedly detach the first queued
 * cpu_stop_work under stopper->lock, run its callback with the preempt count
 * raised, and signal the optional completion. Returns once the list is found
 * empty.
 *
 * @cpu: CPU whose per-CPU cpu_stopper is serviced.
 */
static void cpu_stopper_thread(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	for (;;) {
		struct cpu_stop_work *work = NULL;
		struct cpu_stop_done *done;
		cpu_stop_fn_t fn;
		void *arg;
		int ret;

		/* Detach the first pending work item, if there is one. */
		raw_spin_lock_irq(&stopper->lock);
		if (!list_empty(&stopper->works)) {
			work = list_first_entry(&stopper->works,
						struct cpu_stop_work, list);
			list_del_init(&work->list);
		}
		raw_spin_unlock_irq(&stopper->lock);

		/* List was empty: nothing left to run. */
		if (!work)
			return;

		fn = work->fn;
		arg = work->arg;
		done = work->done;

		/* cpu stop callbacks must not sleep, make in_atomic() == T */
		preempt_count_inc();
		ret = fn(arg);
		if (done) {
			/* Only a non-zero result overwrites done->ret. */
			if (ret)
				done->ret = ret;
			cpu_stop_signal_done(done);
		}
		preempt_count_dec();
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
	}
}

第一个work执行完后，接着取下一个work继续执行，直到队列为空为止。