jason--liu/Blog

记一次僵尸进程分析过程

Opened this issue · 0 comments

背景

服务器上有一个虚机发现删不掉, ps命令查看发现qemu-kvm变成了zombie.

僵尸进程出现的原因, 一般是子进程退出时父进程没有回收子进程的资源, 这个时候子进程就会变成僵尸进程, 或者子进程根本没有发送SIGCHLD通知父进程回收

分析过程

查看进程状态

[root@node-3 ~]# cat /proc/22889/status
Name:   qemu-kvm
State:  Z (zombie)
Tgid:   22889
Ngid:   22985
Pid:    22889
PPid:   1
TracerPid:      0
Uid:    42436   42436   42436   42436
Gid:    36      36      36      36
FDSize: 0
Groups: 10 36 42400 42427 42436
NStgid: 22889
NSpid:  22889
NSpgid: 22807
NSsid:  22807
...

的确是Z state. 再查看22889此时的父进程是systemd(PPid为1). 查看当前系统systemd运行正常, 相关命令也能正常执行, 不存在“卡住”的情况. 怀疑是不是systemd只回收自己创建的进程, 而不回收托孤进程. 验证了下

#include <stdio.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Fork a child that blocks in pause() and choose, at compile time, whether
 * the parent ever reaps it.
 *
 * With the "#if 1" section enabled the parent prints its own pid and then
 * idles forever without calling waitpid(), so the child stays a zombie once
 * it terminates (until the parent itself is killed and the child is
 * re-parented).  With the section disabled the parent loops on waitpid() and
 * reports every state change until the child exits or is killed by a signal.
 */
int main(void)
{
        pid_t pid, wait_pid;
        int status;

        pid = fork();

        if (pid == -1) {
                perror("Cannot create new process");
                exit(1);
        } else if (pid == 0) {
                /* Child: announce the pid, then sleep until a signal arrives. */
                printf("child process id: %ld\n", (long) getpid());
                pause();
                _exit(0);
        } else {
#if 1 /* keep at 1 so the parent never reaps the child (child becomes a zombie) */
                printf("ppid:%ld\n", (long) getpid());
                /*
                 * Idle without consuming CPU.  The parent never waits for the
                 * child, which is what this variant is meant to demonstrate.
                 */
                for (;;)
                        pause();
#endif
                /* Reaping path: only reachable when the section above is disabled. */
                do {
                        wait_pid = waitpid(pid, &status, WUNTRACED | WCONTINUED);

                        if (wait_pid == -1) {
                                perror("cannot using waitpid function");
                                exit(1);
                        }

                        if (WIFEXITED(status))
                                printf("child process exites, status=%d\n", WEXITSTATUS(status));

                        if (WIFSIGNALED(status))
                                printf("child process is killed by signal %d\n", WTERMSIG(status));

                        if (WIFSTOPPED(status))
                                printf("child process is stopped by signal %d\n", WSTOPSIG(status));

                        if (WIFCONTINUED(status))
                                printf("child process resume running....\n");

                        /* Stop/continue reports are not terminal; keep waiting. */
                } while (!WIFEXITED(status) && !WIFSIGNALED(status));

                exit(0);
        }
}

实验发现, 托孤进程退出时, systemd正常情况下也会回收.
image

再查看22889的一些进程状态. 发现22889下面有子线程处于D state

[root@node-3 ~]# cat /proc/22889/task/22976/status
Name:   safe_timer
State:  D (disk sleep)
Tgid:   22889
Ngid:   0
Pid:    22976
PPid:   1
TracerPid:      6076
Uid:    42436   42436   42436   42436
Gid:    36      36      36      36
FDSize: 0
Groups: 10 36 42400 42427 42436
NStgid: 22889
NSpid:  22976
NSpgid: 22807
NSsid:  22807
Threads:        2
SigQ:   2/260720
SigPnd: 0000000000040000
ShdPnd: 0000000000004100
SigBlk: fffffffe7ffbfeff
SigIgn: 0000000000001000
SigCgt: 0000000180004243
CapInh: 0000000000000000
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
NoNewPrivs:     1
Seccomp:        2
Speculation_Store_Bypass:       not vulnerable
Cpus_allowed:   ffffffff,fffc0000
Cpus_allowed_list:      18-63
Mems_allowed:   0f
Mems_allowed_list:      0-3
voluntary_ctxt_switches:        11
nonvoluntary_ctxt_switches:     3

查看进程call trace信息

[root@node-3 ~]# cat /proc/22889/task/22976/stack
[<0>] __switch_to+0x6c/0x90
[<0>] exit_aio+0x104/0x120
[<0>] mmput+0x54/0x168
[<0>] vhost_dev_cleanup+0x290/0x2c0 [vhost]
[<0>] vhost_net_release+0x5c/0xd0 [vhost_net]
[<0>] __fput+0x9c/0x1d8
[<0>] ____fput+0x20/0x30
[<0>] task_work_run+0xb0/0xe0
[<0>] do_exit+0x3c0/0xa78
[<0>] do_group_exit+0x3c/0xd0
[<0>] get_signal+0x160/0x810
[<0>] do_notify_resume+0x15c/0x2f8
[<0>] work_pending+0x8/0x10
[<0>] 0xffffffffffffffff

看起来是aio没有返回, 然后schedule出去了.

看起来应该是子线程处于D state, 杀掉主线程时, 主线程就会变成zombie, 需要验证下.

实验验证

在本地实验, 通过fork来模拟托孤进程效果. 子进程中创建线程来读设备文件, 驱动中会将进程设置为TASK_UNINTERRUPTIBLE(D state)
应用程序:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <linux/unistd.h>

/* Return the calling thread's ID as reported by the raw gettid system call. */
static pid_t gettid(void)
{
	const long tid = syscall(SYS_gettid);

	return (pid_t)tid;
}

/*
 * Thread entry point: open /dev/memory and issue a single read on it.
 *
 * The companion driver's read handler sleeps in wait_event(), so this thread
 * is expected to block inside read() for the duration of the experiment.
 * If read() does return, the process/thread ids and the result are printed.
 *
 * param: unused.
 * Returns NULL in every case.
 */
static void *thread_fun(void *param)
{
	char buf[500] = {0};
	ssize_t size;
	int fd;

	(void)param;

	fd = open("/dev/memory", O_RDWR);
	if (fd == -1) {
		/* Without the device there is nothing to block on; do not read from fd -1. */
		perror("cannot open /dev/memory");
		return NULL;
	}
	printf("fd=%d\n", fd);

	size = read(fd, buf, sizeof buf);
	if (size == -1)
		perror("cannot read /dev/memory");

	printf("thread pid:%d, tid:%d pthread_self:%lu, fd=%d, size=%zd\n",
	       (int)getpid(), (int)gettid(), (unsigned long)pthread_self(), fd, size);

	close(fd);
	return NULL;
}

/*
 * Reproduce a zombie thread-group leader.
 *
 * The process forks.  The child starts one thread running thread_fun() (which
 * blocks in a read on /dev/memory) and then sleeps in pause().  The parent
 * idles forever and never waits for the child, so killing the parent leaves
 * the child to be re-parented.
 */
int main(void)
{
	pthread_t tid1;
	pid_t pid;
	int ret;

	printf("thread pid:%d, tid:%d pthread_self:%lu\n",
	       (int)getpid(), (int)gettid(), (unsigned long)pthread_self());
	/* Flush before fork() so buffered output is not emitted by both processes. */
	fflush(stdout);

	pid = fork();

	if (pid == -1) {
		perror("Cannot create new process");
		exit(1);
	} else if (pid == 0) {
		printf("child process id: %ld\n", (long) getpid());

		/*
		 * pthread_create() reports failure by returning an error number;
		 * it neither returns -1 nor sets errno, so perror() is not usable.
		 */
		ret = pthread_create(&tid1, NULL, thread_fun, NULL);
		if (ret != 0) {
			fprintf(stderr, "cannot create new thread: %s\n", strerror(ret));
			exit(1);
		}

		/* Thread-group leader sleeps here while the worker thread blocks in read(). */
		pause();
		_exit(0);
	}

	/* Parent: stay alive without reaping the child and without burning CPU. */
	for (;;)
		pause();
}

驱动程序部分实现:

// ...
// ...
/* Wait queue on which readers sleep until `flag` becomes non-zero. */
static DECLARE_WAIT_QUEUE_HEAD(wq);
/* Wake-up condition for `wq`; reset to 0 at the start of every read. */
static volatile int flag = 0;

/*
 * read() handler of the test device.
 *
 * Every call first sleeps on `wq` until `flag` becomes non-zero.  The sleep
 * uses wait_event(), which puts the caller in TASK_UNINTERRUPTIBLE (D state);
 * that is the condition the experiment needs.  Swapping in
 * wait_event_interruptible() or wait_event_killable() would change that.
 *
 * After wake-up, at most one byte of memory_buffer is handed to user space:
 *   - returns 1 when the byte was copied (*f_pos is advanced from 0 to 1),
 *   - returns 0 (EOF) when *f_pos is already non-zero or count is 0,
 *   - returns -EFAULT when the user buffer cannot be written.
 */
ssize_t memory_read(struct file *filp, char *buf, size_t count, loff_t *f_pos)
{
    printk("<1>going to sleep\n");
    flag = 0;
    wait_event(wq, flag != 0);
    printk("<1>Reading from memory module\n");

    /* Nothing to deliver: do not touch the user buffer at EOF or for a 0-byte read. */
    if (*f_pos != 0 || count == 0)
        return 0;

    /* copy_to_user() returns the number of bytes it could NOT copy. */
    if (copy_to_user(buf, memory_buffer, 1))
        return -EFAULT;

    *f_pos += 1;
    return 1;
}

// ...
// ...

编译执行, 父进程pid是2203, 子进程是2204
image
杀掉父进程, 让子进程2204托孤给systemd.
image
查看2204子线程状态是处于D state的.
image
2204这个时候处于S state.
image
尝试杀掉2204, 再来查看status
image
可以看到进程变成了zombie 状态.

疑问: 为什么当taskgroup中存在D的task时, taskgroup leader被kill会造成 leader变成zombie?

原因分析

进程被kill时, 会走到do_exit

/*
 * Abridged excerpt of the kernel's do_exit(), the path a killed task goes
 * through.  Parts that are not relevant here are elided ("// ...").
 */
void __noreturn do_exit(long code)
{
  // ...
	taskstats_exit(tsk, group_dead);
  // Release the task's various resources
	exit_mm();

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	if (group_dead)
		disassociate_ctty(1);
	exit_task_namespaces(tsk);
	exit_task_work(tsk);
	exit_thread(tsk);
	exit_umh(tsk);
  // ...
  // exit_notify() is where the task's exit_state (zombie or dead) is decided
  	exit_notify(tsk, group_dead);

}

然后走到exit_notify

/*
 * Abridged excerpt of the kernel's exit_notify(): decides whether the exiting
 * task is reaped immediately (EXIT_DEAD) or left as a zombie (EXIT_ZOMBIE).
 * Elided parts are marked "// ..."; the leading branch of the if/else chain
 * is among them.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);
	// ...
	// Group leader whose thread group is not empty: autoreap ends up false.
	else if (thread_group_leader(tsk)) {
		// thread_group_empty() is false here, so && short-circuits and
		// do_notify_parent() is not called: the parent is not notified.
		autoreap = thread_group_empty(tsk) &&
			do_notify_parent(tsk, tsk->exit_signal);
	} else {
		autoreap = true;
	}
	// With autoreap false the exit state becomes EXIT_ZOMBIE.
	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
	if (tsk->exit_state == EXIT_DEAD)
		list_add(&tsk->ptrace_entry, &dead);
	// ...
	// Release every task that was collected on the local "dead" list.
	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

}

执行完exit_notify之后, 主线程状态设置成了EXIT_ZOMBIE, 成为了zombie, 而且没有通知父进程. 而子线程因为处于D state, 没法返回用户空间, 所以收到的信号一直处于pending状态, 得不到处理.
主线程资源什么时候被回收?
当D task从内核空间返回会检查信号位图, 如果收到信号,会有相关处理操作, 这里kill主线程, D task返回后也会收到SIGKILL, 也会走到do_exit, 然后走到exit_notify, 这一次会进入release_task

/*
 * Abridged excerpt of the kernel's release_task().  The part shown handles
 * the case where the task being released (p) is not the group leader, while
 * the leader is already a zombie and its thread group is now empty: the
 * leader's parent is then notified through do_notify_parent().
 * The setup of `leader` and `zap_leader` is elided ("//...").
 */
void release_task(struct task_struct *p)
{
    struct task_struct *leader;
    //...
    	if (leader != p && thread_group_empty(leader)
			&& leader->exit_state == EXIT_ZOMBIE) {
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}
	write_unlock_irq(&tasklist_lock);
	release_thread(p);
	// Free the task_struct
	call_rcu(&p->rcu, delayed_put_task_struct);

  // ...
}

这里会进入if判断, 然后调用do_notify_parent发送SIGCHLD来通知主线程的parent来"收尸". 子线程然后走到delayed_put_task_struct来释放task_struct.
父进程收到SIGCHLD, 会调用waitpid(), 然后又会走到release_task(wait_task_zombie->release_task)来释放主线程的task_struct.
最后,主线程的躯壳也被释放了.