First written: 2023-11-23
64-bit Armv8: the default kernel stack size is 0x4000 (16 KB).
IRQ Stack: stack space a process uses while handling interrupts (its size may be larger than 0x4000).
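Where the 0x4000 figure comes from, as a simplified sketch (assumption: based on arch/arm64/include/asm/memory.h with KASAN disabled; KASAN and 64K-page configs enlarge these values):
#define MIN_THREAD_SHIFT	14			/* 2^14 = 16 KB */
#define THREAD_SHIFT		MIN_THREAD_SHIFT
#define THREAD_SIZE		(1UL << THREAD_SHIFT)	/* 0x4000: per-task kernel stack */
#define IRQ_STACK_SIZE		THREAD_SIZE		/* per-CPU IRQ stack */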
Countermeasures against stack overflow, out-of-bounds access, and use-after-free: stack canary, compiler options, debugging, code review, and analysis of the stack contents.
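A user-space illustration of the stack-canary countermeasure (hypothetical example; kernel builds get the same protection via CONFIG_STACKPROTECTOR / -fstack-protector-strong):
/* Build with: gcc -fstack-protector-strong canary_demo.c
 * The compiler places a canary between buf and the saved return address;
 * overflowing buf corrupts the canary and the process aborts with
 * "*** stack smashing detected ***" instead of running with corrupted state. */
#include <string.h>

static void copy_name(const char *src)
{
	char buf[16];

	strcpy(buf, src);	/* unbounded copy: the classic stack-overflow bug */
}

int main(int argc, char **argv)
{
	copy_name(argc > 1 ? argv[1] : "this argument is deliberately longer than sixteen bytes");
	return 0;
}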
get_current: macro for obtaining the address of the currently running process's task_struct. It reads sp_el0 (which the kernel uses to hold the current task pointer while running in kernel mode) and returns the value cast to struct task_struct *.
During a context switch, X1 holds the task_struct address of the process that will occupy the CPU next; sp_el0 is then updated with that address, so from this point on get_current() resolves to the newly running process.
//https://elixir.bootlin.com/linux/v5.15.30/source/arch/arm64/include/asm/current.h#L15
#ifndef __ASM_CURRENT_H
#define __ASM_CURRENT_H
#include <linux/compiler.h>
#ifndef __ASSEMBLY__
struct task_struct;
/*
* We don't use read_sysreg() as we want the compiler to cache the value where
* possible.
*/
static __always_inline struct task_struct *get_current(void)
{
	unsigned long sp_el0;

	asm ("mrs %0, sp_el0" : "=r" (sp_el0));

	return (struct task_struct *)sp_el0;
}
#define current get_current()
#endif /* __ASSEMBLY__ */
#endif /* __ASM_CURRENT_H */
task_struct: also called the task descriptor (architecture-independent). It stores information such as the process name, pid, and parent/child relationships.
//https://elixir.bootlin.com/linux/v5.15.30/source/include/linux/sched.h
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	unsigned int			__state;

#ifdef CONFIG_PREEMPT_RT
	/* saved state for "spinlock sleepers" */
	unsigned int			saved_state;
#endif

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void				*stack;
	refcount_t			usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
	unsigned int			ptrace;

#ifdef CONFIG_SMP
	int				on_cpu;
	struct __call_single_node	wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* Current CPU: */
	unsigned int			cpu;
#endif
	unsigned int			wakee_flips;
	unsigned long			wakee_flip_decay_ts;
	struct task_struct		*last_wakee;
	...
};
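A usage sketch of the current macro together with a few of these fields (illustrative kernel-side code, not from the notes):
#include <linux/sched.h>
#include <linux/printk.h>

static void dump_current_task(void)
{
	/* current expands to get_current() and points at the running task */
	pr_info("comm=%s pid=%d parent=%d\n",
		current->comm, current->pid, current->parent->pid);
}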
thread_info is placed at the start address of task_struct (see the comment in the struct above). The architecture-specific thread_struct below holds cpu_context, the register set saved across a context switch.
//https://elixir.bootlin.com/linux/v5.15.30/source/arch/arm64
struct thread_struct {
	struct cpu_context	cpu_context;	/* cpu context */

	/*
	 * Whitelisted fields for hardened usercopy:
	 * Maintainers must ensure manually that this contains no
	 * implicit padding.
	 */
	struct {
		unsigned long	tp_value;	/* TLS register */
		unsigned long	tp2_value;
		struct user_fpsimd_state fpsimd_state;
	} uw;
	...
};

struct cpu_context {
	unsigned long	x19;
	unsigned long	x20;
	unsigned long	x21;
	unsigned long	x22;
	unsigned long	x23;
	unsigned long	x24;
	unsigned long	x25;
	unsigned long	x26;
	unsigned long	x27;
	unsigned long	x28;
	unsigned long	fp;
	unsigned long	sp;
	unsigned long	pc;
};
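A small sketch of how this saved context is used (the accessor names here are made up; arm64 provides equivalent thread_saved_pc/thread_saved_sp helpers):
#include <linux/sched.h>

/* For a task that is currently switched out, the registers it will resume
 * with are the ones saved into thread.cpu_context at its last context switch. */
static unsigned long sketch_saved_pc(struct task_struct *tsk)
{
	return tsk->thread.cpu_context.pc;	/* address where it resumes */
}

static unsigned long sketch_saved_sp(struct task_struct *tsk)
{
	return tsk->thread.cpu_context.sp;	/* its kernel stack pointer */
}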
Related kernel code (see the link):
// https://elixir.bootlin.com/linux/v5.10.60/source/arch/arm64/include/asm/ptrace.h
// checks the saved register state (pstate) to tell user mode from kernel mode
#define user_mode(regs)	\
	(((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)

#define processor_mode(regs) \
	((regs)->pstate & PSR_MODE_MASK)
//https://elixir.bootlin.com/linux/v5.10.60/source/arch/arm64/kernel/traps.c
static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
{
	unsigned long addr = instruction_pointer(regs);
	char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
	int i;

	if (user_mode(regs))
		return;
	...
	printk("%sCode: %s\n", lvl, str);
}
fork() call flow (a user-side sketch follows the list):
// User mode
fork()
// Kernel mode
__arm64_sys_clone
kernel_clone
copy_process
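A user-side sketch of the entry point of this flow (hedged: on Linux, glibc's fork() is built on the clone syscall):
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();	/* enters __arm64_sys_clone -> kernel_clone */

	if (pid == 0) {
		printf("child:  pid=%d\n", getpid());
	} else if (pid > 0) {
		/* the parent receives the pid that kernel_clone() returns via pid_vnr() */
		printf("parent: child pid=%d\n", pid);
		wait(NULL);
	} else {
		perror("fork");
	}
	return 0;
}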
__arm64_sys_clone: calls kernel_clone.
//https://elixir.bootlin.com/linux/v5.15.30/source/kernel/fork.c
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
{
	struct kernel_clone_args args = {
		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
		.pidfd		= parent_tidptr,
		.child_tid	= child_tidptr,
		.parent_tid	= parent_tidptr,
		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
		.stack		= newsp,
		.tls		= tls,
	};

	return kernel_clone(&args);
}
kernel_clone: creates the new process and returns its pid.
//https://elixir.bootlin.com/linux/v5.15.30/source/kernel/fork.c
pid_t kernel_clone(struct kernel_clone_args *args)
{
	u64 clone_flags = args->flags;
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	pid_t nr;

	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
	 * field in struct clone_args and it still doesn't make sense to have
	 * them both point at the same memory location. Performing this check
	 * here has the advantage that we don't need to have a separate helper
	 * to check for legacy clone().
	 */
	if ((args->flags & CLONE_PIDFD) &&
	    (args->flags & CLONE_PARENT_SETTID) &&
	    (args->pidfd == args->parent_tid))
		return -EINVAL;

	/*
	 * Determine whether and which event to report to ptracer. When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if (args->exit_signal != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
	add_latent_entropy();

	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, p);

	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	put_pid(pid);
	return nr;
}
copy_process: copies the parent's process information into the new task.
//https://elixir.bootlin.com/linux/v5.15.30/source/kernel/fork.c
static __latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
	int pidfd = -1, retval;
	struct task_struct *p;
	struct multiprocess_signals delayed;
	struct file *pidfile = NULL;
	u64 clone_flags = args->flags;
	struct nsproxy *nsp = current->nsproxy;
	...
	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
	sigemptyset(&delayed.signal);
	INIT_HLIST_NODE(&delayed.node);

	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current, node);
	if (!p)
		goto fork_out;
	if (args->io_thread) {
		/*
		 * Mark us an IO worker, and block any signal that isn't
		 * fatal or STOP
		 */
		p->flags |= PF_IO_WORKER;
		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
	}

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
	if (retval)
		goto bad_fork_cleanup_io;
	...
}
Termination flow when exit() is called (a user-side sketch follows the list):
// User mode
exit
// Kernel mode
__arm64_sys_exit_group
do_group_exit
do_exit
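A minimal user-side sketch (hedged: glibc's exit() runs atexit handlers and flushes stdio, then issues the exit_group syscall shown below):
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* the raw exit_group syscall is what lands in __arm64_sys_exit_group */
	syscall(SYS_exit_group, 0);
	return 0;	/* never reached */
}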
Termination flow when a kill signal is received (a user-side sketch follows the list):
// User mode: the sender calls kill()
// Kernel mode: the receiving process exits on its way back to user space
el0_svc
do_notify_resume
do_signal
get_signal
do_group_exit
do_exit
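A minimal user-side sketch (standard POSIX kill(); the target pid here is illustrative):
#include <signal.h>
#include <sys/types.h>

/* Sending SIGKILL makes the target enter the do_signal/do_group_exit path
 * above the next time it heads back toward user space. */
int terminate(pid_t target)
{
	return kill(target, SIGKILL);	/* 0 on success, -1 and errno on failure */
}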
__arm64_sys_exit_group, do_group_exit: both end up calling do_exit.
//https://elixir.bootlin.com/linux/v5.15.30/source/kernel/exit.c
SYSCALL_DEFINE1(exit_group, int, error_code)
{
	do_group_exit((error_code & 0xff) << 8);
	/* NOTREACHED */
	return 0;
}

void
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;
	...
	do_exit(exit_code);
	/* NOTREACHED */
}
do_exit: terminates the process and releases its resources.
//https://elixir.bootlin.com/linux/v5.15.30/source/kernel/exit.c
void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	/*
	 * We can get here from a kernel oops, sometimes with preemption off.
	 * Start by checking for critical errors.
	 * Then fix up important state like USER_DS and preemption.
	 * Then do everything else.
	 */
	WARN_ON(blk_needs_flush_plug(tsk));

	/* calling do_exit() from interrupt context is a fatal error */
	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	...
	if (unlikely(tsk->flags & PF_EXITING)) {
		pr_alert("Fixing recursive fault but reboot is needed!\n");
		futex_exit_recursive(tsk);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	io_uring_files_cancel();
	exit_signals(tsk);  /* sets PF_EXITING */

	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk->mm);
	acct_update_integrals(tsk);
	...
	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);
	...
	exit_mm();

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	...
	do_task_dead();
}
Kernel thread: a process that runs only in kernel space.
Runs in the background, handling work such as system memory management or power control.
Does not interact with user space; all of its behavior is managed directly by the kernel.
Typically created when the system boots and destroyed when the system shuts down.
//https://elixir.bootlin.com/linux/v5.10.60/source/kernel/workqueue.c
static struct worker *create_worker(struct worker_pool *pool)
{
	struct worker *worker = NULL;
	...
	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
					      "kworker/%s", id_buf);
	/* the created kernel thread will run worker_thread() */
	...
}

static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct worker_pool *pool = worker->pool;

	/* tell the scheduler that this is a workqueue worker */
	set_pf_worker(true);
woke_up:
	raw_spin_lock_irq(&pool->lock);
	...
}
Kernel thread creation flow (a kthread API usage sketch follows the list):
// kernel thread creation request
kthread_create
kthread_create_on_node
__kthread_create_on_node
// scheduling (kthreadd picks up the queued request)
// kthreadd process
kthreadd
create_kthread
kernel_thread
kernel_clone
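As a usage reference, a minimal sketch of creating a kernel thread through the kthread API that feeds this path (module boilerplate and names like demo_fn are made up for illustration):
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/module.h>

static struct task_struct *demo_task;

static int demo_fn(void *data)
{
	/* runs entirely in kernel space, like the worker_thread above */
	while (!kthread_should_stop())
		msleep(1000);		/* placeholder background work */
	return 0;
}

static int __init demo_init(void)
{
	/* kthread_run = kthread_create (queued to kthreadd) + wake_up_process */
	demo_task = kthread_run(demo_fn, NULL, "demo_kthread");
	return IS_ERR(demo_task) ? PTR_ERR(demo_task) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");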
kthreadd process
kthreadd: handler function that creates the requested kernel threads.
ksoftirqd process
: process for handling Soft IRQs (one per CPU).
run_ksoftirqd: executes the pending Soft IRQ services; woken from __do_softirq() when work is deferred.
threaded IRQ (see the request_threaded_irq() sketch below)
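For the threaded IRQ item, a hedged sketch of the request_threaded_irq() pattern (irq number, device pointer, and handler names are illustrative):
#include <linux/interrupt.h>

/* Primary handler: runs in hard-IRQ context, does only the urgent part. */
static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	return IRQ_WAKE_THREAD;		/* defer the rest to the IRQ thread */
}

/* Thread handler: runs in a dedicated kernel thread (irq/<nr>-<name>). */
static irqreturn_t demo_irq_thread(int irq, void *dev_id)
{
	/* slow work (sleeping allowed) goes here */
	return IRQ_HANDLED;
}

/* In probe():
 *	ret = request_threaded_irq(irq, demo_hardirq, demo_irq_thread,
 *				   IRQF_ONESHOT, "demo", dev);
 */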