CFS 스케줄러 분석

EEEFFEE·2023년 12월 7일

raspberrypi4-kernel

목록 보기

5/12

23.12.07 최초 작성

1. CFS 스케줄러

고정된 시간 간격에서 시스템의 스레드가 적어도 1번은 실행되는 스케줄링 기법
현재 대부분의 리눅스 시스템에서 채택하고 있음
nice값을 바탕으로 우선도를 결정하며 낮을수록 높은 우선도를 가짐
- Non-real-time-priority : 100 ~ 139의 범위를 가짐 (default = 120)
- Real-time priority : 0 ~ 99의 범위를 가짐

2. sched 자료구조 분석

/include/linux/sched.h
sched_entity
- task_struct에서 관리되며 해당 자료구조를 통해 스케줄러에 스케줄링 됨
- struct load_weight는 우선순위를 나타내며 nice값에 의해 결정
- u64 vruntime은 최근 실행된 시간을 저장함

struct sched_entity {
	/* For load-balancing: */
	struct load_weight		load;
	struct rb_node			run_node;
	struct list_head		group_node;
	unsigned int			on_rq;

	u64				exec_start;
	u64				sum_exec_runtime;
	u64				vruntime;
	u64				prev_sum_exec_runtime;

	u64				nr_migrations;

	struct sched_statistics		statistics;

	...
};

task_struct
- 프로세스를 관리하는 자료구조
- thread_info에서 스레드의 레지스터, sp, pc 등을 저장
- state에서 프로세스 상태를 나타냄
- sched_entity se에서 스케줄링에 필요한 정보 저장

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	/* -1 unrunnable, 0 runnable, >0 stopped: */
	volatile long			state;

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void				*stack;
	refcount_t			usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
	unsigned int			ptrace;

#ifdef CONFIG_SMP
	int				on_cpu;
	struct __call_single_node	wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* Current CPU: */
	unsigned int			cpu;
#endif
	unsigned int			wakee_flips;
	unsigned long			wakee_flip_decay_ts;
	struct task_struct		*last_wakee;

	/*
	 * recent_used_cpu is initially set as the last CPU used by a task
	 * that wakes affine another task. Waker/wakee relationships can
	 * push tasks around a CPU where each wakeup moves to the next one.
	 * Tracking a recently used CPU allows a quick search for a recently
	 * used CPU that may be idle.
	 */
	int				recent_used_cpu;
	int				wake_cpu;
#endif
	int				on_rq;

	int				prio;
	int				static_prio;
	int				normal_prio;
	unsigned int			rt_priority;

	const struct sched_class	*sched_class;
	struct sched_entity		se;
	struct sched_rt_entity		rt;
	...
};

/kernel/sched/sched.h
struct rq
- 런큐를 관리하는 자료구조로 서브 런큐 cfs_rq cfs, rt_rq rt, dl_rq dl를 관리함

struct rq {
	/* runqueue lock: */
	raw_spinlock_t		__lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned int		nr_running;
#ifdef CONFIG_NUMA_BALANCING
	unsigned int		nr_numa_running;
	unsigned int		nr_preferred_running;
	unsigned int		numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
	unsigned long		last_blocked_load_update_tick;
	unsigned int		has_blocked_load;
	call_single_data_t	nohz_csd;
#endif /* CONFIG_SMP */
	unsigned int		nohz_tick_stopped;
	atomic_t		nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_SMP
	unsigned int		ttwu_pending;
#endif
	u64			nr_switches;

#ifdef CONFIG_UCLAMP_TASK
	/* Utilization clamp values based on CPU's RUNNABLE tasks */
	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
	unsigned int		uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endif

	struct cfs_rq		cfs;
	struct rt_rq		rt;
	struct dl_rq		dl;
    ...
};

cfs_rq
- cfs 스케줄링을 관리하는 자료구조
- unsigned int nr_running : 해당 런큐의 스레드 갯수
- load_weight load : 해당 런큐의 weight 총 합
- u64 min_vruntime : 최소 실행 시간, red-black트리(rb_root_cached tasks_timeline)로 프로세스를 관리하며 이 때 정렬 기준은 vruntime
- sched_entity *curr : 현재 실행 중인 프로세스의 정보 저장
- struct sched_entity *next : 교체할 프로세스의 정보 저장

struct cfs_rq {
	struct load_weight	load;
	unsigned int		nr_running;
	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
	unsigned int		idle_nr_running;   /* SCHED_IDLE */
	unsigned int		idle_h_nr_running; /* SCHED_IDLE */

	s64			avg_vruntime;
	u64			avg_load;

	u64			exec_clock;
	u64			min_vruntime;
#ifdef CONFIG_SCHED_CORE
	unsigned int		forceidle_seq;
	u64			min_vruntime_fi;
#endif

#ifndef CONFIG_64BIT
	u64			min_vruntime_copy;
#endif

	struct rb_root_cached	tasks_timeline;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity	*curr;
	struct sched_entity	*next;

	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_avg;
	} removed;

	...
};

3. CFS 커널 분석

kernel/sched/fair.c
- sched_slice() : timeslice를 설정하는 함수
  - __sched_period() : 만약 프로세스의 스레드 수가 일정 수 보다 크다면 period를 조정
  - __calc_delta() : 각 프로세스의 weight에 따라 timeslice를 계산
- pick_next_task_fair() : context switching할 프로세스 선택 (red-black tree)
- update_curr() : __schedule()로부터 호출되어 curr에 저장된 sched_entity정보 수정
- task_tick_fair() : curr에 저장된 sched_entity와 timeslice를 업데이트
__sched_period()에서 period 조정
__calc_delta()에서 weight에 따라 timeslice 계산
task_tick_fair()에서 timeslice를 업데이트

/kernel/sched/rt.c
- task_tick_rt() : 프로세스의 sched_rt_entity정보에 따라 프로세스를 context switching 하는 함수
- dequeue_task_rt() : 프로세스의 sched_rt_entity정보에 따라 런큐에서 제거할 프로세스 제거
task_tick_rt()
dequeue_task_rt()

4. CFS 로드밸런싱

처리해야 할 작업/요청을 컴퓨터 리소스에 분산하는 것
로드밸런싱 시점
- fork, exec : 프로세스 생성 시 분산 (가장 한가한 cpu)
- wake up : 가장 한가한 cpu가 프로세스를 깨우고 해당 cpu가 작업 실행
- idle : 적당한 작업을 현재 cpu로 할당
- periodic : 주기적으로 작업 분산

4.1 로드밸런싱 기법

global queue
- 하나의 대기큐에 프로세스를 저장하고 모든 cpu에 할당하는 방법
- cpu 활용률이 좋고 모든 cpu에 균등하게 분배할 수 있음
- global queue를 lock해줘야 하며 캐시의 지역성이 떨어짐
Per-cpu queue
- 적용하기 쉽고 캐시의 지역성을 높일 수 있음
- 모든 cpu에 불균등하게 작업이 배분될 수 있음

4.2 로드밸런싱 기준

각 cpu에 할당 된 프로세스 갯수 : high priority queue, low priority queue
프로세스의 weight :
CFS load balancer (프로세스의 weight + cpu의 활용률) : 멀티스레드로 인한 cpu에 할당된 스레드 갯수 차이 발생, 각 스레드의 cpu할당 시간을 설정 불가

4.3.1 Group Scheduling

프로세스의 스레드 단위로 작업 분배
task_group : 관련 자료구조
---c

// linux/sched/sched.h
struct task_group {
	struct cgroup_subsys_state css;

	...
    
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity	**rt_se;
	struct rt_rq		**rt_rq;

	struct rt_bandwidth	rt_bandwidth;
#endif

	struct rcu_head		rcu;
	struct list_head	list;

	struct task_group	*parent;
	struct list_head	siblings;
	struct list_head	children;
    ...
}

4.3.2 CFS bandwidth control 기능

각 스레드마다 cpu할당 시간 조정 가능
설정값
- cpu.cfs_period_us : cpu 주기(ms)
- cpu.cfs_quota_us : 프로세스를 사용할 시간(ms) ( < cpu.cfs_period_us)

//	cgroup을 이용하여 CPU 25% 제한
sudo apt install -y stress
sudo su
cd /sys/fs/cgroup/cpu

mkdir kdt
cat cpu.cfs_period_us
$100000
echo 25000 > cpu.cfs_quota_us
// 100000/4 = 25000

//	현재 쉘 프로세스 등록해 결과 확인
echo $$ > tasks 
$ stress --cpu 1 
$ htop

4.3.3 scheduling domain + hierarchical load balancing

하드웨어의 변화에 맞춰 로드밸런싱할 수 있는 기능

5. 로드밸런싱 커널 분석

kernel/sched/fair.c
- enqueue_task_fair() : 스레드를 wake up 하기 위해 런큐에 sched_entitiy를 넣는 함수
- task_tick_fair() : curr에 저장된 sched_entity와 timeslice를 업데이트
- check_preempt_tick() : entity_tick()에 의해 호출되어 선점이 필요한지 체크
- load_balance() : 실제 로드밸런스를 실행하는 함수
익셉션 핸들러를 종료하고 스레드 wake up을 위해 enqueue_task_fair() 호출하는 모습
task_tick_fair()
check_preempt_tick()
load_balance()가 실행되는 모습