1  /* SPDX-License-Identifier: GPL-2.0 */
2  /*
3   * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4   *
5   * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
6   * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
7   * Copyright (c) 2022 David Vernet <dvernet@meta.com>
8   */
9  #ifndef _LINUX_SCHED_EXT_H
10  #define _LINUX_SCHED_EXT_H
11  
12  #ifdef CONFIG_SCHED_CLASS_EXT
13  
14  #include <linux/llist.h>
15  #include <linux/rhashtable-types.h>
16  
17  enum scx_public_consts {
18  	SCX_OPS_NAME_LEN	= 128,
19  
20  	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
21  	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
22  };
23  
24  /*
 * DSQ (dispatch queue) IDs are 64-bit values of the following format:
26   *
27   *   Bits: [63] [62 ..  0]
28   *         [ B] [   ID   ]
29   *
30   *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
31   *   ID: 63 bit ID
32   *
33   * Built-in IDs:
34   *
35   *   Bits: [63] [62] [61..32] [31 ..  0]
36   *         [ 1] [ L] [   R  ] [    V   ]
37   *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    R: Not used by any of the IDs defined below (presumably reserved).
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
41   */
enum scx_dsq_id_flags {
	/* bit 63: set for IDs of built-in DSQs */
	SCX_DSQ_FLAG_BUILTIN	= 1ULL << 63,
	/* bit 62: set for LOCAL_ON DSQ IDs */
	SCX_DSQ_FLAG_LOCAL_ON	= 1ULL << 62,

	/* pre-defined built-in IDs */
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0x0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 0x1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 0x2,

	/* LOCAL_ON base ID; the low 32 bits of such an ID carry a CPU number */
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_LOCAL_ON | SCX_DSQ_FLAG_BUILTIN,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffULL,
};
52  
53  /*
54   * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
55   * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
56   * buffer between the scheduler core and the BPF scheduler. See the
57   * documentation for more details.
58   */
struct scx_dispatch_q {
	raw_spinlock_t		lock;	/* presumably the "DSQ lock" guarding p->scx.dsq_flags - confirm */
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;	/* NOTE(review): looks like the queued task count - confirm */
	u32			seq;	/* used by BPF iter */
	u64			id;	/* DSQ ID, see enum scx_dsq_id_flags for the bit layout */
	struct rhash_head	hash_node; /* rhashtable linkage; the table is not defined in this header */
	struct llist_node	free_node; /* llist linkage; NOTE(review): name suggests deferred freeing - confirm */
	struct rcu_head		rcu;	/* NOTE(review): presumably for RCU-deferred release - confirm */
};
70  
/* sched_ext_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	/*
	 * Built from an unsigned constant: "1 << 31" shifts into the sign bit
	 * of int, which is undefined in C and yields a negative enumerator
	 * that sign-extends when combined with 64-bit values. The 32-bit
	 * pattern stored in the u32 flags word is unchanged.
	 */
	SCX_TASK_CURSOR		= 1U << 31, /* iteration cursor, not a task */
};
83  
/* (sched_ext_entity.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT */
enum scx_task_state {
	SCX_TASK_NONE		= 0,	/* ops.init_task() has not been called yet */
	SCX_TASK_INIT		= 1,	/* ops.init_task() succeeded, task can still be cancelled */
	SCX_TASK_READY		= 2,	/* fully initialized, not in sched_ext */
	SCX_TASK_ENABLED	= 3,	/* fully initialized, in sched_ext */

	/* number of states above; keep as the final enumerator */
	SCX_TASK_NR_STATES,
};
93  
/* sched_ext_entity.dsq_flags */
enum scx_ent_dsq_flags {
	/* the task sits on the priority queue of a dsq */
	SCX_TASK_DSQ_ON_PRIQ	= 0x1,
};
98  
99  /*
 * Mask bits for sched_ext_entity.kf_mask. Not all kfuncs can be called from
101   * everywhere and the following bits track which kfunc sets are currently
102   * allowed for %current. This simple per-task tracking works because SCX ops
103   * nest in a limited way. BPF will likely implement a way to allow and disallow
104   * kfuncs depending on the calling context which will replace this manual
105   * mechanism. See scx_kf_allow().
106   */
enum scx_kf_mask {
	/* no bit set: sleepable and not rq locked */
	SCX_KF_UNLOCKED		= 0x00,

	/* ops.cpu_release(); ENQUEUE and DISPATCH may be nested inside it */
	SCX_KF_CPU_RELEASE	= 0x01,
	/* ops.dispatch(); ops.dequeue (in REST) may be nested inside it */
	SCX_KF_DISPATCH		= 0x02,
	/* ops.enqueue() and ops.select_cpu() */
	SCX_KF_ENQUEUE		= 0x04,
	/* ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 0x08,
	/* other rq-locked operations */
	SCX_KF_REST		= 0x10,

	/* composite masks */
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  __SCX_KF_TERMINAL,
};
121  
enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR	= 0x1,

	/* the upper 16 bits may be used for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT	= 16,
};
128  
/*
 * List linkage bundled with a flags word and a private word. Embedded in
 * struct sched_ext_entity as ->dsq_list.
 */
struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;		/* NOTE(review): presumably enum scx_dsq_lnode_flags bits - confirm */
	u32			priv;		/* can be used by iter cursor */
};
134  
135  /*
136   * The following is embedded in task_struct and contains all fields necessary
137   * for a task to be scheduled by SCX.
138   */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;		/* NOTE(review): presumably the DSQ the task is queued on - confirm */
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;	/* NOTE(review): presumably relates to scx_dispatch_q.seq - confirm */
	u32			dsq_flags;	/* scx_ent_dsq_flags, protected by DSQ lock */
	u32			flags;		/* scx_ent_flags, protected by rq lock */
	u32			weight;
	s32			sticky_cpu;
	s32			holding_cpu;
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;	/* a reset is requested via SCX_TASK_RESET_RUNNABLE_AT */

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	/* NOTE(review): "ddsp" presumably stands for direct dispatch - confirm */
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dispatch() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
	 * but can also be modified directly by the BPF scheduler. Modifying it
	 * while a task is queued on a dsq may mangle the ordering and is not
	 * recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events is reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	/* must be the last field, see init_scx_entity() */
	struct list_head	tasks_node;
};
205  
/*
 * Declared here and defined outside this header. When CONFIG_SCHED_CLASS_EXT
 * is disabled, empty inline stubs with the same signatures are provided
 * instead (see the #else branch below).
 */
void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
208  
209  #else	/* !CONFIG_SCHED_CLASS_EXT */
210  
/* Empty stub for builds without CONFIG_SCHED_CLASS_EXT; @p is ignored. */
static inline void sched_ext_free(struct task_struct *p) {}
/* Empty stub for builds without CONFIG_SCHED_CLASS_EXT; both arguments are ignored. */
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
213  
214  #endif	/* CONFIG_SCHED_CLASS_EXT */
215  #endif	/* _LINUX_SCHED_EXT_H */
216