1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include "cpuset-internal.h"
4 
5 /*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
7  */
struct cpuset_remove_tasks_struct {
	struct work_struct work;	/* work item handed to schedule_work() */
	struct cpuset *cs;		/* cpuset to drain; its css reference is dropped by the work function */
};
12 
13 /*
14  * Frequency meter - How fast is some event occurring?
15  *
16  * These routines manage a digitally filtered, constant time based,
17  * event frequency meter.  There are four routines:
18  *   fmeter_init() - initialize a frequency meter.
19  *   fmeter_markevent() - called each time the event happens.
20  *   fmeter_getrate() - returns the recent rate of such events.
21  *   fmeter_update() - internal routine used to update fmeter.
22  *
23  * A common data structure is passed to each of these routines,
24  * which is used to keep track of the state required to manage the
25  * frequency meter and its digital filter.
26  *
27  * The filter works on the number of events marked per unit time.
28  * The filter is single-pole low-pass recursive (IIR).  The time unit
29  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
30  * simulate 3 decimal digits of precision (multiplied by 1000).
31  *
32  * With an FM_COEF of 933, and a time base of 1 second, the filter
33  * has a half-life of 10 seconds, meaning that if the events quit
34  * happening, then the rate returned from the fmeter_getrate()
35  * will be cut in half each 10 seconds, until it converges to zero.
36  *
37  * It is not worth doing a real infinitely recursive filter.  If more
38  * than FM_MAXTICKS ticks have elapsed since the last filter event,
39  * just compute FM_MAXTICKS ticks worth, by which point the level
40  * will be stable.
41  *
42  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
43  * arithmetic overflow in the fmeter_update() routine.
44  *
45  * Given the simple 32 bit integer arithmetic used, this meter works
46  * best for reporting rates between one per millisecond (msec) and
47  * one per 32 (approx) seconds.  At constant rates faster than one
48  * per msec it maxes out at values just under 1,000,000.  At constant
49  * rates between one per msec, and one per second it will stabilize
50  * to a value N*1000, where N is the rate of events per second.
51  * At constant rates between one per second and one per 32 seconds,
52  * it will be choppy, moving up on the seconds that have an event,
53  * and then decaying until the next event.  At rates slower than
54  * about one in 32 seconds, it decays all the way back to zero between
55  * each event.
56  */
57 
#define FM_COEF 933		/* per-tick decay coefficient (scaled by FM_SCALE) for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* cap on decay steps per update; useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale (3 decimal digits) */
62 
63 /* Initialize a frequency meter */
fmeter_init(struct fmeter * fmp)64 void fmeter_init(struct fmeter *fmp)
65 {
66 	fmp->cnt = 0;
67 	fmp->val = 0;
68 	fmp->time = 0;
69 	spin_lock_init(&fmp->lock);
70 }
71 
72 /* Internal meter update - process cnt events and update value */
fmeter_update(struct fmeter * fmp)73 static void fmeter_update(struct fmeter *fmp)
74 {
75 	time64_t now;
76 	u32 ticks;
77 
78 	now = ktime_get_seconds();
79 	ticks = now - fmp->time;
80 
81 	if (ticks == 0)
82 		return;
83 
84 	ticks = min(FM_MAXTICKS, ticks);
85 	while (ticks-- > 0)
86 		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
87 	fmp->time = now;
88 
89 	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
90 	fmp->cnt = 0;
91 }
92 
93 /* Process any previous ticks, then bump cnt by one (times scale). */
fmeter_markevent(struct fmeter * fmp)94 static void fmeter_markevent(struct fmeter *fmp)
95 {
96 	spin_lock(&fmp->lock);
97 	fmeter_update(fmp);
98 	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
99 	spin_unlock(&fmp->lock);
100 }
101 
102 /* Process any previous ticks, then return current value. */
fmeter_getrate(struct fmeter * fmp)103 static int fmeter_getrate(struct fmeter *fmp)
104 {
105 	int val;
106 
107 	spin_lock(&fmp->lock);
108 	fmeter_update(fmp);
109 	val = fmp->val;
110 	spin_unlock(&fmp->lock);
111 	return val;
112 }
113 
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 *
 * Any non-zero value written to that file sets the flag to 1;
 * writing 0 clears it again.
 */

int cpuset_memory_pressure_enabled __read_mostly;
121 
122 /*
123  * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
124  *
125  * Keep a running average of the rate of synchronous (direct)
126  * page reclaim efforts initiated by tasks in each cpuset.
127  *
128  * This represents the rate at which some task in the cpuset
129  * ran low on memory on all nodes it was allowed to use, and
130  * had to enter the kernels page reclaim code in an effort to
131  * create more free memory by tossing clean pages or swapping
132  * or writing dirty pages.
133  *
134  * Display to user space in the per-cpuset read-only file
135  * "memory_pressure".  Value displayed is an integer
136  * representing the recent rate of entry into the synchronous
137  * (direct) page reclaim by any task attached to the cpuset.
138  */
139 
__cpuset_memory_pressure_bump(void)140 void __cpuset_memory_pressure_bump(void)
141 {
142 	rcu_read_lock();
143 	fmeter_markevent(&task_cs(current)->fmeter);
144 	rcu_read_unlock();
145 }
146 
/*
 * update_relax_domain_level - store a new relax_domain_level in @cs.
 *
 * With CONFIG_SMP, values outside [-1, sched_domain_level_max + 1] are
 * rejected with -EINVAL.  When the value actually changes and @cs has
 * CPUs and load balancing enabled, the sched domains are rebuilt.
 *
 * Returns 0 on success, including when the value is unchanged.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	/* Nothing to do when the level is already what was asked for. */
	if (val == cs->relax_domain_level)
		return 0;

	cs->relax_domain_level = val;

	if (cpumask_empty(cs->cpus_allowed) || !is_sched_load_balance(cs))
		return 0;

	rebuild_sched_domains_locked();
	return 0;
}
163 
/*
 * cpuset_write_s64 - write handler for the signed 64-bit control files.
 *
 * Takes cpus_read_lock() and the cpuset lock around the update.
 * Returns -ENODEV if the cpuset is no longer online, -EINVAL for a file
 * type this handler does not serve, otherwise the result of the update.
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	cpuset_filetype_t type = cft->private;
	struct cpuset *cs = css_cs(css);
	int retval;

	cpus_read_lock();
	cpuset_lock();

	if (!is_cpuset_online(cs))
		retval = -ENODEV;
	else if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
		retval = update_relax_domain_level(cs, val);
	else
		retval = -EINVAL;

	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}
189 
cpuset_read_s64(struct cgroup_subsys_state * css,struct cftype * cft)190 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
191 {
192 	struct cpuset *cs = css_cs(css);
193 	cpuset_filetype_t type = cft->private;
194 
195 	switch (type) {
196 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
197 		return cs->relax_domain_level;
198 	default:
199 		BUG();
200 	}
201 
202 	/* Unreachable but makes gcc happy */
203 	return 0;
204 }
205 
206 /*
207  * update task's spread flag if cpuset's page/slab spread flag is set
208  *
209  * Call with callback_lock or cpuset_mutex held. The check can be skipped
210  * if on default hierarchy.
211  */
cpuset1_update_task_spread_flags(struct cpuset * cs,struct task_struct * tsk)212 void cpuset1_update_task_spread_flags(struct cpuset *cs,
213 					struct task_struct *tsk)
214 {
215 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
216 		return;
217 
218 	if (is_spread_page(cs))
219 		task_set_spread_page(tsk);
220 	else
221 		task_clear_spread_page(tsk);
222 
223 	if (is_spread_slab(cs))
224 		task_set_spread_slab(tsk);
225 	else
226 		task_clear_spread_slab(tsk);
227 }
228 
229 /**
230  * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
231  * @cs: the cpuset in which each task's spread flags needs to be changed
232  *
233  * Iterate through each task of @cs updating its spread flags.  As this
234  * function is called with cpuset_mutex held, cpuset membership stays
235  * stable.
236  */
cpuset1_update_tasks_flags(struct cpuset * cs)237 void cpuset1_update_tasks_flags(struct cpuset *cs)
238 {
239 	struct css_task_iter it;
240 	struct task_struct *task;
241 
242 	css_task_iter_start(&cs->css, 0, &it);
243 	while ((task = css_task_iter_next(&it)))
244 		cpuset1_update_task_spread_flags(cs, task);
245 	css_task_iter_end(&it);
246 }
247 
248 /*
249  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
250  * or memory nodes, we need to walk over the cpuset hierarchy,
251  * removing that CPU or node from all cpusets.  If this removes the
252  * last CPU or node from a cpuset, then move the tasks in the empty
253  * cpuset to its next-highest non-empty parent.
254  */
remove_tasks_in_empty_cpuset(struct cpuset * cs)255 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
256 {
257 	struct cpuset *parent;
258 
259 	/*
260 	 * Find its next-highest non-empty parent, (top cpuset
261 	 * has online cpus, so can't be empty).
262 	 */
263 	parent = parent_cs(cs);
264 	while (cpumask_empty(parent->cpus_allowed) ||
265 			nodes_empty(parent->mems_allowed))
266 		parent = parent_cs(parent);
267 
268 	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
269 		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
270 		pr_cont_cgroup_name(cs->css.cgroup);
271 		pr_cont("\n");
272 	}
273 }
274 
cpuset_migrate_tasks_workfn(struct work_struct * work)275 static void cpuset_migrate_tasks_workfn(struct work_struct *work)
276 {
277 	struct cpuset_remove_tasks_struct *s;
278 
279 	s = container_of(work, struct cpuset_remove_tasks_struct, work);
280 	remove_tasks_in_empty_cpuset(s->cs);
281 	css_put(&s->cs->css);
282 	kfree(s);
283 }
284 
/*
 * cpuset1_hotplug_update_tasks - apply new CPU/memory masks to @cs.
 * @cs: the cpuset being updated
 * @new_cpus: CPU mask copied into both cpus_allowed and effective_cpus
 * @new_mems: node mask copied into both mems_allowed and effective_mems
 * @cpus_updated: true if the CPU mask changed
 * @mems_updated: true if the node mask changed
 *
 * The masks are written under the callback lock.  Tasks are then
 * updated for each mask that changed and is still non-empty.  If @cs
 * ends up without CPUs or without memory nodes while still populated,
 * a work item is queued to move its tasks to an ancestor.
 */
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	struct cpuset_remove_tasks_struct *req;
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/* Nothing to migrate unless the cpuset is empty yet still populated. */
	if (!is_empty || !cs->css.cgroup->nr_populated_csets)
		return;

	/* The reference taken here is released by the work function. */
	if (!css_tryget_online(&cs->css))
		return;

	/*
	 * Move tasks to the nearest ancestor with execution resources,
	 * This is full cgroup operation which will also call back into
	 * cpuset. Execute it asynchronously using workqueue.
	 */
	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (WARN_ON_ONCE(!req)) {
		css_put(&cs->css);
		return;
	}

	req->cs = cs;
	INIT_WORK(&req->work, cpuset_migrate_tasks_workfn);
	schedule_work(&req->work);
}
330 
331 /*
332  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
333  *
334  * One cpuset is a subset of another if all its allowed CPUs and
335  * Memory Nodes are a subset of the other, and its exclusive flags
336  * are only set if the other's are set.  Call holding cpuset_mutex.
337  */
338 
is_cpuset_subset(const struct cpuset * p,const struct cpuset * q)339 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
340 {
341 	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
342 		nodes_subset(p->mems_allowed, q->mems_allowed) &&
343 		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
344 		is_mem_exclusive(p) <= is_mem_exclusive(q);
345 }
346 
347 /*
348  * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
349  *                            behavior.
350  */
cpuset1_validate_change(struct cpuset * cur,struct cpuset * trial)351 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
352 {
353 	struct cgroup_subsys_state *css;
354 	struct cpuset *c, *par;
355 	int ret;
356 
357 	WARN_ON_ONCE(!rcu_read_lock_held());
358 
359 	/* Each of our child cpusets must be a subset of us */
360 	ret = -EBUSY;
361 	cpuset_for_each_child(c, css, cur)
362 		if (!is_cpuset_subset(c, trial))
363 			goto out;
364 
365 	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
366 	ret = -EACCES;
367 	par = parent_cs(cur);
368 	if (par && !is_cpuset_subset(trial, par))
369 		goto out;
370 
371 	ret = 0;
372 out:
373 	return ret;
374 }
375 
cpuset_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)376 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
377 {
378 	struct cpuset *cs = css_cs(css);
379 	cpuset_filetype_t type = cft->private;
380 
381 	switch (type) {
382 	case FILE_CPU_EXCLUSIVE:
383 		return is_cpu_exclusive(cs);
384 	case FILE_MEM_EXCLUSIVE:
385 		return is_mem_exclusive(cs);
386 	case FILE_MEM_HARDWALL:
387 		return is_mem_hardwall(cs);
388 	case FILE_SCHED_LOAD_BALANCE:
389 		return is_sched_load_balance(cs);
390 	case FILE_MEMORY_MIGRATE:
391 		return is_memory_migrate(cs);
392 	case FILE_MEMORY_PRESSURE_ENABLED:
393 		return cpuset_memory_pressure_enabled;
394 	case FILE_MEMORY_PRESSURE:
395 		return fmeter_getrate(&cs->fmeter);
396 	case FILE_SPREAD_PAGE:
397 		return is_spread_page(cs);
398 	case FILE_SPREAD_SLAB:
399 		return is_spread_slab(cs);
400 	default:
401 		BUG();
402 	}
403 
404 	/* Unreachable but makes gcc happy */
405 	return 0;
406 }
407 
cpuset_write_u64(struct cgroup_subsys_state * css,struct cftype * cft,u64 val)408 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
409 			    u64 val)
410 {
411 	struct cpuset *cs = css_cs(css);
412 	cpuset_filetype_t type = cft->private;
413 	int retval = 0;
414 
415 	cpus_read_lock();
416 	cpuset_lock();
417 	if (!is_cpuset_online(cs)) {
418 		retval = -ENODEV;
419 		goto out_unlock;
420 	}
421 
422 	switch (type) {
423 	case FILE_CPU_EXCLUSIVE:
424 		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
425 		break;
426 	case FILE_MEM_EXCLUSIVE:
427 		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
428 		break;
429 	case FILE_MEM_HARDWALL:
430 		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
431 		break;
432 	case FILE_SCHED_LOAD_BALANCE:
433 		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
434 		break;
435 	case FILE_MEMORY_MIGRATE:
436 		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
437 		break;
438 	case FILE_MEMORY_PRESSURE_ENABLED:
439 		cpuset_memory_pressure_enabled = !!val;
440 		break;
441 	case FILE_SPREAD_PAGE:
442 		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
443 		break;
444 	case FILE_SPREAD_SLAB:
445 		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
446 		break;
447 	default:
448 		retval = -EINVAL;
449 		break;
450 	}
451 out_unlock:
452 	cpuset_unlock();
453 	cpus_read_unlock();
454 	return retval;
455 }
456 
/*
 * Control files of the legacy (v1) cpuset interface.
 *
 * for the common functions, 'private' gives the type of file
 * (a FILE_* value), which the shared read/write handlers use to
 * select the setting to operate on.  The table ends with an empty
 * entry.
 */

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		/* no write handler: read-only */
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		/* no write handler: read-only */
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		/* the only file using the signed 64-bit handlers */
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		/* no write handler: read-only rate from the frequency meter */
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		/* global switch, flagged CFTYPE_ONLY_ON_ROOT */
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
563