1  // SPDX-License-Identifier: GPL-2.0-only
2  #include "cgroup-internal.h"
3  
4  #include <linux/ctype.h>
5  #include <linux/kmod.h>
6  #include <linux/sort.h>
7  #include <linux/delay.h>
8  #include <linux/mm.h>
9  #include <linux/sched/signal.h>
10  #include <linux/sched/task.h>
11  #include <linux/magic.h>
12  #include <linux/slab.h>
13  #include <linux/vmalloc.h>
14  #include <linux/delayacct.h>
15  #include <linux/pid_namespace.h>
16  #include <linux/cgroupstats.h>
17  #include <linux/fs_parser.h>
18  
19  #include <trace/events/cgroup.h>
20  
21  /*
22   * pidlists linger for the following amount of time before being destroyed.
23   * The goal is to avoid frequent destruction in the middle of consecutive
24   * read calls.  Expiring in the middle is a performance problem, not a
25   * correctness one.  1 sec should be enough.
26   */
27  #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
28  
29  /* Controllers blocked by the commandline in v1 */
30  static u16 cgroup_no_v1_mask;
31  
32  /* disable named v1 mounts */
33  static bool cgroup_no_v1_named;
34  
35  /*
36   * pidlist destructions need to be flushed on cgroup destruction.  Use a
37   * separate workqueue as flush domain.
38   */
39  static struct workqueue_struct *cgroup_pidlist_destroy_wq;
40  
41  /* protects cgroup_subsys->release_agent_path */
42  static DEFINE_SPINLOCK(release_agent_path_lock);
43  
44  bool cgroup1_ssid_disabled(int ssid)
45  {
46  	return cgroup_no_v1_mask & (1 << ssid);
47  }
48  
49  static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
50  {
51  	/* Check also dfl_cftypes for file-less controllers, e.g. perf_event */
52  	return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
53  }
54  
55  /**
56   * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
57   * @from: attach to all cgroups of a given task
58   * @tsk: the task to be attached
59   *
60   * Return: %0 on success or a negative errno code on failure
61   */
62  int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
63  {
64  	struct cgroup_root *root;
65  	int retval = 0;
66  
67  	cgroup_lock();
68  	cgroup_attach_lock(true);
69  	for_each_root(root) {
70  		struct cgroup *from_cgrp;
71  
72  		spin_lock_irq(&css_set_lock);
73  		from_cgrp = task_cgroup_from_root(from, root);
74  		spin_unlock_irq(&css_set_lock);
75  
76  		retval = cgroup_attach_task(from_cgrp, tsk, false);
77  		if (retval)
78  			break;
79  	}
80  	cgroup_attach_unlock(true);
81  	cgroup_unlock();
82  
83  	return retval;
84  }
85  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
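/*
 * Illustrative sketch of a typical in-kernel caller: a driver that spawns a
 * helper kthread on behalf of a userspace task can mirror that task's
 * per-hierarchy cgroup memberships (names below are hypothetical):
 *
 *	worker = kthread_create(worker_fn, data, "my-helper");
 *	if (!IS_ERR(worker))
 *		ret = cgroup_attach_task_all(current, worker);
 *
 * Because the loop above breaks on the first error, a failing attach may
 * leave the worker moved in some hierarchies but not others.
 */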
86  
87  /**
88   * cgroup_transfer_tasks - move tasks from one cgroup to another
89   * @to: cgroup to which the tasks will be moved
90   * @from: cgroup in which the tasks currently reside
91   *
92   * Locking rules between cgroup_post_fork() and the migration path
93   * guarantee that, if a task is forking while being migrated, the new
94   * child is either visible in the source cgroup after the parent's
95   * migration is complete or put into the target cgroup.  No task
96   * can slip out of migration through forking.
97   *
98   * Return: %0 on success or a negative errno code on failure
99   */
100  int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
101  {
102  	DEFINE_CGROUP_MGCTX(mgctx);
103  	struct cgrp_cset_link *link;
104  	struct css_task_iter it;
105  	struct task_struct *task;
106  	int ret;
107  
108  	if (cgroup_on_dfl(to))
109  		return -EINVAL;
110  
111  	ret = cgroup_migrate_vet_dst(to);
112  	if (ret)
113  		return ret;
114  
115  	cgroup_lock();
116  
117  	cgroup_attach_lock(true);
118  
119  	/* all tasks in @from are being moved, all csets are source */
120  	spin_lock_irq(&css_set_lock);
121  	list_for_each_entry(link, &from->cset_links, cset_link)
122  		cgroup_migrate_add_src(link->cset, to, &mgctx);
123  	spin_unlock_irq(&css_set_lock);
124  
125  	ret = cgroup_migrate_prepare_dst(&mgctx);
126  	if (ret)
127  		goto out_err;
128  
129  	/*
130  	 * Migrate tasks one-by-one until @from is empty.  This fails iff
131  	 * ->can_attach() fails.
132  	 */
133  	do {
134  		css_task_iter_start(&from->self, 0, &it);
135  
136  		do {
137  			task = css_task_iter_next(&it);
138  		} while (task && (task->flags & PF_EXITING));
139  
140  		if (task)
141  			get_task_struct(task);
142  		css_task_iter_end(&it);
143  
144  		if (task) {
145  			ret = cgroup_migrate(task, false, &mgctx);
146  			if (!ret)
147  				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
148  			put_task_struct(task);
149  		}
150  	} while (task && !ret);
151  out_err:
152  	cgroup_migrate_finish(&mgctx);
153  	cgroup_attach_unlock(true);
154  	cgroup_unlock();
155  	return ret;
156  }
157  
158  /*
159   * Stuff for reading the 'tasks'/'procs' files.
160   *
161   * Reading these files can return large amounts of data if a cgroup has
162   * *lots* of attached tasks. So it may need several calls to read(),
163   * but we cannot guarantee that the information we produce is correct
164   * unless we produce it entirely atomically.
165   *
166   */
167  
168  /* which pidlist file are we talking about? */
169  enum cgroup_filetype {
170  	CGROUP_FILE_PROCS,
171  	CGROUP_FILE_TASKS,
172  };
173  
174  /*
175   * A pidlist is a list of pids that virtually represents the contents of one
176   * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
177   * a pair (one each for procs, tasks) for each pid namespace that's relevant
178   * to the cgroup.
179   */
180  struct cgroup_pidlist {
181  	/*
182  	 * used to find which pidlist is wanted. doesn't change as long as
183  	 * this particular list stays in the list.
184  	*/
185  	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
186  	/* array of xids */
187  	pid_t *list;
188  	/* how many elements the above list has */
189  	int length;
190  	/* each of these stored in a list by its cgroup */
191  	struct list_head links;
192  	/* pointer to the cgroup we belong to, for list removal purposes */
193  	struct cgroup *owner;
194  	/* for delayed destruction */
195  	struct delayed_work destroy_dwork;
196  };
197  
198  /*
199   * Used to destroy all pidlists still lingering on the destroy timer.
200   * None should be left afterwards.
201   */
202  void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
203  {
204  	struct cgroup_pidlist *l, *tmp_l;
205  
206  	mutex_lock(&cgrp->pidlist_mutex);
207  	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
208  		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
209  	mutex_unlock(&cgrp->pidlist_mutex);
210  
211  	flush_workqueue(cgroup_pidlist_destroy_wq);
212  	BUG_ON(!list_empty(&cgrp->pidlists));
213  }
214  
215  static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
216  {
217  	struct delayed_work *dwork = to_delayed_work(work);
218  	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
219  						destroy_dwork);
220  	struct cgroup_pidlist *tofree = NULL;
221  
222  	mutex_lock(&l->owner->pidlist_mutex);
223  
224  	/*
225  	 * Destroy iff we didn't get queued again.  The state won't change
226  	 * as destroy_dwork can only be queued while locked.
227  	 */
228  	if (!delayed_work_pending(dwork)) {
229  		list_del(&l->links);
230  		kvfree(l->list);
231  		put_pid_ns(l->key.ns);
232  		tofree = l;
233  	}
234  
235  	mutex_unlock(&l->owner->pidlist_mutex);
236  	kfree(tofree);
237  }
238  
239  /*
240   * pidlist_uniq - given a sorted pid list, strip out all duplicate entries
241   * in place.  Returns the number of unique elements.
242   */
243  static int pidlist_uniq(pid_t *list, int length)
244  {
245  	int src, dest = 1;
246  
247  	/*
248  	 * we presume the 0th element is unique, so src starts at 1.  Trivial
249  	 * edge cases first; no work needs to be done for either.
250  	 */
251  	if (length == 0 || length == 1)
252  		return length;
253  	/* src and dest walk down the list; dest counts unique elements */
254  	for (src = 1; src < length; src++) {
255  		/* find next unique element */
256  		while (list[src] == list[src-1]) {
257  			src++;
258  			if (src == length)
259  				goto after;
260  		}
261  		/* dest always points to where the next unique element goes */
262  		list[dest] = list[src];
263  		dest++;
264  	}
265  after:
266  	return dest;
267  }
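/*
 * Worked example: a sorted input of {5, 5, 7, 9, 9, 9} with length == 6 is
 * compacted in place so that the first three slots hold {5, 7, 9} and the
 * function returns 3.  The trailing slots are left untouched; callers only
 * look at the returned length.
 */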
268  
269  /*
270   * The two pid files - tasks and cgroup.procs - guarantee that the result
271   * is sorted, which forced this whole pidlist fiasco.  As pid order is
272   * different per namespace, each namespace needs a differently sorted list,
273   * making it impossible to use, for example, a single rbtree of member tasks
274   * sorted by task pointer.  As pidlists can be fairly large, allocating one
275   * per open file is dangerous, so cgroup had to implement a shared pool of
276   * pidlists keyed by cgroup and namespace.
277   */
278  static int cmppid(const void *a, const void *b)
279  {
280  	return *(pid_t *)a - *(pid_t *)b;
281  }
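/*
 * Note: plain subtraction is safe here because pids are non-negative and
 * bounded well below INT_MAX, so the difference cannot overflow.
 */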
282  
283  static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
284  						  enum cgroup_filetype type)
285  {
286  	struct cgroup_pidlist *l;
287  	/* don't need task_nsproxy() if we're looking at ourself */
288  	struct pid_namespace *ns = task_active_pid_ns(current);
289  
290  	lockdep_assert_held(&cgrp->pidlist_mutex);
291  
292  	list_for_each_entry(l, &cgrp->pidlists, links)
293  		if (l->key.type == type && l->key.ns == ns)
294  			return l;
295  	return NULL;
296  }
297  
298  /*
299   * Find the appropriate pidlist for our purpose (given procs vs tasks),
300   * creating one if necessary.  Must be called with cgrp->pidlist_mutex
301   * held; the mutex is kept held whether we return an existing list, a
302   * new one, or NULL if we're out of memory.
303   */
304  static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
305  						enum cgroup_filetype type)
306  {
307  	struct cgroup_pidlist *l;
308  
309  	lockdep_assert_held(&cgrp->pidlist_mutex);
310  
311  	l = cgroup_pidlist_find(cgrp, type);
312  	if (l)
313  		return l;
314  
315  	/* entry not found; create a new one */
316  	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
317  	if (!l)
318  		return l;
319  
320  	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
321  	l->key.type = type;
322  	/* don't need task_nsproxy() if we're looking at ourself */
323  	l->key.ns = get_pid_ns(task_active_pid_ns(current));
324  	l->owner = cgrp;
325  	list_add(&l->links, &cgrp->pidlists);
326  	return l;
327  }
328  
329  /*
330   * Load a cgroup's pidarray with either procs' tgids or tasks' pids
331   */
332  static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
333  			      struct cgroup_pidlist **lp)
334  {
335  	pid_t *array;
336  	int length;
337  	int pid, n = 0; /* used for populating the array */
338  	struct css_task_iter it;
339  	struct task_struct *tsk;
340  	struct cgroup_pidlist *l;
341  
342  	lockdep_assert_held(&cgrp->pidlist_mutex);
343  
344  	/*
345  	 * If cgroup gets more users after we read count, we won't have
346  	 * enough space - tough.  This race is indistinguishable to the
347  	 * caller from the case that the additional cgroup users didn't
348  	 * show up until sometime later on.
349  	 */
350  	length = cgroup_task_count(cgrp);
351  	array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
352  	if (!array)
353  		return -ENOMEM;
354  	/* now, populate the array */
355  	css_task_iter_start(&cgrp->self, 0, &it);
356  	while ((tsk = css_task_iter_next(&it))) {
357  		if (unlikely(n == length))
358  			break;
359  		/* get tgid or pid for procs or tasks file respectively */
360  		if (type == CGROUP_FILE_PROCS)
361  			pid = task_tgid_vnr(tsk);
362  		else
363  			pid = task_pid_vnr(tsk);
364  		if (pid > 0) /* make sure to only use valid results */
365  			array[n++] = pid;
366  	}
367  	css_task_iter_end(&it);
368  	length = n;
369  	/* now sort & strip out duplicates (tgids or recycled thread PIDs) */
370  	sort(array, length, sizeof(pid_t), cmppid, NULL);
371  	length = pidlist_uniq(array, length);
372  
373  	l = cgroup_pidlist_find_create(cgrp, type);
374  	if (!l) {
375  		kvfree(array);
376  		return -ENOMEM;
377  	}
378  
379  	/* store array, freeing old if necessary */
380  	kvfree(l->list);
381  	l->list = array;
382  	l->length = length;
383  	*lp = l;
384  	return 0;
385  }
386  
387  /*
388   * seq_file methods for the tasks/procs files. The seq_file position is the
389   * next pid to display; the seq_file iterator is a pointer to the pid
390   * in the pidlist's ->list array.
391   */
392  
393  static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
394  {
395  	/*
396  	 * Initially we receive a position value that corresponds to
397  	 * one more than the last pid shown (or 0 on the first call or
398  	 * after a seek to the start).  Use a binary search to find the
399  	 * next pid to display, if any.
400  	 */
401  	struct kernfs_open_file *of = s->private;
402  	struct cgroup_file_ctx *ctx = of->priv;
403  	struct cgroup *cgrp = seq_css(s)->cgroup;
404  	struct cgroup_pidlist *l;
405  	enum cgroup_filetype type = seq_cft(s)->private;
406  	int index = 0, pid = *pos;
407  	int *iter, ret;
408  
409  	mutex_lock(&cgrp->pidlist_mutex);
410  
411  	/*
412  	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first
413  	 * start() after open. If the matching pidlist is around, we can use
414  	 * that. Look for it. Note that @ctx->procs1.pidlist can't be used
415  	 * directly. It could already have been destroyed.
416  	 */
417  	if (ctx->procs1.pidlist)
418  		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
419  
420  	/*
421  	 * Either this is the first start() after open or the matching
422  	 * pidlist has been destroyed in between.  Create a new one.
423  	 */
424  	if (!ctx->procs1.pidlist) {
425  		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
426  		if (ret)
427  			return ERR_PTR(ret);
428  	}
429  	l = ctx->procs1.pidlist;
430  
431  	if (pid) {
432  		int end = l->length;
433  
434  		while (index < end) {
435  			int mid = (index + end) / 2;
436  			if (l->list[mid] == pid) {
437  				index = mid;
438  				break;
439  			} else if (l->list[mid] < pid)
440  				index = mid + 1;
441  			else
442  				end = mid;
443  		}
444  	}
445  	/* If we're off the end of the array, we're done */
446  	if (index >= l->length)
447  		return NULL;
448  	/* Update the abstract position to be the actual pid that we found */
449  	iter = l->list + index;
450  	*pos = *iter;
451  	return iter;
452  }
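/*
 * Example of the position handling above: with l->list == {3, 7, 12},
 * resuming a read at *pos == 8 binary-searches down to index 2, sets *pos
 * to 12 and yields the pid 12, while *pos == 13 runs off the end of the
 * array and returns NULL, terminating the sequence.
 */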
453  
454  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
455  {
456  	struct kernfs_open_file *of = s->private;
457  	struct cgroup_file_ctx *ctx = of->priv;
458  	struct cgroup_pidlist *l = ctx->procs1.pidlist;
459  
460  	if (l)
461  		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
462  				 CGROUP_PIDLIST_DESTROY_DELAY);
463  	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
464  }
465  
466  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
467  {
468  	struct kernfs_open_file *of = s->private;
469  	struct cgroup_file_ctx *ctx = of->priv;
470  	struct cgroup_pidlist *l = ctx->procs1.pidlist;
471  	pid_t *p = v;
472  	pid_t *end = l->list + l->length;
473  	/*
474  	 * Advance to the next pid in the array. If this goes off the
475  	 * end, we're done
476  	 */
477  	p++;
478  	if (p >= end) {
479  		(*pos)++;
480  		return NULL;
481  	} else {
482  		*pos = *p;
483  		return p;
484  	}
485  }
486  
487  static int cgroup_pidlist_show(struct seq_file *s, void *v)
488  {
489  	seq_printf(s, "%d\n", *(int *)v);
490  
491  	return 0;
492  }
493  
494  static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
495  				     char *buf, size_t nbytes, loff_t off,
496  				     bool threadgroup)
497  {
498  	struct cgroup *cgrp;
499  	struct task_struct *task;
500  	const struct cred *cred, *tcred;
501  	ssize_t ret;
502  	bool locked;
503  
504  	cgrp = cgroup_kn_lock_live(of->kn, false);
505  	if (!cgrp)
506  		return -ENODEV;
507  
508  	task = cgroup_procs_write_start(buf, threadgroup, &locked);
509  	ret = PTR_ERR_OR_ZERO(task);
510  	if (ret)
511  		goto out_unlock;
512  
513  	/*
514  	 * Even if we're attaching all tasks in the thread group, we only need
515  	 * to check permissions on one of them. Check permissions using the
516  	 * credentials from file open to protect against inherited fd attacks.
517  	 */
518  	cred = of->file->f_cred;
519  	tcred = get_task_cred(task);
520  	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
521  	    !uid_eq(cred->euid, tcred->uid) &&
522  	    !uid_eq(cred->euid, tcred->suid))
523  		ret = -EACCES;
524  	put_cred(tcred);
525  	if (ret)
526  		goto out_finish;
527  
528  	ret = cgroup_attach_task(cgrp, task, threadgroup);
529  
530  out_finish:
531  	cgroup_procs_write_finish(task, locked);
532  out_unlock:
533  	cgroup_kn_unlock(of->kn);
534  
535  	return ret ?: nbytes;
536  }
537  
538  static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
539  				   char *buf, size_t nbytes, loff_t off)
540  {
541  	return __cgroup1_procs_write(of, buf, nbytes, off, true);
542  }
543  
544  static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
545  				   char *buf, size_t nbytes, loff_t off)
546  {
547  	return __cgroup1_procs_write(of, buf, nbytes, off, false);
548  }
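/*
 * From userspace the only difference between the two files is the
 * threadgroup flag passed above; e.g. (hypothetical paths):
 *
 *	# move a whole thread group
 *	echo $$ > /sys/fs/cgroup/cpu/grp/cgroup.procs
 *	# move a single thread only
 *	echo <tid> > /sys/fs/cgroup/cpu/grp/tasks
 */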
549  
550  static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
551  					  char *buf, size_t nbytes, loff_t off)
552  {
553  	struct cgroup *cgrp;
554  	struct cgroup_file_ctx *ctx;
555  
556  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
557  
558  	/*
559  	 * The release agent is invoked with all capabilities, so require
560  	 * CAP_SYS_ADMIN in the initial user namespace to set it.
561  	 */
562  	ctx = of->priv;
563  	if ((ctx->ns->user_ns != &init_user_ns) ||
564  	    !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
565  		return -EPERM;
566  
567  	cgrp = cgroup_kn_lock_live(of->kn, false);
568  	if (!cgrp)
569  		return -ENODEV;
570  	spin_lock(&release_agent_path_lock);
571  	strscpy(cgrp->root->release_agent_path, strstrip(buf),
572  		sizeof(cgrp->root->release_agent_path));
573  	spin_unlock(&release_agent_path_lock);
574  	cgroup_kn_unlock(of->kn);
575  	return nbytes;
576  }
577  
578  static int cgroup_release_agent_show(struct seq_file *seq, void *v)
579  {
580  	struct cgroup *cgrp = seq_css(seq)->cgroup;
581  
582  	spin_lock(&release_agent_path_lock);
583  	seq_puts(seq, cgrp->root->release_agent_path);
584  	spin_unlock(&release_agent_path_lock);
585  	seq_putc(seq, '\n');
586  	return 0;
587  }
588  
589  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
590  {
591  	seq_puts(seq, "0\n");
592  	return 0;
593  }
594  
595  static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
596  					 struct cftype *cft)
597  {
598  	return notify_on_release(css->cgroup);
599  }
600  
601  static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
602  					  struct cftype *cft, u64 val)
603  {
604  	if (val)
605  		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
606  	else
607  		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
608  	return 0;
609  }
610  
611  static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
612  				      struct cftype *cft)
613  {
614  	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
615  }
616  
617  static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
618  				       struct cftype *cft, u64 val)
619  {
620  	if (val)
621  		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
622  	else
623  		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
624  	return 0;
625  }
626  
627  /* cgroup core interface files for the legacy hierarchies */
628  struct cftype cgroup1_base_files[] = {
629  	{
630  		.name = "cgroup.procs",
631  		.seq_start = cgroup_pidlist_start,
632  		.seq_next = cgroup_pidlist_next,
633  		.seq_stop = cgroup_pidlist_stop,
634  		.seq_show = cgroup_pidlist_show,
635  		.private = CGROUP_FILE_PROCS,
636  		.write = cgroup1_procs_write,
637  	},
638  	{
639  		.name = "cgroup.clone_children",
640  		.read_u64 = cgroup_clone_children_read,
641  		.write_u64 = cgroup_clone_children_write,
642  	},
643  	{
644  		.name = "cgroup.sane_behavior",
645  		.flags = CFTYPE_ONLY_ON_ROOT,
646  		.seq_show = cgroup_sane_behavior_show,
647  	},
648  	{
649  		.name = "tasks",
650  		.seq_start = cgroup_pidlist_start,
651  		.seq_next = cgroup_pidlist_next,
652  		.seq_stop = cgroup_pidlist_stop,
653  		.seq_show = cgroup_pidlist_show,
654  		.private = CGROUP_FILE_TASKS,
655  		.write = cgroup1_tasks_write,
656  	},
657  	{
658  		.name = "notify_on_release",
659  		.read_u64 = cgroup_read_notify_on_release,
660  		.write_u64 = cgroup_write_notify_on_release,
661  	},
662  	{
663  		.name = "release_agent",
664  		.flags = CFTYPE_ONLY_ON_ROOT,
665  		.seq_show = cgroup_release_agent_show,
666  		.write = cgroup_release_agent_write,
667  		.max_write_len = PATH_MAX - 1,
668  	},
669  	{ }	/* terminate */
670  };
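/*
 * Typical legacy usage of the files above, assuming a v1 hierarchy mounted
 * at /sys/fs/cgroup/cpu (paths are illustrative):
 *
 *	echo 1 > /sys/fs/cgroup/cpu/grp/notify_on_release
 *	echo /usr/local/sbin/cgroup-cleanup > /sys/fs/cgroup/cpu/release_agent
 *
 * Once "grp" becomes empty and has no online children, the configured
 * agent is invoked with the cgroup's path as its single argument (see
 * cgroup1_release_agent() below).
 */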
671  
672  /* Display information about each subsystem and each hierarchy */
673  int proc_cgroupstats_show(struct seq_file *m, void *v)
674  {
675  	struct cgroup_subsys *ss;
676  	int i;
677  
678  	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
679  	/*
680  	 * Grab the subsystems' state racily.  No need to add another avenue
681  	 * for cgroup_mutex contention.
682  	 */
683  
684  	for_each_subsys(ss, i) {
685  		if (cgroup1_subsys_absent(ss))
686  			continue;
687  		seq_printf(m, "%s\t%d\t%d\t%d\n",
688  			   ss->legacy_name, ss->root->hierarchy_id,
689  			   atomic_read(&ss->root->nr_cgrps),
690  			   cgroup_ssid_enabled(i));
691  	}
692  
693  	return 0;
694  }
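/*
 * This backs /proc/cgroups; the output looks roughly like the following
 * (the values are illustrative):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	0	1	1
 *	cpu	2	14	1
 *	memory	0	1	0
 */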
695  
696  /**
697   * cgroupstats_build - build and fill cgroupstats
698   * @stats: cgroupstats to fill information into
699   * @dentry: A dentry entry belonging to the cgroup for which stats have
700   * been requested.
701   *
702   * Build and fill cgroupstats so that taskstats can export it to user
703   * space.
704   *
705   * Return: %0 on success or a negative errno code on failure
706   */
707  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
708  {
709  	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
710  	struct cgroup *cgrp;
711  	struct css_task_iter it;
712  	struct task_struct *tsk;
713  
714  	/* it must be a kernfs_node belonging to cgroupfs and a directory */
715  	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
716  	    kernfs_type(kn) != KERNFS_DIR)
717  		return -EINVAL;
718  
719  	/*
720  	 * We aren't being called from kernfs and there's no guarantee on
721  	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
722  	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
723  	 */
724  	rcu_read_lock();
725  	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
726  	if (!cgrp || !cgroup_tryget(cgrp)) {
727  		rcu_read_unlock();
728  		return -ENOENT;
729  	}
730  	rcu_read_unlock();
731  
732  	css_task_iter_start(&cgrp->self, 0, &it);
733  	while ((tsk = css_task_iter_next(&it))) {
734  		switch (READ_ONCE(tsk->__state)) {
735  		case TASK_RUNNING:
736  			stats->nr_running++;
737  			break;
738  		case TASK_INTERRUPTIBLE:
739  			stats->nr_sleeping++;
740  			break;
741  		case TASK_UNINTERRUPTIBLE:
742  			stats->nr_uninterruptible++;
743  			break;
744  		case TASK_STOPPED:
745  			stats->nr_stopped++;
746  			break;
747  		default:
748  			if (tsk->in_iowait)
749  				stats->nr_io_wait++;
750  			break;
751  		}
752  	}
753  	css_task_iter_end(&it);
754  
755  	cgroup_put(cgrp);
756  	return 0;
757  }
758  
759  void cgroup1_check_for_release(struct cgroup *cgrp)
760  {
761  	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
762  	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
763  		schedule_work(&cgrp->release_agent_work);
764  }
765  
766  /*
767   * Notify userspace when a cgroup is released, by running the
768   * configured release agent with the name of the cgroup (path
769   * relative to the root of cgroup file system) as the argument.
770   *
771   * Most likely, this user command will try to rmdir this cgroup.
772   *
773   * This races with the possibility that some other task will be
774   * attached to this cgroup before it is removed, or that some other
775   * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
776   * The presumed 'rmdir' will fail quietly if this cgroup is no longer
777   * unused, and this cgroup will be reprieved from its death sentence,
778   * to continue to serve a useful existence.  Next time it's released,
779   * we will get notified again, if it still has 'notify_on_release' set.
780   *
781   * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
782   * means only wait until the task is successfully execve()'d.  The
783   * separate release agent task is forked by call_usermodehelper(),
784   * then control in this thread returns here, without waiting for the
785   * release agent task.  We don't bother to wait because the caller of
786   * this routine has no use for the exit status of the release agent
787   * task, so no sense holding our caller up for that.
788   */
789  void cgroup1_release_agent(struct work_struct *work)
790  {
791  	struct cgroup *cgrp =
792  		container_of(work, struct cgroup, release_agent_work);
793  	char *pathbuf, *agentbuf;
794  	char *argv[3], *envp[3];
795  	int ret;
796  
797  	/* snoop agent path and exit early if empty */
798  	if (!cgrp->root->release_agent_path[0])
799  		return;
800  
801  	/* prepare argument buffers */
802  	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
803  	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
804  	if (!pathbuf || !agentbuf)
805  		goto out_free;
806  
807  	spin_lock(&release_agent_path_lock);
808  	strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
809  	spin_unlock(&release_agent_path_lock);
810  	if (!agentbuf[0])
811  		goto out_free;
812  
813  	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
814  	if (ret < 0)
815  		goto out_free;
816  
817  	argv[0] = agentbuf;
818  	argv[1] = pathbuf;
819  	argv[2] = NULL;
820  
821  	/* minimal command environment */
822  	envp[0] = "HOME=/";
823  	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
824  	envp[2] = NULL;
825  
826  	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
827  out_free:
828  	kfree(agentbuf);
829  	kfree(pathbuf);
830  }
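/*
 * The agent itself is usually a trivial helper; a minimal sketch, assuming
 * the hierarchy is mounted at /sys/fs/cgroup/cpu:
 *
 *	#!/bin/sh
 *	# $1 is the released cgroup's path relative to the hierarchy root
 *	rmdir "/sys/fs/cgroup/cpu$1"
 */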
831  
832  /*
833   * cgroup1_rename - Only allow simple rename of directories in place.
834   */
835  static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
836  			  const char *new_name_str)
837  {
838  	struct cgroup *cgrp = kn->priv;
839  	int ret;
840  
841  	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
842  	if (strchr(new_name_str, '\n'))
843  		return -EINVAL;
844  
845  	if (kernfs_type(kn) != KERNFS_DIR)
846  		return -ENOTDIR;
847  	if (kn->parent != new_parent)
848  		return -EIO;
849  
850  	/*
851  	 * We're gonna grab cgroup_mutex which nests outside kernfs
852  	 * active_ref.  kernfs_rename() doesn't require active_ref
853  	 * protection.  Break them before grabbing cgroup_mutex.
854  	 */
855  	kernfs_break_active_protection(new_parent);
856  	kernfs_break_active_protection(kn);
857  
858  	cgroup_lock();
859  
860  	ret = kernfs_rename(kn, new_parent, new_name_str);
861  	if (!ret)
862  		TRACE_CGROUP_PATH(rename, cgrp);
863  
864  	cgroup_unlock();
865  
866  	kernfs_unbreak_active_protection(kn);
867  	kernfs_unbreak_active_protection(new_parent);
868  	return ret;
869  }
870  
871  static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
872  {
873  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
874  	struct cgroup_subsys *ss;
875  	int ssid;
876  
877  	for_each_subsys(ss, ssid)
878  		if (root->subsys_mask & (1 << ssid))
879  			seq_show_option(seq, ss->legacy_name, NULL);
880  	if (root->flags & CGRP_ROOT_NOPREFIX)
881  		seq_puts(seq, ",noprefix");
882  	if (root->flags & CGRP_ROOT_XATTR)
883  		seq_puts(seq, ",xattr");
884  	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
885  		seq_puts(seq, ",cpuset_v2_mode");
886  	if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
887  		seq_puts(seq, ",favordynmods");
888  
889  	spin_lock(&release_agent_path_lock);
890  	if (strlen(root->release_agent_path))
891  		seq_show_option(seq, "release_agent",
892  				root->release_agent_path);
893  	spin_unlock(&release_agent_path_lock);
894  
895  	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
896  		seq_puts(seq, ",clone_children");
897  	if (strlen(root->name))
898  		seq_show_option(seq, "name", root->name);
899  	return 0;
900  }
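/*
 * The resulting option string shows up in /proc/mounts; e.g. a cpuset-only
 * hierarchy mounted with clone_children might appear as (illustrative):
 *
 *	cgroup /sys/fs/cgroup/cpuset cgroup rw,cpuset,clone_children 0 0
 */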
901  
902  enum cgroup1_param {
903  	Opt_all,
904  	Opt_clone_children,
905  	Opt_cpuset_v2_mode,
906  	Opt_name,
907  	Opt_none,
908  	Opt_noprefix,
909  	Opt_release_agent,
910  	Opt_xattr,
911  	Opt_favordynmods,
912  	Opt_nofavordynmods,
913  };
914  
915  const struct fs_parameter_spec cgroup1_fs_parameters[] = {
916  	fsparam_flag  ("all",		Opt_all),
917  	fsparam_flag  ("clone_children", Opt_clone_children),
918  	fsparam_flag  ("cpuset_v2_mode", Opt_cpuset_v2_mode),
919  	fsparam_string("name",		Opt_name),
920  	fsparam_flag  ("none",		Opt_none),
921  	fsparam_flag  ("noprefix",	Opt_noprefix),
922  	fsparam_string("release_agent",	Opt_release_agent),
923  	fsparam_flag  ("xattr",		Opt_xattr),
924  	fsparam_flag  ("favordynmods",	Opt_favordynmods),
925  	fsparam_flag  ("nofavordynmods", Opt_nofavordynmods),
926  	{}
927  };
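/*
 * These map directly to mount options; e.g. (illustrative commands):
 *
 *	mount -t cgroup -o cpu,cpuacct cgroup /sys/fs/cgroup/cpu,cpuacct
 *	mount -t cgroup -o none,name=mygrp cgroup /sys/fs/cgroup/mygrp
 *
 * The first co-mounts two controllers on one hierarchy; the second creates
 * a named, controller-less hierarchy.
 */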
928  
929  int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
930  {
931  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
932  	struct cgroup_subsys *ss;
933  	struct fs_parse_result result;
934  	int opt, i;
935  
936  	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
937  	if (opt == -ENOPARAM) {
938  		int ret;
939  
940  		ret = vfs_parse_fs_param_source(fc, param);
941  		if (ret != -ENOPARAM)
942  			return ret;
943  		for_each_subsys(ss, i) {
944  			if (strcmp(param->key, ss->legacy_name) ||
945  			    cgroup1_subsys_absent(ss))
946  				continue;
947  			if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
948  				return invalfc(fc, "Disabled controller '%s'",
949  					       param->key);
950  			ctx->subsys_mask |= (1 << i);
951  			return 0;
952  		}
953  		return invalfc(fc, "Unknown subsys name '%s'", param->key);
954  	}
955  	if (opt < 0)
956  		return opt;
957  
958  	switch (opt) {
959  	case Opt_none:
960  		/* Explicitly have no subsystems */
961  		ctx->none = true;
962  		break;
963  	case Opt_all:
964  		ctx->all_ss = true;
965  		break;
966  	case Opt_noprefix:
967  		ctx->flags |= CGRP_ROOT_NOPREFIX;
968  		break;
969  	case Opt_clone_children:
970  		ctx->cpuset_clone_children = true;
971  		break;
972  	case Opt_cpuset_v2_mode:
973  		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
974  		break;
975  	case Opt_xattr:
976  		ctx->flags |= CGRP_ROOT_XATTR;
977  		break;
978  	case Opt_favordynmods:
979  		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
980  		break;
981  	case Opt_nofavordynmods:
982  		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
983  		break;
984  	case Opt_release_agent:
985  		/* Specifying two release agents is forbidden */
986  		if (ctx->release_agent)
987  			return invalfc(fc, "release_agent respecified");
988  		/*
989  		 * The release agent is invoked with all capabilities, so require
990  		 * CAP_SYS_ADMIN in the initial user namespace to set it.
991  		 */
992  		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
993  			return invalfc(fc, "Setting release_agent not allowed");
994  		ctx->release_agent = param->string;
995  		param->string = NULL;
996  		break;
997  	case Opt_name:
998  		/* blocked by boot param? */
999  		if (cgroup_no_v1_named)
1000  			return -ENOENT;
1001  		/* Can't specify an empty name */
1002  		if (!param->size)
1003  			return invalfc(fc, "Empty name");
1004  		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
1005  			return invalfc(fc, "Name too long");
1006  		/* Must match [\w.-]+ */
1007  		for (i = 0; i < param->size; i++) {
1008  			char c = param->string[i];
1009  			if (isalnum(c))
1010  				continue;
1011  			if ((c == '.') || (c == '-') || (c == '_'))
1012  				continue;
1013  			return invalfc(fc, "Invalid name");
1014  		}
1015  		/* Specifying two names is forbidden */
1016  		if (ctx->name)
1017  			return invalfc(fc, "name respecified");
1018  		ctx->name = param->string;
1019  		param->string = NULL;
1020  		break;
1021  	}
1022  	return 0;
1023  }
1024  
1025  static int check_cgroupfs_options(struct fs_context *fc)
1026  {
1027  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1028  	u16 mask = U16_MAX;
1029  	u16 enabled = 0;
1030  	struct cgroup_subsys *ss;
1031  	int i;
1032  
1033  #ifdef CONFIG_CPUSETS
1034  	mask = ~((u16)1 << cpuset_cgrp_id);
1035  #endif
1036  	for_each_subsys(ss, i)
1037  		if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
1038  		    !cgroup1_subsys_absent(ss))
1039  			enabled |= 1 << i;
1040  
1041  	ctx->subsys_mask &= enabled;
1042  
1043  	/*
1044  	 * In the absence of 'none', 'name=' and subsystem name options,
1045  	 * let's default to 'all'.
1046  	 */
1047  	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
1048  		ctx->all_ss = true;
1049  
1050  	if (ctx->all_ss) {
1051  		/* Mutually exclusive option 'all' + subsystem name */
1052  		if (ctx->subsys_mask)
1053  			return invalfc(fc, "subsys name conflicts with all");
1054  		/* 'all' => select all the subsystems */
1055  		ctx->subsys_mask = enabled;
1056  	}
1057  
1058  	/*
1059  	 * We either have to specify by name or by subsystems. (So all
1060  	 * empty hierarchies must have a name).
1061  	 */
1062  	if (!ctx->subsys_mask && !ctx->name)
1063  		return invalfc(fc, "Need name or subsystem set");
1064  
1065  	/*
1066  	 * Option noprefix was introduced just for backward compatibility
1067  	 * with the old cpuset, so we allow noprefix only if mounting just
1068  	 * the cpuset subsystem.
1069  	 */
1070  	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
1071  		return invalfc(fc, "noprefix used incorrectly");
1072  
1073  	/* Can't specify "none" and some subsystems */
1074  	if (ctx->subsys_mask && ctx->none)
1075  		return invalfc(fc, "none used incorrectly");
1076  
1077  	return 0;
1078  }
1079  
1080  int cgroup1_reconfigure(struct fs_context *fc)
1081  {
1082  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1083  	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
1084  	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1085  	int ret = 0;
1086  	u16 added_mask, removed_mask;
1087  
1088  	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1089  
1090  	/* See what subsystems are wanted */
1091  	ret = check_cgroupfs_options(fc);
1092  	if (ret)
1093  		goto out_unlock;
1094  
1095  	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
1096  		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1097  			task_tgid_nr(current), current->comm);
1098  
1099  	added_mask = ctx->subsys_mask & ~root->subsys_mask;
1100  	removed_mask = root->subsys_mask & ~ctx->subsys_mask;
1101  
1102  	/* Don't allow flags or name to change at remount */
1103  	if ((ctx->flags ^ root->flags) ||
1104  	    (ctx->name && strcmp(ctx->name, root->name))) {
1105  		errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
1106  		       ctx->flags, ctx->name ?: "", root->flags, root->name);
1107  		ret = -EINVAL;
1108  		goto out_unlock;
1109  	}
1110  
1111  	/* remounting is not allowed for populated hierarchies */
1112  	if (!list_empty(&root->cgrp.self.children)) {
1113  		ret = -EBUSY;
1114  		goto out_unlock;
1115  	}
1116  
1117  	ret = rebind_subsystems(root, added_mask);
1118  	if (ret)
1119  		goto out_unlock;
1120  
1121  	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1122  
1123  	if (ctx->release_agent) {
1124  		spin_lock(&release_agent_path_lock);
1125  		strcpy(root->release_agent_path, ctx->release_agent);
1126  		spin_unlock(&release_agent_path_lock);
1127  	}
1128  
1129  	trace_cgroup_remount(root);
1130  
1131   out_unlock:
1132  	cgroup_unlock();
1133  	return ret;
1134  }
1135  
1136  struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
1137  	.rename			= cgroup1_rename,
1138  	.show_options		= cgroup1_show_options,
1139  	.mkdir			= cgroup_mkdir,
1140  	.rmdir			= cgroup_rmdir,
1141  	.show_path		= cgroup_show_path,
1142  };
1143  
1144  /*
1145   * The guts of cgroup1 mount - find or create cgroup_root to use.
1146   * Called with cgroup_mutex held; returns 0 on success, a negative errno
1147   * on error, and a positive value when the candidate root is busy dying.
1148   * On success it stashes a reference to the cgroup_root into the given
1149   * cgroup_fs_context; that reference is *NOT* counted towards the
1150   * cgroup_root refcount.
1151   */
1152  static int cgroup1_root_to_use(struct fs_context *fc)
1153  {
1154  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1155  	struct cgroup_root *root;
1156  	struct cgroup_subsys *ss;
1157  	int i, ret;
1158  
1159  	/* First find the desired set of subsystems */
1160  	ret = check_cgroupfs_options(fc);
1161  	if (ret)
1162  		return ret;
1163  
1164  	/*
1165  	 * Destruction of cgroup root is asynchronous, so subsystems may
1166  	 * still be dying after the previous unmount.  Let's drain the
1167  	 * dying subsystems.  We just need to ensure that the ones
1168  	 * unmounted previously finish dying and don't care about new ones
1169  	 * starting.  Testing ref liveness is good enough.
1170  	 */
1171  	for_each_subsys(ss, i) {
1172  		if (!(ctx->subsys_mask & (1 << i)) ||
1173  		    ss->root == &cgrp_dfl_root)
1174  			continue;
1175  
1176  		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
1177  			return 1;	/* restart */
1178  		cgroup_put(&ss->root->cgrp);
1179  	}
1180  
1181  	for_each_root(root) {
1182  		bool name_match = false;
1183  
1184  		if (root == &cgrp_dfl_root)
1185  			continue;
1186  
1187  		/*
1188  		 * If we asked for a name then it must match.  Also, if
1189  		 * name matches but subsys_mask doesn't, we should fail.
1190  		 * Remember whether name matched.
1191  		 */
1192  		if (ctx->name) {
1193  			if (strcmp(ctx->name, root->name))
1194  				continue;
1195  			name_match = true;
1196  		}
1197  
1198  		/*
1199  		 * If we asked for subsystems (or explicitly for no
1200  		 * subsystems) then they must match.
1201  		 */
1202  		if ((ctx->subsys_mask || ctx->none) &&
1203  		    (ctx->subsys_mask != root->subsys_mask)) {
1204  			if (!name_match)
1205  				continue;
1206  			return -EBUSY;
1207  		}
1208  
1209  		if (root->flags ^ ctx->flags)
1210  			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1211  
1212  		ctx->root = root;
1213  		return 0;
1214  	}
1215  
1216  	/*
1217  	 * No such thing, create a new one.  name= matching without subsys
1218   * specification is allowed for already existing hierarchies but we
1219   * can't create a new one without subsys specification.
1220  	 */
1221  	if (!ctx->subsys_mask && !ctx->none)
1222  		return invalfc(fc, "No subsys list or none specified");
1223  
1224  	/* Hierarchies may only be created in the initial cgroup namespace. */
1225  	if (ctx->ns != &init_cgroup_ns)
1226  		return -EPERM;
1227  
1228  	root = kzalloc(sizeof(*root), GFP_KERNEL);
1229  	if (!root)
1230  		return -ENOMEM;
1231  
1232  	ctx->root = root;
1233  	init_cgroup_root(ctx);
1234  
1235  	ret = cgroup_setup_root(root, ctx->subsys_mask);
1236  	if (!ret)
1237  		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
1238  	else
1239  		cgroup_free_root(root);
1240  
1241  	return ret;
1242  }
1243  
1244  int cgroup1_get_tree(struct fs_context *fc)
1245  {
1246  	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1247  	int ret;
1248  
1249  	/* Check if the caller has permission to mount. */
1250  	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
1251  		return -EPERM;
1252  
1253  	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1254  
1255  	ret = cgroup1_root_to_use(fc);
1256  	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
1257  		ret = 1;	/* restart */
1258  
1259  	cgroup_unlock();
1260  
1261  	if (!ret)
1262  		ret = cgroup_do_get_tree(fc);
1263  
1264  	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
1265  		fc_drop_locked(fc);
1266  		ret = 1;
1267  	}
1268  
1269  	if (unlikely(ret > 0)) {
1270  		msleep(10);
1271  		return restart_syscall();
1272  	}
1273  	return ret;
1274  }
1275  
1276  /**
1277   * task_get_cgroup1 - Acquires the associated cgroup of a task within a
1278   * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
1279   * hierarchy ID.
1280   * @tsk: The target task
1281   * @hierarchy_id: The ID of a cgroup1 hierarchy
1282   *
1283   * On success, the cgroup is returned. On failure, ERR_PTR is returned.
1284   * We limit it to cgroup1 only.
1285   */
1286  struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
1287  {
1288  	struct cgroup *cgrp = ERR_PTR(-ENOENT);
1289  	struct cgroup_root *root;
1290  	unsigned long flags;
1291  
1292  	rcu_read_lock();
1293  	for_each_root(root) {
1294  		/* cgroup1 only */
1295  		if (root == &cgrp_dfl_root)
1296  			continue;
1297  		if (root->hierarchy_id != hierarchy_id)
1298  			continue;
1299  		spin_lock_irqsave(&css_set_lock, flags);
1300  		cgrp = task_cgroup_from_root(tsk, root);
1301  		if (!cgrp || !cgroup_tryget(cgrp))
1302  			cgrp = ERR_PTR(-ENOENT);
1303  		spin_unlock_irqrestore(&css_set_lock, flags);
1304  		break;
1305  	}
1306  	rcu_read_unlock();
1307  	return cgrp;
1308  }
1309  
1310  static int __init cgroup1_wq_init(void)
1311  {
1312  	/*
1313  	 * Used to destroy pidlists; kept separate so that it can serve as
1314  	 * the flush domain.  Cap @max_active to 1 too.
1315  	 */
1316  	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
1317  						    0, 1);
1318  	BUG_ON(!cgroup_pidlist_destroy_wq);
1319  	return 0;
1320  }
1321  core_initcall(cgroup1_wq_init);
1322  
1323  static int __init cgroup_no_v1(char *str)
1324  {
1325  	struct cgroup_subsys *ss;
1326  	char *token;
1327  	int i;
1328  
1329  	while ((token = strsep(&str, ",")) != NULL) {
1330  		if (!*token)
1331  			continue;
1332  
1333  		if (!strcmp(token, "all")) {
1334  			cgroup_no_v1_mask = U16_MAX;
1335  			continue;
1336  		}
1337  
1338  		if (!strcmp(token, "named")) {
1339  			cgroup_no_v1_named = true;
1340  			continue;
1341  		}
1342  
1343  		for_each_subsys(ss, i) {
1344  			if (strcmp(token, ss->name) &&
1345  			    strcmp(token, ss->legacy_name))
1346  				continue;
1347  
1348  			cgroup_no_v1_mask |= 1 << i;
1349  			break;
1350  		}
1351  	}
1352  	return 1;
1353  }
1354  __setup("cgroup_no_v1=", cgroup_no_v1);
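/*
 * Example boot parameters: "cgroup_no_v1=memory,blkio" keeps those two
 * controllers off all v1 hierarchies, "cgroup_no_v1=named" forbids named
 * (name=) v1 mounts, and "cgroup_no_v1=all" blocks every v1 controller.
 */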
1355