1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  Copyright (C) 2006 IBM Corporation
4   *
5   *  Author: Serge Hallyn <serue@us.ibm.com>
6   *
7   *  Jun 2006 - namespaces support
8   *             OpenVZ, SWsoft Inc.
9   *             Pavel Emelianov <xemul@openvz.org>
10   */
11  
12  #include <linux/slab.h>
13  #include <linux/export.h>
14  #include <linux/nsproxy.h>
15  #include <linux/init_task.h>
16  #include <linux/mnt_namespace.h>
17  #include <linux/utsname.h>
18  #include <linux/pid_namespace.h>
19  #include <net/net_namespace.h>
20  #include <linux/ipc_namespace.h>
21  #include <linux/time_namespace.h>
22  #include <linux/fs_struct.h>
23  #include <linux/proc_fs.h>
24  #include <linux/proc_ns.h>
25  #include <linux/file.h>
26  #include <linux/syscalls.h>
27  #include <linux/cgroup.h>
28  #include <linux/perf_event.h>
29  
30  static struct kmem_cache *nsproxy_cachep;
31  
32  struct nsproxy init_nsproxy = {
33  	.count			= REFCOUNT_INIT(1),
34  	.uts_ns			= &init_uts_ns,
35  #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
36  	.ipc_ns			= &init_ipc_ns,
37  #endif
38  	.mnt_ns			= NULL,
39  	.pid_ns_for_children	= &init_pid_ns,
40  #ifdef CONFIG_NET
41  	.net_ns			= &init_net,
42  #endif
43  #ifdef CONFIG_CGROUPS
44  	.cgroup_ns		= &init_cgroup_ns,
45  #endif
46  #ifdef CONFIG_TIME_NS
47  	.time_ns		= &init_time_ns,
48  	.time_ns_for_children	= &init_time_ns,
49  #endif
50  };
51  
create_nsproxy(void)52  static inline struct nsproxy *create_nsproxy(void)
53  {
54  	struct nsproxy *nsproxy;
55  
56  	nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
57  	if (nsproxy)
58  		refcount_set(&nsproxy->count, 1);
59  	return nsproxy;
60  }
61  
62  /*
63   * Create new nsproxy and all of its the associated namespaces.
64   * Return the newly created nsproxy.  Do not attach this to the task,
65   * leave it to the caller to do proper locking and attach it to task.
66   */
create_new_namespaces(unsigned long flags,struct task_struct * tsk,struct user_namespace * user_ns,struct fs_struct * new_fs)67  static struct nsproxy *create_new_namespaces(unsigned long flags,
68  	struct task_struct *tsk, struct user_namespace *user_ns,
69  	struct fs_struct *new_fs)
70  {
71  	struct nsproxy *new_nsp;
72  	int err;
73  
74  	new_nsp = create_nsproxy();
75  	if (!new_nsp)
76  		return ERR_PTR(-ENOMEM);
77  
78  	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
79  	if (IS_ERR(new_nsp->mnt_ns)) {
80  		err = PTR_ERR(new_nsp->mnt_ns);
81  		goto out_ns;
82  	}
83  
84  	new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
85  	if (IS_ERR(new_nsp->uts_ns)) {
86  		err = PTR_ERR(new_nsp->uts_ns);
87  		goto out_uts;
88  	}
89  
90  	new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
91  	if (IS_ERR(new_nsp->ipc_ns)) {
92  		err = PTR_ERR(new_nsp->ipc_ns);
93  		goto out_ipc;
94  	}
95  
96  	new_nsp->pid_ns_for_children =
97  		copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
98  	if (IS_ERR(new_nsp->pid_ns_for_children)) {
99  		err = PTR_ERR(new_nsp->pid_ns_for_children);
100  		goto out_pid;
101  	}
102  
103  	new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
104  					    tsk->nsproxy->cgroup_ns);
105  	if (IS_ERR(new_nsp->cgroup_ns)) {
106  		err = PTR_ERR(new_nsp->cgroup_ns);
107  		goto out_cgroup;
108  	}
109  
110  	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
111  	if (IS_ERR(new_nsp->net_ns)) {
112  		err = PTR_ERR(new_nsp->net_ns);
113  		goto out_net;
114  	}
115  
116  	new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
117  					tsk->nsproxy->time_ns_for_children);
118  	if (IS_ERR(new_nsp->time_ns_for_children)) {
119  		err = PTR_ERR(new_nsp->time_ns_for_children);
120  		goto out_time;
121  	}
122  	new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
123  
124  	return new_nsp;
125  
126  out_time:
127  	put_net(new_nsp->net_ns);
128  out_net:
129  	put_cgroup_ns(new_nsp->cgroup_ns);
130  out_cgroup:
131  	if (new_nsp->pid_ns_for_children)
132  		put_pid_ns(new_nsp->pid_ns_for_children);
133  out_pid:
134  	if (new_nsp->ipc_ns)
135  		put_ipc_ns(new_nsp->ipc_ns);
136  out_ipc:
137  	if (new_nsp->uts_ns)
138  		put_uts_ns(new_nsp->uts_ns);
139  out_uts:
140  	if (new_nsp->mnt_ns)
141  		put_mnt_ns(new_nsp->mnt_ns);
142  out_ns:
143  	kmem_cache_free(nsproxy_cachep, new_nsp);
144  	return ERR_PTR(err);
145  }
146  
147  /*
148   * called from clone.  This now handles copy for nsproxy and all
149   * namespaces therein.
150   */
copy_namespaces(unsigned long flags,struct task_struct * tsk)151  int copy_namespaces(unsigned long flags, struct task_struct *tsk)
152  {
153  	struct nsproxy *old_ns = tsk->nsproxy;
154  	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
155  	struct nsproxy *new_ns;
156  
157  	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
158  			      CLONE_NEWPID | CLONE_NEWNET |
159  			      CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
160  		if ((flags & CLONE_VM) ||
161  		    likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
162  			get_nsproxy(old_ns);
163  			return 0;
164  		}
165  	} else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
166  		return -EPERM;
167  
168  	/*
169  	 * CLONE_NEWIPC must detach from the undolist: after switching
170  	 * to a new ipc namespace, the semaphore arrays from the old
171  	 * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
172  	 * means share undolist with parent, so we must forbid using
173  	 * it along with CLONE_NEWIPC.
174  	 */
175  	if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
176  		(CLONE_NEWIPC | CLONE_SYSVSEM))
177  		return -EINVAL;
178  
179  	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
180  	if (IS_ERR(new_ns))
181  		return  PTR_ERR(new_ns);
182  
183  	if ((flags & CLONE_VM) == 0)
184  		timens_on_fork(new_ns, tsk);
185  
186  	tsk->nsproxy = new_ns;
187  	return 0;
188  }
189  
free_nsproxy(struct nsproxy * ns)190  void free_nsproxy(struct nsproxy *ns)
191  {
192  	if (ns->mnt_ns)
193  		put_mnt_ns(ns->mnt_ns);
194  	if (ns->uts_ns)
195  		put_uts_ns(ns->uts_ns);
196  	if (ns->ipc_ns)
197  		put_ipc_ns(ns->ipc_ns);
198  	if (ns->pid_ns_for_children)
199  		put_pid_ns(ns->pid_ns_for_children);
200  	if (ns->time_ns)
201  		put_time_ns(ns->time_ns);
202  	if (ns->time_ns_for_children)
203  		put_time_ns(ns->time_ns_for_children);
204  	put_cgroup_ns(ns->cgroup_ns);
205  	put_net(ns->net_ns);
206  	kmem_cache_free(nsproxy_cachep, ns);
207  }
208  
209  /*
210   * Called from unshare. Unshare all the namespaces part of nsproxy.
211   * On success, returns the new nsproxy.
212   */
unshare_nsproxy_namespaces(unsigned long unshare_flags,struct nsproxy ** new_nsp,struct cred * new_cred,struct fs_struct * new_fs)213  int unshare_nsproxy_namespaces(unsigned long unshare_flags,
214  	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
215  {
216  	struct user_namespace *user_ns;
217  	int err = 0;
218  
219  	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
220  			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
221  			       CLONE_NEWTIME)))
222  		return 0;
223  
224  	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
225  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
226  		return -EPERM;
227  
228  	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
229  					 new_fs ? new_fs : current->fs);
230  	if (IS_ERR(*new_nsp)) {
231  		err = PTR_ERR(*new_nsp);
232  		goto out;
233  	}
234  
235  out:
236  	return err;
237  }
238  
switch_task_namespaces(struct task_struct * p,struct nsproxy * new)239  void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
240  {
241  	struct nsproxy *ns;
242  
243  	might_sleep();
244  
245  	task_lock(p);
246  	ns = p->nsproxy;
247  	p->nsproxy = new;
248  	task_unlock(p);
249  
250  	if (ns)
251  		put_nsproxy(ns);
252  }
253  
exit_task_namespaces(struct task_struct * p)254  void exit_task_namespaces(struct task_struct *p)
255  {
256  	switch_task_namespaces(p, NULL);
257  }
258  
exec_task_namespaces(void)259  int exec_task_namespaces(void)
260  {
261  	struct task_struct *tsk = current;
262  	struct nsproxy *new;
263  
264  	if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
265  		return 0;
266  
267  	new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
268  	if (IS_ERR(new))
269  		return PTR_ERR(new);
270  
271  	timens_on_fork(new, tsk);
272  	switch_task_namespaces(tsk, new);
273  	return 0;
274  }
275  
/*
 * Check the flags passed to setns() together with a pidfd.
 *
 * @flags must be a non-empty combination of CLONE_NEW* bits, and every
 * requested namespace type must be supported by this kernel configuration.
 * The mount namespace has no configuration switch here and is always
 * accepted.
 *
 * Returns 0 if @flags is acceptable, -EINVAL otherwise.
 */
static int check_setns_flags(unsigned long flags)
{
	unsigned long supported = CLONE_NEWNS;

#ifdef CONFIG_USER_NS
	supported |= CLONE_NEWUSER;
#endif
#ifdef CONFIG_PID_NS
	supported |= CLONE_NEWPID;
#endif
#ifdef CONFIG_UTS_NS
	supported |= CLONE_NEWUTS;
#endif
#ifdef CONFIG_IPC_NS
	supported |= CLONE_NEWIPC;
#endif
#ifdef CONFIG_CGROUPS
	supported |= CLONE_NEWCGROUP;
#endif
#ifdef CONFIG_NET_NS
	supported |= CLONE_NEWNET;
#endif
#ifdef CONFIG_TIME_NS
	supported |= CLONE_NEWTIME;
#endif

	/* Reject an empty request and any bit outside the supported set. */
	if (!flags || (flags & ~supported))
		return -EINVAL;

	return 0;
}
314  
put_nsset(struct nsset * nsset)315  static void put_nsset(struct nsset *nsset)
316  {
317  	unsigned flags = nsset->flags;
318  
319  	if (flags & CLONE_NEWUSER)
320  		put_cred(nsset_cred(nsset));
321  	/*
322  	 * We only created a temporary copy if we attached to more than just
323  	 * the mount namespace.
324  	 */
325  	if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
326  		free_fs_struct(nsset->fs);
327  	if (nsset->nsproxy)
328  		free_nsproxy(nsset->nsproxy);
329  }
330  
prepare_nsset(unsigned flags,struct nsset * nsset)331  static int prepare_nsset(unsigned flags, struct nsset *nsset)
332  {
333  	struct task_struct *me = current;
334  
335  	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
336  	if (IS_ERR(nsset->nsproxy))
337  		return PTR_ERR(nsset->nsproxy);
338  
339  	if (flags & CLONE_NEWUSER)
340  		nsset->cred = prepare_creds();
341  	else
342  		nsset->cred = current_cred();
343  	if (!nsset->cred)
344  		goto out;
345  
346  	/* Only create a temporary copy of fs_struct if we really need to. */
347  	if (flags == CLONE_NEWNS) {
348  		nsset->fs = me->fs;
349  	} else if (flags & CLONE_NEWNS) {
350  		nsset->fs = copy_fs_struct(me->fs);
351  		if (!nsset->fs)
352  			goto out;
353  	}
354  
355  	nsset->flags = flags;
356  	return 0;
357  
358  out:
359  	put_nsset(nsset);
360  	return -ENOMEM;
361  }
362  
validate_ns(struct nsset * nsset,struct ns_common * ns)363  static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
364  {
365  	return ns->ops->install(nsset, ns);
366  }
367  
368  /*
369   * This is the inverse operation to unshare().
370   * Ordering is equivalent to the standard ordering used everywhere else
371   * during unshare and process creation. The switch to the new set of
372   * namespaces occurs at the point of no return after installation of
373   * all requested namespaces was successful in commit_nsset().
374   */
validate_nsset(struct nsset * nsset,struct pid * pid)375  static int validate_nsset(struct nsset *nsset, struct pid *pid)
376  {
377  	int ret = 0;
378  	unsigned flags = nsset->flags;
379  	struct user_namespace *user_ns = NULL;
380  	struct pid_namespace *pid_ns = NULL;
381  	struct nsproxy *nsp;
382  	struct task_struct *tsk;
383  
384  	/* Take a "snapshot" of the target task's namespaces. */
385  	rcu_read_lock();
386  	tsk = pid_task(pid, PIDTYPE_PID);
387  	if (!tsk) {
388  		rcu_read_unlock();
389  		return -ESRCH;
390  	}
391  
392  	if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
393  		rcu_read_unlock();
394  		return -EPERM;
395  	}
396  
397  	task_lock(tsk);
398  	nsp = tsk->nsproxy;
399  	if (nsp)
400  		get_nsproxy(nsp);
401  	task_unlock(tsk);
402  	if (!nsp) {
403  		rcu_read_unlock();
404  		return -ESRCH;
405  	}
406  
407  #ifdef CONFIG_PID_NS
408  	if (flags & CLONE_NEWPID) {
409  		pid_ns = task_active_pid_ns(tsk);
410  		if (unlikely(!pid_ns)) {
411  			rcu_read_unlock();
412  			ret = -ESRCH;
413  			goto out;
414  		}
415  		get_pid_ns(pid_ns);
416  	}
417  #endif
418  
419  #ifdef CONFIG_USER_NS
420  	if (flags & CLONE_NEWUSER)
421  		user_ns = get_user_ns(__task_cred(tsk)->user_ns);
422  #endif
423  	rcu_read_unlock();
424  
425  	/*
426  	 * Install requested namespaces. The caller will have
427  	 * verified earlier that the requested namespaces are
428  	 * supported on this kernel. We don't report errors here
429  	 * if a namespace is requested that isn't supported.
430  	 */
431  #ifdef CONFIG_USER_NS
432  	if (flags & CLONE_NEWUSER) {
433  		ret = validate_ns(nsset, &user_ns->ns);
434  		if (ret)
435  			goto out;
436  	}
437  #endif
438  
439  	if (flags & CLONE_NEWNS) {
440  		ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
441  		if (ret)
442  			goto out;
443  	}
444  
445  #ifdef CONFIG_UTS_NS
446  	if (flags & CLONE_NEWUTS) {
447  		ret = validate_ns(nsset, &nsp->uts_ns->ns);
448  		if (ret)
449  			goto out;
450  	}
451  #endif
452  
453  #ifdef CONFIG_IPC_NS
454  	if (flags & CLONE_NEWIPC) {
455  		ret = validate_ns(nsset, &nsp->ipc_ns->ns);
456  		if (ret)
457  			goto out;
458  	}
459  #endif
460  
461  #ifdef CONFIG_PID_NS
462  	if (flags & CLONE_NEWPID) {
463  		ret = validate_ns(nsset, &pid_ns->ns);
464  		if (ret)
465  			goto out;
466  	}
467  #endif
468  
469  #ifdef CONFIG_CGROUPS
470  	if (flags & CLONE_NEWCGROUP) {
471  		ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
472  		if (ret)
473  			goto out;
474  	}
475  #endif
476  
477  #ifdef CONFIG_NET_NS
478  	if (flags & CLONE_NEWNET) {
479  		ret = validate_ns(nsset, &nsp->net_ns->ns);
480  		if (ret)
481  			goto out;
482  	}
483  #endif
484  
485  #ifdef CONFIG_TIME_NS
486  	if (flags & CLONE_NEWTIME) {
487  		ret = validate_ns(nsset, &nsp->time_ns->ns);
488  		if (ret)
489  			goto out;
490  	}
491  #endif
492  
493  out:
494  	if (pid_ns)
495  		put_pid_ns(pid_ns);
496  	if (nsp)
497  		put_nsproxy(nsp);
498  	put_user_ns(user_ns);
499  
500  	return ret;
501  }
502  
503  /*
504   * This is the point of no return. There are just a few namespaces
505   * that do some actual work here and it's sufficiently minimal that
506   * a separate ns_common operation seems unnecessary for now.
507   * Unshare is doing the same thing. If we'll end up needing to do
508   * more in a given namespace or a helper here is ultimately not
509   * exported anymore a simple commit handler for each namespace
510   * should be added to ns_common.
511   */
commit_nsset(struct nsset * nsset)512  static void commit_nsset(struct nsset *nsset)
513  {
514  	unsigned flags = nsset->flags;
515  	struct task_struct *me = current;
516  
517  #ifdef CONFIG_USER_NS
518  	if (flags & CLONE_NEWUSER) {
519  		/* transfer ownership */
520  		commit_creds(nsset_cred(nsset));
521  		nsset->cred = NULL;
522  	}
523  #endif
524  
525  	/* We only need to commit if we have used a temporary fs_struct. */
526  	if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
527  		set_fs_root(me->fs, &nsset->fs->root);
528  		set_fs_pwd(me->fs, &nsset->fs->pwd);
529  	}
530  
531  #ifdef CONFIG_IPC_NS
532  	if (flags & CLONE_NEWIPC)
533  		exit_sem(me);
534  #endif
535  
536  #ifdef CONFIG_TIME_NS
537  	if (flags & CLONE_NEWTIME)
538  		timens_commit(me, nsset->nsproxy->time_ns);
539  #endif
540  
541  	/* transfer ownership */
542  	switch_task_namespaces(me, nsset->nsproxy);
543  	nsset->nsproxy = NULL;
544  }
545  
/*
 * setns() system call: move the calling task into other namespaces.
 *
 * @fd:    either a namespace file (as recognised by proc_ns_file()) or a
 *         pidfd.
 * @flags: for a namespace file, 0 or the type of that namespace; for a
 *         pidfd, a set of CLONE_NEW* bits accepted by check_setns_flags().
 *
 * Returns 0 on success, -EBADF for an invalid @fd, -EINVAL for an
 * unsuitable file or flags, or the error from preparing or validating the
 * namespace set.
 */
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
	struct fd f = fdget(fd);
	struct nsset nsset = {};
	struct ns_common *ns = NULL;
	int err = 0;

	if (!fd_file(f))
		return -EBADF;

	if (proc_ns_file(fd_file(f))) {
		ns = get_proc_ns(file_inode(fd_file(f)));
		if (flags && flags != ns->ops->type)
			err = -EINVAL;
		/* From here on the flags describe the namespace behind fd. */
		flags = ns->ops->type;
	} else if (IS_ERR(pidfd_pid(fd_file(f)))) {
		err = -EINVAL;
	} else {
		err = check_setns_flags(flags);
	}
	if (err)
		goto out_fdput;

	err = prepare_nsset(flags, &nsset);
	if (err)
		goto out_fdput;

	if (proc_ns_file(fd_file(f)))
		err = validate_ns(&nsset, ns);
	else
		err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
	if (err)
		goto out_put_nsset;

	commit_nsset(&nsset);
	perf_event_namespaces(current);

out_put_nsset:
	put_nsset(&nsset);
out_fdput:
	fdput(f);
	return err;
}
586  
nsproxy_cache_init(void)587  int __init nsproxy_cache_init(void)
588  {
589  	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
590  	return 0;
591  }
592