1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /* Common capabilities, needed by capability.o.
3   */
4  
5  #include <linux/capability.h>
6  #include <linux/audit.h>
7  #include <linux/init.h>
8  #include <linux/kernel.h>
9  #include <linux/lsm_hooks.h>
10  #include <linux/file.h>
11  #include <linux/mm.h>
12  #include <linux/mman.h>
13  #include <linux/pagemap.h>
14  #include <linux/swap.h>
15  #include <linux/skbuff.h>
16  #include <linux/netlink.h>
17  #include <linux/ptrace.h>
18  #include <linux/xattr.h>
19  #include <linux/hugetlb.h>
20  #include <linux/mount.h>
21  #include <linux/sched.h>
22  #include <linux/prctl.h>
23  #include <linux/securebits.h>
24  #include <linux/user_namespace.h>
25  #include <linux/binfmts.h>
26  #include <linux/personality.h>
27  #include <linux/mnt_idmapping.h>
28  #include <uapi/linux/lsm.h>
29  
30  /*
31   * If a non-root user executes a setuid-root binary in
32   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
33   * However if fE is also set, then the intent is for only
34   * the file capabilities to be applied, and the setuid-root
35   * bit is left on either to change the uid (plausible) or
36   * to get full privilege on a kernel without file capabilities
37   * support.  So in that case we do not raise capabilities.
38   *
39   * Warn if that happens, once per boot.
40   */
warn_setuid_and_fcaps_mixed(const char * fname)41  static void warn_setuid_and_fcaps_mixed(const char *fname)
42  {
43  	static int warned;
44  	if (!warned) {
45  		printk(KERN_INFO "warning: `%s' has both setuid-root and"
46  			" effective capabilities. Therefore not raising all"
47  			" capabilities.\n", fname);
48  		warned = 1;
49  	}
50  }
51  
52  /**
53   * cap_capable - Determine whether a task has a particular effective capability
54   * @cred: The credentials to use
55   * @targ_ns:  The user namespace in which we need the capability
56   * @cap: The capability to check for
57   * @opts: Bitmask of options defined in include/linux/security.h
58   *
59   * Determine whether the nominated task has the specified capability amongst
60   * its effective set, returning 0 if it does, -ve if it does not.
61   *
62   * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
63   * and has_capability() functions.  That is, it has the reverse semantics:
64   * cap_has_capability() returns 0 when a task has a capability, but the
65   * kernel's capable() and has_capability() returns 1 for this case.
66   */
cap_capable(const struct cred * cred,struct user_namespace * targ_ns,int cap,unsigned int opts)67  int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
68  		int cap, unsigned int opts)
69  {
70  	struct user_namespace *ns = targ_ns;
71  
72  	/* See if cred has the capability in the target user namespace
73  	 * by examining the target user namespace and all of the target
74  	 * user namespace's parents.
75  	 */
76  	for (;;) {
77  		/* Do we have the necessary capabilities? */
78  		if (ns == cred->user_ns)
79  			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
80  
81  		/*
82  		 * If we're already at a lower level than we're looking for,
83  		 * we're done searching.
84  		 */
85  		if (ns->level <= cred->user_ns->level)
86  			return -EPERM;
87  
88  		/*
89  		 * The owner of the user namespace in the parent of the
90  		 * user namespace has all caps.
91  		 */
92  		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
93  			return 0;
94  
95  		/*
96  		 * If you have a capability in a parent user ns, then you have
97  		 * it over all children user namespaces as well.
98  		 */
99  		ns = ns->parent;
100  	}
101  
102  	/* We never get here */
103  }
104  
105  /**
106   * cap_settime - Determine whether the current process may set the system clock
107   * @ts: The time to set
108   * @tz: The timezone to set
109   *
110   * Determine whether the current process may set the system clock and timezone
111   * information, returning 0 if permission granted, -ve if denied.
112   */
cap_settime(const struct timespec64 * ts,const struct timezone * tz)113  int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
114  {
115  	if (!capable(CAP_SYS_TIME))
116  		return -EPERM;
117  	return 0;
118  }
119  
120  /**
121   * cap_ptrace_access_check - Determine whether the current process may access
122   *			   another
123   * @child: The process to be accessed
124   * @mode: The mode of attachment.
125   *
126   * If we are in the same or an ancestor user_ns and have all the target
127   * task's capabilities, then ptrace access is allowed.
128   * If we have the ptrace capability to the target user_ns, then ptrace
129   * access is allowed.
130   * Else denied.
131   *
132   * Determine whether a process may access another, returning 0 if permission
133   * granted, -ve if denied.
134   */
cap_ptrace_access_check(struct task_struct * child,unsigned int mode)135  int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
136  {
137  	int ret = 0;
138  	const struct cred *cred, *child_cred;
139  	const kernel_cap_t *caller_caps;
140  
141  	rcu_read_lock();
142  	cred = current_cred();
143  	child_cred = __task_cred(child);
144  	if (mode & PTRACE_MODE_FSCREDS)
145  		caller_caps = &cred->cap_effective;
146  	else
147  		caller_caps = &cred->cap_permitted;
148  	if (cred->user_ns == child_cred->user_ns &&
149  	    cap_issubset(child_cred->cap_permitted, *caller_caps))
150  		goto out;
151  	if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
152  		goto out;
153  	ret = -EPERM;
154  out:
155  	rcu_read_unlock();
156  	return ret;
157  }
158  
159  /**
160   * cap_ptrace_traceme - Determine whether another process may trace the current
161   * @parent: The task proposed to be the tracer
162   *
163   * If parent is in the same or an ancestor user_ns and has all current's
164   * capabilities, then ptrace access is allowed.
165   * If parent has the ptrace capability to current's user_ns, then ptrace
166   * access is allowed.
167   * Else denied.
168   *
169   * Determine whether the nominated task is permitted to trace the current
170   * process, returning 0 if permission is granted, -ve if denied.
171   */
cap_ptrace_traceme(struct task_struct * parent)172  int cap_ptrace_traceme(struct task_struct *parent)
173  {
174  	int ret = 0;
175  	const struct cred *cred, *child_cred;
176  
177  	rcu_read_lock();
178  	cred = __task_cred(parent);
179  	child_cred = current_cred();
180  	if (cred->user_ns == child_cred->user_ns &&
181  	    cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
182  		goto out;
183  	if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
184  		goto out;
185  	ret = -EPERM;
186  out:
187  	rcu_read_unlock();
188  	return ret;
189  }
190  
191  /**
192   * cap_capget - Retrieve a task's capability sets
193   * @target: The task from which to retrieve the capability sets
194   * @effective: The place to record the effective set
195   * @inheritable: The place to record the inheritable set
196   * @permitted: The place to record the permitted set
197   *
198   * This function retrieves the capabilities of the nominated task and returns
199   * them to the caller.
200   */
cap_capget(const struct task_struct * target,kernel_cap_t * effective,kernel_cap_t * inheritable,kernel_cap_t * permitted)201  int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
202  	       kernel_cap_t *inheritable, kernel_cap_t *permitted)
203  {
204  	const struct cred *cred;
205  
206  	/* Derived from kernel/capability.c:sys_capget. */
207  	rcu_read_lock();
208  	cred = __task_cred(target);
209  	*effective   = cred->cap_effective;
210  	*inheritable = cred->cap_inheritable;
211  	*permitted   = cred->cap_permitted;
212  	rcu_read_unlock();
213  	return 0;
214  }
215  
216  /*
217   * Determine whether the inheritable capabilities are limited to the old
218   * permitted set.  Returns 1 if they are limited, 0 if they are not.
219   */
cap_inh_is_capped(void)220  static inline int cap_inh_is_capped(void)
221  {
222  	/* they are so limited unless the current task has the CAP_SETPCAP
223  	 * capability
224  	 */
225  	if (cap_capable(current_cred(), current_cred()->user_ns,
226  			CAP_SETPCAP, CAP_OPT_NONE) == 0)
227  		return 0;
228  	return 1;
229  }
230  
231  /**
232   * cap_capset - Validate and apply proposed changes to current's capabilities
233   * @new: The proposed new credentials; alterations should be made here
234   * @old: The current task's current credentials
235   * @effective: A pointer to the proposed new effective capabilities set
236   * @inheritable: A pointer to the proposed new inheritable capabilities set
237   * @permitted: A pointer to the proposed new permitted capabilities set
238   *
239   * This function validates and applies a proposed mass change to the current
240   * process's capability sets.  The changes are made to the proposed new
241   * credentials, and assuming no error, will be committed by the caller of LSM.
242   */
cap_capset(struct cred * new,const struct cred * old,const kernel_cap_t * effective,const kernel_cap_t * inheritable,const kernel_cap_t * permitted)243  int cap_capset(struct cred *new,
244  	       const struct cred *old,
245  	       const kernel_cap_t *effective,
246  	       const kernel_cap_t *inheritable,
247  	       const kernel_cap_t *permitted)
248  {
249  	if (cap_inh_is_capped() &&
250  	    !cap_issubset(*inheritable,
251  			  cap_combine(old->cap_inheritable,
252  				      old->cap_permitted)))
253  		/* incapable of using this inheritable set */
254  		return -EPERM;
255  
256  	if (!cap_issubset(*inheritable,
257  			  cap_combine(old->cap_inheritable,
258  				      old->cap_bset)))
259  		/* no new pI capabilities outside bounding set */
260  		return -EPERM;
261  
262  	/* verify restrictions on target's new Permitted set */
263  	if (!cap_issubset(*permitted, old->cap_permitted))
264  		return -EPERM;
265  
266  	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
267  	if (!cap_issubset(*effective, *permitted))
268  		return -EPERM;
269  
270  	new->cap_effective   = *effective;
271  	new->cap_inheritable = *inheritable;
272  	new->cap_permitted   = *permitted;
273  
274  	/*
275  	 * Mask off ambient bits that are no longer both permitted and
276  	 * inheritable.
277  	 */
278  	new->cap_ambient = cap_intersect(new->cap_ambient,
279  					 cap_intersect(*permitted,
280  						       *inheritable));
281  	if (WARN_ON(!cap_ambient_invariant_ok(new)))
282  		return -EINVAL;
283  	return 0;
284  }
285  
286  /**
287   * cap_inode_need_killpriv - Determine if inode change affects privileges
288   * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
289   *
290   * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
291   * affects the security markings on that inode, and if it is, should
292   * inode_killpriv() be invoked or the change rejected.
293   *
294   * Return: 1 if security.capability has a value, meaning inode_killpriv()
295   * is required, 0 otherwise, meaning inode_killpriv() is not required.
296   */
cap_inode_need_killpriv(struct dentry * dentry)297  int cap_inode_need_killpriv(struct dentry *dentry)
298  {
299  	struct inode *inode = d_backing_inode(dentry);
300  	int error;
301  
302  	error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
303  	return error > 0;
304  }
305  
306  /**
307   * cap_inode_killpriv - Erase the security markings on an inode
308   *
309   * @idmap:	idmap of the mount the inode was found from
310   * @dentry:	The inode/dentry to alter
311   *
312   * Erase the privilege-enhancing security markings on an inode.
313   *
314   * If the inode has been found through an idmapped mount the idmap of
315   * the vfsmount must be passed through @idmap. This function will then
316   * take care to map the inode according to @idmap before checking
317   * permissions. On non-idmapped mounts or if permission checking is to be
318   * performed on the raw inode simply pass @nop_mnt_idmap.
319   *
320   * Return: 0 if successful, -ve on error.
321   */
cap_inode_killpriv(struct mnt_idmap * idmap,struct dentry * dentry)322  int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
323  {
324  	int error;
325  
326  	error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);
327  	if (error == -EOPNOTSUPP)
328  		error = 0;
329  	return error;
330  }
331  
rootid_owns_currentns(vfsuid_t rootvfsuid)332  static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
333  {
334  	struct user_namespace *ns;
335  	kuid_t kroot;
336  
337  	if (!vfsuid_valid(rootvfsuid))
338  		return false;
339  
340  	kroot = vfsuid_into_kuid(rootvfsuid);
341  	for (ns = current_user_ns();; ns = ns->parent) {
342  		if (from_kuid(ns, kroot) == 0)
343  			return true;
344  		if (ns == &init_user_ns)
345  			break;
346  	}
347  
348  	return false;
349  }
350  
/* Return @m with the VFS_CAP_FLAGS_EFFECTIVE bit(s) cleared. */
static __u32 sansflags(__u32 m)
{
	const __u32 keep = ~VFS_CAP_FLAGS_EFFECTIVE;

	return m & keep;
}
355  
is_v2header(int size,const struct vfs_cap_data * cap)356  static bool is_v2header(int size, const struct vfs_cap_data *cap)
357  {
358  	if (size != XATTR_CAPS_SZ_2)
359  		return false;
360  	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
361  }
362  
is_v3header(int size,const struct vfs_cap_data * cap)363  static bool is_v3header(int size, const struct vfs_cap_data *cap)
364  {
365  	if (size != XATTR_CAPS_SZ_3)
366  		return false;
367  	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
368  }
369  
370  /*
371   * getsecurity: We are called for security.* before any attempt to read the
372   * xattr from the inode itself.
373   *
374   * This gives us a chance to read the on-disk value and convert it.  If we
375   * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
376   *
377   * Note we are not called by vfs_getxattr_alloc(), but that is only called
378   * by the integrity subsystem, which really wants the unconverted values -
379   * so that's good.
380   */
cap_inode_getsecurity(struct mnt_idmap * idmap,struct inode * inode,const char * name,void ** buffer,bool alloc)381  int cap_inode_getsecurity(struct mnt_idmap *idmap,
382  			  struct inode *inode, const char *name, void **buffer,
383  			  bool alloc)
384  {
385  	int size;
386  	kuid_t kroot;
387  	vfsuid_t vfsroot;
388  	u32 nsmagic, magic;
389  	uid_t root, mappedroot;
390  	char *tmpbuf = NULL;
391  	struct vfs_cap_data *cap;
392  	struct vfs_ns_cap_data *nscap = NULL;
393  	struct dentry *dentry;
394  	struct user_namespace *fs_ns;
395  
396  	if (strcmp(name, "capability") != 0)
397  		return -EOPNOTSUPP;
398  
399  	dentry = d_find_any_alias(inode);
400  	if (!dentry)
401  		return -EINVAL;
402  	size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
403  				  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
404  	dput(dentry);
405  	/* gcc11 complains if we don't check for !tmpbuf */
406  	if (size < 0 || !tmpbuf)
407  		goto out_free;
408  
409  	fs_ns = inode->i_sb->s_user_ns;
410  	cap = (struct vfs_cap_data *) tmpbuf;
411  	if (is_v2header(size, cap)) {
412  		root = 0;
413  	} else if (is_v3header(size, cap)) {
414  		nscap = (struct vfs_ns_cap_data *) tmpbuf;
415  		root = le32_to_cpu(nscap->rootid);
416  	} else {
417  		size = -EINVAL;
418  		goto out_free;
419  	}
420  
421  	kroot = make_kuid(fs_ns, root);
422  
423  	/* If this is an idmapped mount shift the kuid. */
424  	vfsroot = make_vfsuid(idmap, fs_ns, kroot);
425  
426  	/* If the root kuid maps to a valid uid in current ns, then return
427  	 * this as a nscap. */
428  	mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
429  	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
430  		size = sizeof(struct vfs_ns_cap_data);
431  		if (alloc) {
432  			if (!nscap) {
433  				/* v2 -> v3 conversion */
434  				nscap = kzalloc(size, GFP_ATOMIC);
435  				if (!nscap) {
436  					size = -ENOMEM;
437  					goto out_free;
438  				}
439  				nsmagic = VFS_CAP_REVISION_3;
440  				magic = le32_to_cpu(cap->magic_etc);
441  				if (magic & VFS_CAP_FLAGS_EFFECTIVE)
442  					nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
443  				memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
444  				nscap->magic_etc = cpu_to_le32(nsmagic);
445  			} else {
446  				/* use allocated v3 buffer */
447  				tmpbuf = NULL;
448  			}
449  			nscap->rootid = cpu_to_le32(mappedroot);
450  			*buffer = nscap;
451  		}
452  		goto out_free;
453  	}
454  
455  	if (!rootid_owns_currentns(vfsroot)) {
456  		size = -EOVERFLOW;
457  		goto out_free;
458  	}
459  
460  	/* This comes from a parent namespace.  Return as a v2 capability */
461  	size = sizeof(struct vfs_cap_data);
462  	if (alloc) {
463  		if (nscap) {
464  			/* v3 -> v2 conversion */
465  			cap = kzalloc(size, GFP_ATOMIC);
466  			if (!cap) {
467  				size = -ENOMEM;
468  				goto out_free;
469  			}
470  			magic = VFS_CAP_REVISION_2;
471  			nsmagic = le32_to_cpu(nscap->magic_etc);
472  			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
473  				magic |= VFS_CAP_FLAGS_EFFECTIVE;
474  			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
475  			cap->magic_etc = cpu_to_le32(magic);
476  		} else {
477  			/* use unconverted v2 */
478  			tmpbuf = NULL;
479  		}
480  		*buffer = cap;
481  	}
482  out_free:
483  	kfree(tmpbuf);
484  	return size;
485  }
486  
487  /**
488   * rootid_from_xattr - translate root uid of vfs caps
489   *
490   * @value:	vfs caps value which may be modified by this function
491   * @size:	size of @ivalue
492   * @task_ns:	user namespace of the caller
493   */
rootid_from_xattr(const void * value,size_t size,struct user_namespace * task_ns)494  static vfsuid_t rootid_from_xattr(const void *value, size_t size,
495  				  struct user_namespace *task_ns)
496  {
497  	const struct vfs_ns_cap_data *nscap = value;
498  	uid_t rootid = 0;
499  
500  	if (size == XATTR_CAPS_SZ_3)
501  		rootid = le32_to_cpu(nscap->rootid);
502  
503  	return VFSUIDT_INIT(make_kuid(task_ns, rootid));
504  }
505  
/*
 * validheader - does @cap start with a recognised v2 or v3 header?
 *
 * @size is the length of the xattr value that @cap points at.
 *
 * is_v2header() and is_v3header() take the size as an int.  Refuse anything
 * that would not survive that narrowing, so that an oversized @size cannot
 * wrap round onto one of the valid xattr lengths.
 */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
	if (size > INT_MAX)
		return false;

	return is_v2header(size, cap) || is_v3header(size, cap);
}
510  
511  /**
512   * cap_convert_nscap - check vfs caps
513   *
514   * @idmap:	idmap of the mount the inode was found from
515   * @dentry:	used to retrieve inode to check permissions on
516   * @ivalue:	vfs caps value which may be modified by this function
517   * @size:	size of @ivalue
518   *
519   * User requested a write of security.capability.  If needed, update the
520   * xattr to change from v2 to v3, or to fixup the v3 rootid.
521   *
522   * If the inode has been found through an idmapped mount the idmap of
523   * the vfsmount must be passed through @idmap. This function will then
524   * take care to map the inode according to @idmap before checking
525   * permissions. On non-idmapped mounts or if permission checking is to be
526   * performed on the raw inode simply pass @nop_mnt_idmap.
527   *
528   * Return: On success, return the new size; on error, return < 0.
529   */
cap_convert_nscap(struct mnt_idmap * idmap,struct dentry * dentry,const void ** ivalue,size_t size)530  int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
531  		      const void **ivalue, size_t size)
532  {
533  	struct vfs_ns_cap_data *nscap;
534  	uid_t nsrootid;
535  	const struct vfs_cap_data *cap = *ivalue;
536  	__u32 magic, nsmagic;
537  	struct inode *inode = d_backing_inode(dentry);
538  	struct user_namespace *task_ns = current_user_ns(),
539  		*fs_ns = inode->i_sb->s_user_ns;
540  	kuid_t rootid;
541  	vfsuid_t vfsrootid;
542  	size_t newsize;
543  
544  	if (!*ivalue)
545  		return -EINVAL;
546  	if (!validheader(size, cap))
547  		return -EINVAL;
548  	if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
549  		return -EPERM;
550  	if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap))
551  		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
552  			/* user is privileged, just write the v2 */
553  			return size;
554  
555  	vfsrootid = rootid_from_xattr(*ivalue, size, task_ns);
556  	if (!vfsuid_valid(vfsrootid))
557  		return -EINVAL;
558  
559  	rootid = from_vfsuid(idmap, fs_ns, vfsrootid);
560  	if (!uid_valid(rootid))
561  		return -EINVAL;
562  
563  	nsrootid = from_kuid(fs_ns, rootid);
564  	if (nsrootid == -1)
565  		return -EINVAL;
566  
567  	newsize = sizeof(struct vfs_ns_cap_data);
568  	nscap = kmalloc(newsize, GFP_ATOMIC);
569  	if (!nscap)
570  		return -ENOMEM;
571  	nscap->rootid = cpu_to_le32(nsrootid);
572  	nsmagic = VFS_CAP_REVISION_3;
573  	magic = le32_to_cpu(cap->magic_etc);
574  	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
575  		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
576  	nscap->magic_etc = cpu_to_le32(nsmagic);
577  	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
578  
579  	*ivalue = nscap;
580  	return newsize;
581  }
582  
583  /*
584   * Calculate the new process capability sets from the capability sets attached
585   * to a file.
586   */
bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data * caps,struct linux_binprm * bprm,bool * effective,bool * has_fcap)587  static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
588  					  struct linux_binprm *bprm,
589  					  bool *effective,
590  					  bool *has_fcap)
591  {
592  	struct cred *new = bprm->cred;
593  	int ret = 0;
594  
595  	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
596  		*effective = true;
597  
598  	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
599  		*has_fcap = true;
600  
601  	/*
602  	 * pP' = (X & fP) | (pI & fI)
603  	 * The addition of pA' is handled later.
604  	 */
605  	new->cap_permitted.val =
606  		(new->cap_bset.val & caps->permitted.val) |
607  		(new->cap_inheritable.val & caps->inheritable.val);
608  
609  	if (caps->permitted.val & ~new->cap_permitted.val)
610  		/* insufficient to execute correctly */
611  		ret = -EPERM;
612  
613  	/*
614  	 * For legacy apps, with no internal support for recognizing they
615  	 * do not have enough capabilities, we return an error if they are
616  	 * missing some "forced" (aka file-permitted) capabilities.
617  	 */
618  	return *effective ? ret : 0;
619  }
620  
621  /**
622   * get_vfs_caps_from_disk - retrieve vfs caps from disk
623   *
624   * @idmap:	idmap of the mount the inode was found from
625   * @dentry:	dentry from which @inode is retrieved
626   * @cpu_caps:	vfs capabilities
627   *
628   * Extract the on-exec-apply capability sets for an executable file.
629   *
630   * If the inode has been found through an idmapped mount the idmap of
631   * the vfsmount must be passed through @idmap. This function will then
632   * take care to map the inode according to @idmap before checking
633   * permissions. On non-idmapped mounts or if permission checking is to be
634   * performed on the raw inode simply pass @nop_mnt_idmap.
635   */
get_vfs_caps_from_disk(struct mnt_idmap * idmap,const struct dentry * dentry,struct cpu_vfs_cap_data * cpu_caps)636  int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
637  			   const struct dentry *dentry,
638  			   struct cpu_vfs_cap_data *cpu_caps)
639  {
640  	struct inode *inode = d_backing_inode(dentry);
641  	__u32 magic_etc;
642  	int size;
643  	struct vfs_ns_cap_data data, *nscaps = &data;
644  	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
645  	kuid_t rootkuid;
646  	vfsuid_t rootvfsuid;
647  	struct user_namespace *fs_ns;
648  
649  	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
650  
651  	if (!inode)
652  		return -ENODATA;
653  
654  	fs_ns = inode->i_sb->s_user_ns;
655  	size = __vfs_getxattr((struct dentry *)dentry, inode,
656  			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
657  	if (size == -ENODATA || size == -EOPNOTSUPP)
658  		/* no data, that's ok */
659  		return -ENODATA;
660  
661  	if (size < 0)
662  		return size;
663  
664  	if (size < sizeof(magic_etc))
665  		return -EINVAL;
666  
667  	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
668  
669  	rootkuid = make_kuid(fs_ns, 0);
670  	switch (magic_etc & VFS_CAP_REVISION_MASK) {
671  	case VFS_CAP_REVISION_1:
672  		if (size != XATTR_CAPS_SZ_1)
673  			return -EINVAL;
674  		break;
675  	case VFS_CAP_REVISION_2:
676  		if (size != XATTR_CAPS_SZ_2)
677  			return -EINVAL;
678  		break;
679  	case VFS_CAP_REVISION_3:
680  		if (size != XATTR_CAPS_SZ_3)
681  			return -EINVAL;
682  		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
683  		break;
684  
685  	default:
686  		return -EINVAL;
687  	}
688  
689  	rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
690  	if (!vfsuid_valid(rootvfsuid))
691  		return -ENODATA;
692  
693  	/* Limit the caps to the mounter of the filesystem
694  	 * or the more limited uid specified in the xattr.
695  	 */
696  	if (!rootid_owns_currentns(rootvfsuid))
697  		return -ENODATA;
698  
699  	cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
700  	cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
701  
702  	/*
703  	 * Rev1 had just a single 32-bit word, later expanded
704  	 * to a second one for the high bits
705  	 */
706  	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
707  		cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
708  		cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
709  	}
710  
711  	cpu_caps->permitted.val &= CAP_VALID_MASK;
712  	cpu_caps->inheritable.val &= CAP_VALID_MASK;
713  
714  	cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);
715  
716  	return 0;
717  }
718  
719  /*
720   * Attempt to get the on-exec apply capability sets for an executable file from
721   * its xattrs and, if present, apply them to the proposed credentials being
722   * constructed by execve().
723   */
get_file_caps(struct linux_binprm * bprm,const struct file * file,bool * effective,bool * has_fcap)724  static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
725  			 bool *effective, bool *has_fcap)
726  {
727  	int rc = 0;
728  	struct cpu_vfs_cap_data vcaps;
729  
730  	cap_clear(bprm->cred->cap_permitted);
731  
732  	if (!file_caps_enabled)
733  		return 0;
734  
735  	if (!mnt_may_suid(file->f_path.mnt))
736  		return 0;
737  
738  	/*
739  	 * This check is redundant with mnt_may_suid() but is kept to make
740  	 * explicit that capability bits are limited to s_user_ns and its
741  	 * descendants.
742  	 */
743  	if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
744  		return 0;
745  
746  	rc = get_vfs_caps_from_disk(file_mnt_idmap(file),
747  				    file->f_path.dentry, &vcaps);
748  	if (rc < 0) {
749  		if (rc == -EINVAL)
750  			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
751  					bprm->filename);
752  		else if (rc == -ENODATA)
753  			rc = 0;
754  		goto out;
755  	}
756  
757  	rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
758  
759  out:
760  	if (rc)
761  		cap_clear(bprm->cred->cap_permitted);
762  
763  	return rc;
764  }
765  
root_privileged(void)766  static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
767  
__is_real(kuid_t uid,struct cred * cred)768  static inline bool __is_real(kuid_t uid, struct cred *cred)
769  { return uid_eq(cred->uid, uid); }
770  
__is_eff(kuid_t uid,struct cred * cred)771  static inline bool __is_eff(kuid_t uid, struct cred *cred)
772  { return uid_eq(cred->euid, uid); }
773  
__is_suid(kuid_t uid,struct cred * cred)774  static inline bool __is_suid(kuid_t uid, struct cred *cred)
775  { return !__is_real(uid, cred) && __is_eff(uid, cred); }
776  
777  /*
778   * handle_privileged_root - Handle case of privileged root
779   * @bprm: The execution parameters, including the proposed creds
780   * @has_fcap: Are any file capabilities set?
781   * @effective: Do we have effective root privilege?
782   * @root_uid: This namespace' root UID WRT initial USER namespace
783   *
784   * Handle the case where root is privileged and hasn't been neutered by
785   * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
786   * set UID root and nothing is changed.  If we are root, cap_permitted is
787   * updated.  If we have become set UID root, the effective bit is set.
788   */
handle_privileged_root(struct linux_binprm * bprm,bool has_fcap,bool * effective,kuid_t root_uid)789  static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
790  				   bool *effective, kuid_t root_uid)
791  {
792  	const struct cred *old = current_cred();
793  	struct cred *new = bprm->cred;
794  
795  	if (!root_privileged())
796  		return;
797  	/*
798  	 * If the legacy file capability is set, then don't set privs
799  	 * for a setuid root binary run by a non-root user.  Do set it
800  	 * for a root user just to cause least surprise to an admin.
801  	 */
802  	if (has_fcap && __is_suid(root_uid, new)) {
803  		warn_setuid_and_fcaps_mixed(bprm->filename);
804  		return;
805  	}
806  	/*
807  	 * To support inheritance of root-permissions and suid-root
808  	 * executables under compatibility mode, we override the
809  	 * capability sets for the file.
810  	 */
811  	if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
812  		/* pP' = (cap_bset & ~0) | (pI & ~0) */
813  		new->cap_permitted = cap_combine(old->cap_bset,
814  						 old->cap_inheritable);
815  	}
816  	/*
817  	 * If only the real uid is 0, we do not set the effective bit.
818  	 */
819  	if (__is_eff(root_uid, new))
820  		*effective = true;
821  }
822  
/*
 * Helpers for comparing capability sets held in struct cred pointers.
 * Pointer arguments and the whole expansion are parenthesised so that the
 * macros stay correct when handed an expression (e.g. &cred) or when used
 * inside a larger expression.
 *
 * __cap_gained(field, target, source):
 *	target's cap_<field> holds something source's cap_<field> lacks.
 * __cap_grew(target, source, cred):
 *	within one cred, cap_<target> holds something cap_<source> lacks.
 * __cap_full(field, cred):
 *	cred's cap_<field> contains all of CAP_FULL_SET.
 */
#define __cap_gained(field, target, source) \
	(!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
	(!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
	(cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
829  
__is_setuid(struct cred * new,const struct cred * old)830  static inline bool __is_setuid(struct cred *new, const struct cred *old)
831  { return !uid_eq(new->euid, old->uid); }
832  
__is_setgid(struct cred * new,const struct cred * old)833  static inline bool __is_setgid(struct cred *new, const struct cred *old)
834  { return !gid_eq(new->egid, old->gid); }
835  
836  /*
837   * 1) Audit candidate if current->cap_effective is set
838   *
839   * We do not bother to audit if 3 things are true:
840   *   1) cap_effective has all caps
841   *   2) we became root *OR* are were already root
842   *   3) root is supposed to have all caps (SECURE_NOROOT)
843   * Since this is just a normal root execing a process.
844   *
845   * Number 1 above might fail if you don't have a full bset, but I think
846   * that is interesting information to audit.
847   *
848   * A number of other conditions require logging:
849   * 2) something prevented setuid root getting all caps
850   * 3) non-setuid root gets fcaps
851   * 4) non-setuid root gets ambient
852   */
nonroot_raised_pE(struct cred * new,const struct cred * old,kuid_t root,bool has_fcap)853  static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
854  				     kuid_t root, bool has_fcap)
855  {
856  	bool ret = false;
857  
858  	if ((__cap_grew(effective, ambient, new) &&
859  	     !(__cap_full(effective, new) &&
860  	       (__is_eff(root, new) || __is_real(root, new)) &&
861  	       root_privileged())) ||
862  	    (root_privileged() &&
863  	     __is_suid(root, new) &&
864  	     !__cap_full(effective, new)) ||
865  	    (!__is_setuid(new, old) &&
866  	     ((has_fcap &&
867  	       __cap_gained(permitted, new, old)) ||
868  	      __cap_gained(ambient, new, old))))
869  
870  		ret = true;
871  
872  	return ret;
873  }
874  
/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * Besides the creds, this may also set PER_CLEAR_ON_SETID in
 * @bprm->per_clear and set @bprm->secureexec.
 *
 * Return: 0 if successful, -ve on error (-EPERM if the ambient invariant
 * check fails, or whatever get_file_caps() / audit_log_bprm_fcaps() report).
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
	/* Process setpcap binaries and capabilities for uid 0 */
	const struct cred *old = current_cred();
	struct cred *new = bprm->cred;
	bool effective = false, has_fcap = false, is_setid;
	int ret;
	kuid_t root_uid;

	/* Refuse to build on current creds that already break the invariant. */
	if (WARN_ON(!cap_ambient_invariant_ok(old)))
		return -EPERM;

	/* get_file_caps() is handed &effective and &has_fcap to fill in. */
	ret = get_file_caps(bprm, file, &effective, &has_fcap);
	if (ret < 0)
		return ret;

	/* uid 0 as seen from the user namespace of the proposed creds */
	root_uid = make_kuid(new->user_ns, 0);

	/* May adjust @effective (passed by pointer) for the root case. */
	handle_privileged_root(bprm, has_fcap, &effective, root_uid);

	/* if we have fs caps, clear dangerous personality flags */
	if (__cap_gained(permitted, new, old))
		bprm->per_clear |= PER_CLEAR_ON_SETID;

	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
	 * credentials unless they have the appropriate permit.
	 *
	 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
	 */
	is_setid = __is_setuid(new, old) || __is_setgid(new, old);

	if ((is_setid || __cap_gained(permitted, new, old)) &&
	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
	     !ptracer_capable(current, new->user_ns))) {
		/* downgrade; they get no more than they had, and maybe less */
		if (!ns_capable(new->user_ns, CAP_SETUID) ||
		    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
			new->euid = new->uid;
			new->egid = new->gid;
		}
		new->cap_permitted = cap_intersect(new->cap_permitted,
						   old->cap_permitted);
	}

	/* Saved and filesystem ids follow the (possibly downgraded) e[ug]id. */
	new->suid = new->fsuid = new->euid;
	new->sgid = new->fsgid = new->egid;

	/* File caps or setid cancels ambient. */
	if (has_fcap || is_setid)
		cap_clear(new->cap_ambient);

	/*
	 * Now that we've computed pA', update pP' to give:
	 *   pP' = (X & fP) | (pI & fI) | pA'
	 */
	new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

	/*
	 * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
	 * this is the same as pE' = (fE ? pP' : 0) | pA'.
	 */
	if (effective)
		new->cap_effective = new->cap_permitted;
	else
		new->cap_effective = new->cap_ambient;

	/* The freshly computed sets must satisfy the ambient invariant. */
	if (WARN_ON(!cap_ambient_invariant_ok(new)))
		return -EPERM;

	/* Emit an audit record when nonroot_raised_pE() flags this exec. */
	if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
		ret = audit_log_bprm_fcaps(bprm, new, old);
		if (ret < 0)
			return ret;
	}

	/* SECURE_KEEP_CAPS is always cleared in the proposed creds here. */
	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

	/* Invariant is checked once more before returning success. */
	if (WARN_ON(!cap_ambient_invariant_ok(new)))
		return -EPERM;

	/* Check for privilege-elevated exec. */
	if (is_setid ||
	    (!__is_real(root_uid, new) &&
	     (effective ||
	      __cap_grew(permitted, ambient, new))))
		bprm->secureexec = 1;

	return 0;
}
975  
976  /**
977   * cap_inode_setxattr - Determine whether an xattr may be altered
978   * @dentry: The inode/dentry being altered
979   * @name: The name of the xattr to be changed
980   * @value: The value that the xattr will be changed to
981   * @size: The size of value
982   * @flags: The replacement flag
983   *
984   * Determine whether an xattr may be altered or set on an inode, returning 0 if
985   * permission is granted, -ve if denied.
986   *
987   * This is used to make sure security xattrs don't get updated or set by those
988   * who aren't privileged to do so.
989   */
cap_inode_setxattr(struct dentry * dentry,const char * name,const void * value,size_t size,int flags)990  int cap_inode_setxattr(struct dentry *dentry, const char *name,
991  		       const void *value, size_t size, int flags)
992  {
993  	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
994  
995  	/* Ignore non-security xattrs */
996  	if (strncmp(name, XATTR_SECURITY_PREFIX,
997  			XATTR_SECURITY_PREFIX_LEN) != 0)
998  		return 0;
999  
1000  	/*
1001  	 * For XATTR_NAME_CAPS the check will be done in
1002  	 * cap_convert_nscap(), called by setxattr()
1003  	 */
1004  	if (strcmp(name, XATTR_NAME_CAPS) == 0)
1005  		return 0;
1006  
1007  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1008  		return -EPERM;
1009  	return 0;
1010  }
1011  
1012  /**
1013   * cap_inode_removexattr - Determine whether an xattr may be removed
1014   *
1015   * @idmap:	idmap of the mount the inode was found from
1016   * @dentry:	The inode/dentry being altered
1017   * @name:	The name of the xattr to be changed
1018   *
1019   * Determine whether an xattr may be removed from an inode, returning 0 if
1020   * permission is granted, -ve if denied.
1021   *
1022   * If the inode has been found through an idmapped mount the idmap of
1023   * the vfsmount must be passed through @idmap. This function will then
1024   * take care to map the inode according to @idmap before checking
1025   * permissions. On non-idmapped mounts or if permission checking is to be
1026   * performed on the raw inode simply pass @nop_mnt_idmap.
1027   *
1028   * This is used to make sure security xattrs don't get removed by those who
1029   * aren't privileged to remove them.
1030   */
cap_inode_removexattr(struct mnt_idmap * idmap,struct dentry * dentry,const char * name)1031  int cap_inode_removexattr(struct mnt_idmap *idmap,
1032  			  struct dentry *dentry, const char *name)
1033  {
1034  	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
1035  
1036  	/* Ignore non-security xattrs */
1037  	if (strncmp(name, XATTR_SECURITY_PREFIX,
1038  			XATTR_SECURITY_PREFIX_LEN) != 0)
1039  		return 0;
1040  
1041  	if (strcmp(name, XATTR_NAME_CAPS) == 0) {
1042  		/* security.capability gets namespaced */
1043  		struct inode *inode = d_backing_inode(dentry);
1044  		if (!inode)
1045  			return -EINVAL;
1046  		if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
1047  			return -EPERM;
1048  		return 0;
1049  	}
1050  
1051  	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1052  		return -EPERM;
1053  	return 0;
1054  }
1055  
1056  /*
1057   * cap_emulate_setxuid() fixes the effective / permitted capabilities of
1058   * a process after a call to setuid, setreuid, or setresuid.
1059   *
1060   *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1061   *  {r,e,s}uid != 0, the permitted and effective capabilities are
1062   *  cleared.
1063   *
1064   *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1065   *  capabilities of the process are cleared.
1066   *
1067   *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1068   *  capabilities are set to the permitted capabilities.
1069   *
1070   *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1071   *  never happen.
1072   *
1073   *  -astor
1074   *
1075   * cevans - New behaviour, Oct '99
1076   * A process may, via prctl(), elect to keep its capabilities when it
1077   * calls setuid() and switches away from uid==0. Both permitted and
1078   * effective sets will be retained.
1079   * Without this change, it was impossible for a daemon to drop only some
1080   * of its privilege. The call to setuid(!=0) would drop all privileges!
1081   * Keeping uid 0 is not an option because uid 0 owns too many vital
1082   * files..
1083   * Thanks to Olaf Kirch and Peter Benie for spotting this.
1084   */
cap_emulate_setxuid(struct cred * new,const struct cred * old)1085  static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
1086  {
1087  	kuid_t root_uid = make_kuid(old->user_ns, 0);
1088  
1089  	if ((uid_eq(old->uid, root_uid) ||
1090  	     uid_eq(old->euid, root_uid) ||
1091  	     uid_eq(old->suid, root_uid)) &&
1092  	    (!uid_eq(new->uid, root_uid) &&
1093  	     !uid_eq(new->euid, root_uid) &&
1094  	     !uid_eq(new->suid, root_uid))) {
1095  		if (!issecure(SECURE_KEEP_CAPS)) {
1096  			cap_clear(new->cap_permitted);
1097  			cap_clear(new->cap_effective);
1098  		}
1099  
1100  		/*
1101  		 * Pre-ambient programs expect setresuid to nonroot followed
1102  		 * by exec to drop capabilities.  We should make sure that
1103  		 * this remains the case.
1104  		 */
1105  		cap_clear(new->cap_ambient);
1106  	}
1107  	if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
1108  		cap_clear(new->cap_effective);
1109  	if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
1110  		new->cap_effective = new->cap_permitted;
1111  }
1112  
1113  /**
1114   * cap_task_fix_setuid - Fix up the results of setuid() call
1115   * @new: The proposed credentials
1116   * @old: The current task's current credentials
1117   * @flags: Indications of what has changed
1118   *
1119   * Fix up the results of setuid() call before the credential changes are
1120   * actually applied.
1121   *
1122   * Return: 0 to grant the changes, -ve to deny them.
1123   */
cap_task_fix_setuid(struct cred * new,const struct cred * old,int flags)1124  int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
1125  {
1126  	switch (flags) {
1127  	case LSM_SETID_RE:
1128  	case LSM_SETID_ID:
1129  	case LSM_SETID_RES:
1130  		/* juggle the capabilities to follow [RES]UID changes unless
1131  		 * otherwise suppressed */
1132  		if (!issecure(SECURE_NO_SETUID_FIXUP))
1133  			cap_emulate_setxuid(new, old);
1134  		break;
1135  
1136  	case LSM_SETID_FS:
1137  		/* juggle the capabilities to follow FSUID changes, unless
1138  		 * otherwise suppressed
1139  		 *
1140  		 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1141  		 *          if not, we might be a bit too harsh here.
1142  		 */
1143  		if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1144  			kuid_t root_uid = make_kuid(old->user_ns, 0);
1145  			if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
1146  				new->cap_effective =
1147  					cap_drop_fs_set(new->cap_effective);
1148  
1149  			if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
1150  				new->cap_effective =
1151  					cap_raise_fs_set(new->cap_effective,
1152  							 new->cap_permitted);
1153  		}
1154  		break;
1155  
1156  	default:
1157  		return -EINVAL;
1158  	}
1159  
1160  	return 0;
1161  }
1162  
1163  /*
1164   * Rationale: code calling task_setscheduler, task_setioprio, and
1165   * task_setnice, assumes that
1166   *   . if capable(cap_sys_nice), then those actions should be allowed
1167   *   . if not capable(cap_sys_nice), but acting on your own processes,
1168   *   	then those actions should be allowed
1169   * This is insufficient now since you can call code without suid, but
1170   * yet with increased caps.
1171   * So we check for increased caps on the target process.
1172   */
cap_safe_nice(struct task_struct * p)1173  static int cap_safe_nice(struct task_struct *p)
1174  {
1175  	int is_subset, ret = 0;
1176  
1177  	rcu_read_lock();
1178  	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1179  				 current_cred()->cap_permitted);
1180  	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1181  		ret = -EPERM;
1182  	rcu_read_unlock();
1183  
1184  	return ret;
1185  }
1186  
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
1200  
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
1215  
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1230  
1231  /*
1232   * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
1233   * the current task's bounding set.  Returns 0 on success, -ve on error.
1234   */
cap_prctl_drop(unsigned long cap)1235  static int cap_prctl_drop(unsigned long cap)
1236  {
1237  	struct cred *new;
1238  
1239  	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1240  		return -EPERM;
1241  	if (!cap_valid(cap))
1242  		return -EINVAL;
1243  
1244  	new = prepare_creds();
1245  	if (!new)
1246  		return -ENOMEM;
1247  	cap_lower(new->cap_bset, cap);
1248  	return commit_creds(new);
1249  }
1250  
/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Options handled here: PR_CAPBSET_READ, PR_CAPBSET_DROP, PR_SET_SECUREBITS,
 * PR_GET_SECUREBITS, PR_GET_KEEPCAPS, PR_SET_KEEPCAPS and PR_CAP_AMBIENT.
 * Options that modify state do so by preparing a copy of the current
 * credentials, editing it and committing it.
 *
 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
		   unsigned long arg4, unsigned long arg5)
{
	const struct cred *old = current_cred();
	struct cred *new;

	switch (option) {
	/* Report (0 or 1) whether capability @arg2 is in the bounding set. */
	case PR_CAPBSET_READ:
		if (!cap_valid(arg2))
			return -EINVAL;
		return !!cap_raised(old->cap_bset, arg2);

	/* Remove capability @arg2 from the bounding set. */
	case PR_CAPBSET_DROP:
		return cap_prctl_drop(arg2);

	/*
	 * The next four prctl's remain to assist with transitioning a
	 * system from legacy UID=0 based privilege (when filesystem
	 * capabilities are not in use) to a system using filesystem
	 * capabilities only - as the POSIX.1e draft intended.
	 *
	 * Note:
	 *
	 *  PR_SET_SECUREBITS =
	 *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
	 *    | issecure_mask(SECURE_NOROOT)
	 *    | issecure_mask(SECURE_NOROOT_LOCKED)
	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
	 *
	 * will ensure that the current process and all of its
	 * children will be locked into a pure
	 * capability-based-privilege environment.
	 */
	case PR_SET_SECUREBITS:
		if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
		     & (old->securebits ^ arg2))			/*[1]*/
		    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
		    || (cap_capable(current_cred(),
				    current_cred()->user_ns,
				    CAP_SETPCAP,
				    CAP_OPT_NONE) != 0)			/*[4]*/
			/*
			 * [1] no changing of bits that are locked
			 * [2] no unlocking of locks
			 * [3] no setting of unsupported bits
			 * [4] doing anything requires privilege (go read about
			 *     the "sendmail capabilities bug")
			 */
		    )
			/* at least one of the rules [1]-[4] above was violated */
			return -EPERM;

		new = prepare_creds();
		if (!new)
			return -ENOMEM;
		new->securebits = arg2;
		return commit_creds(new);

	/* Return the current securebits value as the prctl result. */
	case PR_GET_SECUREBITS:
		return old->securebits;

	/* Report (0 or 1) whether SECURE_KEEP_CAPS is currently set. */
	case PR_GET_KEEPCAPS:
		return !!issecure(SECURE_KEEP_CAPS);

	/* Set (@arg2 == 1) or clear (@arg2 == 0) SECURE_KEEP_CAPS. */
	case PR_SET_KEEPCAPS:
		if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
			return -EINVAL;
		if (issecure(SECURE_KEEP_CAPS_LOCKED))
			return -EPERM;

		new = prepare_creds();
		if (!new)
			return -ENOMEM;
		if (arg2)
			new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
		else
			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
		return commit_creds(new);

	/* Ambient set operations; @arg2 selects the sub-command. */
	case PR_CAP_AMBIENT:
		if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
			/* CLEAR_ALL takes no further arguments. */
			if (arg3 | arg4 | arg5)
				return -EINVAL;

			new = prepare_creds();
			if (!new)
				return -ENOMEM;
			cap_clear(new->cap_ambient);
			return commit_creds(new);
		}

		/*
		 * Remaining sub-commands act on capability @arg3, which must
		 * be valid; @arg4 and @arg5 must be zero.
		 */
		if (((!cap_valid(arg3)) | arg4 | arg5))
			return -EINVAL;

		if (arg2 == PR_CAP_AMBIENT_IS_SET) {
			return !!cap_raised(current_cred()->cap_ambient, arg3);
		} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
			   arg2 != PR_CAP_AMBIENT_LOWER) {
			return -EINVAL;
		} else {
			/*
			 * Raising requires the capability to be in both the
			 * permitted and inheritable sets, and
			 * SECURE_NO_CAP_AMBIENT_RAISE to be clear.
			 */
			if (arg2 == PR_CAP_AMBIENT_RAISE &&
			    (!cap_raised(current_cred()->cap_permitted, arg3) ||
			     !cap_raised(current_cred()->cap_inheritable,
					 arg3) ||
			     issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
				return -EPERM;

			new = prepare_creds();
			if (!new)
				return -ENOMEM;
			if (arg2 == PR_CAP_AMBIENT_RAISE)
				cap_raise(new->cap_ambient, arg3);
			else
				cap_lower(new->cap_ambient, arg3);
			return commit_creds(new);
		}

	default:
		/* No functionality available - continue with default */
		return -ENOSYS;
	}
}
1390  
/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made (not examined here)
 * @pages: The size of the mapping (not examined here)
 *
 * Determine whether the allocation of a new virtual mapping by the current
 * task is permitted.  The answer is simply the result of checking the
 * current credentials for CAP_SYS_ADMIN in the initial user namespace, with
 * CAP_OPT_NOAUDIT passed to cap_capable().
 *
 * Return: 0 if permission granted, negative error code if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
	return cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
			   CAP_OPT_NOAUDIT);
}
1406  
1407  /**
1408   * cap_mmap_addr - check if able to map given addr
1409   * @addr: address attempting to be mapped
1410   *
1411   * If the process is attempting to map memory below dac_mmap_min_addr they need
1412   * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
1413   * capability security module.
1414   *
1415   * Return: 0 if this mapping should be allowed or -EPERM if not.
1416   */
cap_mmap_addr(unsigned long addr)1417  int cap_mmap_addr(unsigned long addr)
1418  {
1419  	int ret = 0;
1420  
1421  	if (addr < dac_mmap_min_addr) {
1422  		ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
1423  				  CAP_OPT_NONE);
1424  		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
1425  		if (ret == 0)
1426  			current->flags |= PF_SUPERPRIV;
1427  	}
1428  	return ret;
1429  }
1430  
/*
 * cap_mmap_file - mmap_file hook for the capability module
 *
 * None of the arguments are examined: the function unconditionally
 * returns 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	return 0;
}
1436  
1437  #ifdef CONFIG_SECURITY
1438  
/* Name and id under which the hooks below are registered. */
static const struct lsm_id capability_lsmid = {
	.name = "capability",
	.id = LSM_ID_CAPABILITY,
};
1443  
/*
 * Hook table for the capability module: each entry binds an LSM hook name
 * to its cap_*() implementation.  Passed to security_add_hooks() by
 * capability_init().
 */
static struct security_hook_list capability_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(capable, cap_capable),
	LSM_HOOK_INIT(settime, cap_settime),
	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
	LSM_HOOK_INIT(capget, cap_capget),
	LSM_HOOK_INIT(capset, cap_capset),
	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};
1464  
/*
 * capability_init - register the capability hook table
 *
 * Hands capability_hooks and capability_lsmid to security_add_hooks().
 * Always returns 0.
 */
static int __init capability_init(void)
{
	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
			   &capability_lsmid);
	return 0;
}
1471  
/*
 * LSM descriptor for the capability module: requests LSM_ORDER_FIRST
 * ordering and names capability_init() as its initialiser.
 */
DEFINE_LSM(capability) = {
	.name = "capability",
	.order = LSM_ORDER_FIRST,
	.init = capability_init,
};
1477  
1478  #endif /* CONFIG_SECURITY */
1479