// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

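/* Mark the VMA as sealed; the caller must hold the mmap write lock. */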
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

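/*
 * These madvise() behaviors can discard or zero page contents: for a
 * sealed, read-only anonymous mapping this would be the equivalent of
 * a memset(0) that bypasses the write protection.
 */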
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}

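/*
 * Return true for a private anonymous mapping that the caller cannot
 * write to, either because PROT_WRITE is absent or because the pkey
 * state forbids write access.
 */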
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or the pkey state does not permit writes.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if a vma is allowed to be modified by madvise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}

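/*
 * Split or merge the VMA as needed via vma_modify_flags(), then set
 * VM_SEALED on the resulting VMA covering [start, end).
 */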
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing to the range [start, end); the range must already
 * have been validated by check_mm_seal().
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM cases,
	 * so vma cannot be NULL here; the same holds for the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}

/*
 * mseal(2) seals the VMA's metadata against
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range given by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *   Below VMA operations are blocked after sealing.
 *   1> Unmapping, moving to another location, and shrinking the size,
 *	via munmap() and mremap(): these can leave an empty space in the
 *	address range, which can then be filled by a VMA with a new set
 *	of attributes.
 *   2> Moving or expanding a different VMA into the current location,
 *	via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(). This does not appear to pose any
 *	specific risk to sealed VMAs, but is included anyway because
 *	the use case is unclear. In any case, users can rely on
 *	merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *	for anonymous memory, when users don't have write permission to
 *	the memory. Those behaviors can alter region contents by
 *	discarding pages, effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   Address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 * Note:
 *  user can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error).
 *
 *  unseal() is not supported.
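 *
 *  Usage sketch (userspace, illustrative; libc may not provide a
 *  wrapper, in which case the raw syscall number can be used):
 *
 *	void *p = mmap(NULL, len, PROT_READ,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (syscall(__NR_mseal, p, len, 0))
 *		err(1, "mseal");
 *
 *  After this, munmap(p, len), mprotect(p, len, PROT_WRITE), etc.
 *  on the range fail with EPERM.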
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check only, so that an error in the input address
	 * range (e.g. ENOMEM) never results in partial sealing.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: apply sealing. This should succeed unless
	 * vma_modify_flags() fails, e.g. on a merge/split error or the
	 * process reaching the maximum number of supported VMAs;
	 * those cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}