1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 
6 #include <linux/delay.h>
7 #include <linux/mm.h>
8 #include <linux/sched/mm.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 
12 #include "logger.h"
13 #include "memory-alloc.h"
14 #include "permassert.h"
15 
16 /*
17  * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which threads
18  * must be careful to not do a memory allocation that does an I/O request. The 'allocating_threads'
19  * thread_registry and its associated methods implement this tracking.
20  */
21 static struct thread_registry allocating_threads;
22 
allocations_allowed(void)23 static inline bool allocations_allowed(void)
24 {
25 	return vdo_lookup_thread(&allocating_threads) != NULL;
26 }
27 
28 /*
29  * Register the current thread as an allocating thread.
30  *
31  * An optional flag location can be supplied indicating whether, at any given point in time, the
32  * threads associated with that flag should be allocating storage. If the flag is false, a message
33  * will be logged.
34  *
35  * If no flag is supplied, the thread is always allowed to allocate storage without complaint.
36  *
37  * @new_thread: registered_thread structure to use for the current thread
38  * @flag_ptr: Location of the allocation-allowed flag
39  */
vdo_register_allocating_thread(struct registered_thread * new_thread,const bool * flag_ptr)40 void vdo_register_allocating_thread(struct registered_thread *new_thread,
41 				    const bool *flag_ptr)
42 {
43 	if (flag_ptr == NULL) {
44 		static const bool allocation_always_allowed = true;
45 
46 		flag_ptr = &allocation_always_allowed;
47 	}
48 
49 	vdo_register_thread(&allocating_threads, new_thread, flag_ptr);
50 }
51 
52 /* Unregister the current thread as an allocating thread. */
vdo_unregister_allocating_thread(void)53 void vdo_unregister_allocating_thread(void)
54 {
55 	vdo_unregister_thread(&allocating_threads);
56 }
57 
58 /*
59  * We track how much memory has been allocated and freed. When we unload the module, we log an
60  * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
61  * freeing is done using this module.
62  *
63  * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
64  * using kmalloc.
65  *
66  * These data structures and methods are used to track the amount of memory used.
67  */
68 
69 /*
70  * We allocate very few large objects, and allocation/deallocation isn't done in a
71  * performance-critical stage for us, so a linked list should be fine.
72  */
73 struct vmalloc_block_info {
74 	void *ptr;
75 	size_t size;
76 	struct vmalloc_block_info *next;
77 };
78 
79 static struct {
80 	spinlock_t lock;
81 	size_t kmalloc_blocks;
82 	size_t kmalloc_bytes;
83 	size_t vmalloc_blocks;
84 	size_t vmalloc_bytes;
85 	size_t peak_bytes;
86 	struct vmalloc_block_info *vmalloc_list;
87 } memory_stats __cacheline_aligned;
88 
update_peak_usage(void)89 static void update_peak_usage(void)
90 {
91 	size_t total_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
92 
93 	if (total_bytes > memory_stats.peak_bytes)
94 		memory_stats.peak_bytes = total_bytes;
95 }
96 
add_kmalloc_block(size_t size)97 static void add_kmalloc_block(size_t size)
98 {
99 	unsigned long flags;
100 
101 	spin_lock_irqsave(&memory_stats.lock, flags);
102 	memory_stats.kmalloc_blocks++;
103 	memory_stats.kmalloc_bytes += size;
104 	update_peak_usage();
105 	spin_unlock_irqrestore(&memory_stats.lock, flags);
106 }
107 
remove_kmalloc_block(size_t size)108 static void remove_kmalloc_block(size_t size)
109 {
110 	unsigned long flags;
111 
112 	spin_lock_irqsave(&memory_stats.lock, flags);
113 	memory_stats.kmalloc_blocks--;
114 	memory_stats.kmalloc_bytes -= size;
115 	spin_unlock_irqrestore(&memory_stats.lock, flags);
116 }
117 
add_vmalloc_block(struct vmalloc_block_info * block)118 static void add_vmalloc_block(struct vmalloc_block_info *block)
119 {
120 	unsigned long flags;
121 
122 	spin_lock_irqsave(&memory_stats.lock, flags);
123 	block->next = memory_stats.vmalloc_list;
124 	memory_stats.vmalloc_list = block;
125 	memory_stats.vmalloc_blocks++;
126 	memory_stats.vmalloc_bytes += block->size;
127 	update_peak_usage();
128 	spin_unlock_irqrestore(&memory_stats.lock, flags);
129 }
130 
remove_vmalloc_block(void * ptr)131 static void remove_vmalloc_block(void *ptr)
132 {
133 	struct vmalloc_block_info *block;
134 	struct vmalloc_block_info **block_ptr;
135 	unsigned long flags;
136 
137 	spin_lock_irqsave(&memory_stats.lock, flags);
138 	for (block_ptr = &memory_stats.vmalloc_list;
139 	     (block = *block_ptr) != NULL;
140 	     block_ptr = &block->next) {
141 		if (block->ptr == ptr) {
142 			*block_ptr = block->next;
143 			memory_stats.vmalloc_blocks--;
144 			memory_stats.vmalloc_bytes -= block->size;
145 			break;
146 		}
147 	}
148 
149 	spin_unlock_irqrestore(&memory_stats.lock, flags);
150 	if (block != NULL)
151 		vdo_free(block);
152 	else
153 		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
154 }
155 
156 /*
157  * Determine whether allocating a memory block should use kmalloc or __vmalloc.
158  *
159  * vmalloc can allocate any integral number of pages.
160  *
161  * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
162  * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
163  * it does this efficiently in a multi CPU environment.
164  *
165  * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
166  * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
167  * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
168  *
169  * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
170  * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
171  * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
172  * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
173  * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
174  *
175  * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
176  * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
177  * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
178  * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
179  * additional 6374 vmalloc calls, which is much less efficient for tracking.
180  *
181  * @size: How many bytes to allocate
182  */
use_kmalloc(size_t size)183 static inline bool use_kmalloc(size_t size)
184 {
185 	return size <= PAGE_SIZE;
186 }
187 
188 /*
189  * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
190  * The memory will be zeroed.
191  *
192  * @size: The size of an object
193  * @align: The required alignment
194  * @what: What is being allocated (for error logging)
195  * @ptr: A pointer to hold the allocated memory
196  *
197  * Return: VDO_SUCCESS or an error code
198  */
vdo_allocate_memory(size_t size,size_t align,const char * what,void * ptr)199 int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
200 {
201 	/*
202 	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
203 	 * procedures that have previously failed if there is some indication that progress has
204 	 * been made elsewhere. It can wait for other tasks to attempt high level approaches to
205 	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
206 	 * still a definite limit to the number of retries, but it is a larger limit than with
207 	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
208 	 * little unused memory. While these allocations do not directly trigger the OOM killer,
209 	 * their failure indicates that the system is likely to need to use the OOM killer soon.
210 	 * The caller must handle failure, but can reasonably do so by failing a higher-level
211 	 * request, or completing it only in a much less efficient manner.
212 	 */
213 	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
214 	unsigned int noio_flags;
215 	bool allocations_restricted = !allocations_allowed();
216 	unsigned long start_time;
217 	void *p = NULL;
218 
219 	if (unlikely(ptr == NULL))
220 		return -EINVAL;
221 
222 	if (size == 0) {
223 		*((void **) ptr) = NULL;
224 		return VDO_SUCCESS;
225 	}
226 
227 	if (allocations_restricted)
228 		noio_flags = memalloc_noio_save();
229 
230 	start_time = jiffies;
231 	if (use_kmalloc(size) && (align < PAGE_SIZE)) {
232 		p = kmalloc(size, gfp_flags | __GFP_NOWARN);
233 		if (p == NULL) {
234 			/*
235 			 * It is possible for kmalloc to fail to allocate memory because there is
236 			 * no page available. A short sleep may allow the page reclaimer to
237 			 * free a page.
238 			 */
239 			fsleep(1000);
240 			p = kmalloc(size, gfp_flags);
241 		}
242 
243 		if (p != NULL)
244 			add_kmalloc_block(ksize(p));
245 	} else {
246 		struct vmalloc_block_info *block;
247 
248 		if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) {
249 			/*
250 			 * It is possible for __vmalloc to fail to allocate memory because there
251 			 * are no pages available. A short sleep may allow the page reclaimer
252 			 * to free enough pages for a small allocation.
253 			 *
254 			 * For larger allocations, the page_alloc code is racing against the page
255 			 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
256 			 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
257 			 * the allocation fails. It is possible that more retries will succeed.
258 			 */
259 			for (;;) {
260 				p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
261 				if (p != NULL)
262 					break;
263 
264 				if (jiffies_to_msecs(jiffies - start_time) > 1000) {
265 					/* Try one more time, logging a failure for this call. */
266 					p = __vmalloc(size, gfp_flags);
267 					break;
268 				}
269 
270 				fsleep(1000);
271 			}
272 
273 			if (p == NULL) {
274 				vdo_free(block);
275 			} else {
276 				block->ptr = p;
277 				block->size = PAGE_ALIGN(size);
278 				add_vmalloc_block(block);
279 			}
280 		}
281 	}
282 
283 	if (allocations_restricted)
284 		memalloc_noio_restore(noio_flags);
285 
286 	if (unlikely(p == NULL)) {
287 		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
288 			      size, what, jiffies_to_msecs(jiffies - start_time));
289 		return -ENOMEM;
290 	}
291 
292 	*((void **) ptr) = p;
293 	return VDO_SUCCESS;
294 }
295 
296 /*
297  * Allocate storage based on memory size, failing immediately if the required memory is not
298  * available. The memory will be zeroed.
299  *
300  * @size: The size of an object.
301  * @what: What is being allocated (for error logging)
302  *
303  * Return: pointer to the allocated memory, or NULL if the required space is not available.
304  */
vdo_allocate_memory_nowait(size_t size,const char * what __maybe_unused)305 void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
306 {
307 	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);
308 
309 	if (p != NULL)
310 		add_kmalloc_block(ksize(p));
311 
312 	return p;
313 }
314 
/*
 * Release memory and remove it from the module's memory statistics. The release path (vfree or
 * kfree) is chosen by asking is_vmalloc_addr() about the address.
 *
 * @ptr: The memory to release; NULL is accepted and ignored
 */
void vdo_free(void *ptr)
{
	if (ptr == NULL)
		return;

	if (!is_vmalloc_addr(ptr)) {
		/* The size must be read before the memory is handed back. */
		remove_kmalloc_block(ksize(ptr));
		kfree(ptr);
		return;
	}

	remove_vmalloc_block(ptr);
	vfree(ptr);
}
327 
328 /*
329  * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
330  * memory. If the new memory is larger than the old memory, the new space will be zeroed.
331  *
332  * @ptr: The memory to reallocate.
333  * @old_size: The old size of the memory
334  * @size: The new size to allocate
335  * @what: What is being allocated (for error logging)
336  * @new_ptr: A pointer to hold the reallocated pointer
337  *
338  * Return: VDO_SUCCESS or an error code
339  */
vdo_reallocate_memory(void * ptr,size_t old_size,size_t size,const char * what,void * new_ptr)340 int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
341 			  void *new_ptr)
342 {
343 	int result;
344 
345 	if (size == 0) {
346 		vdo_free(ptr);
347 		*(void **) new_ptr = NULL;
348 		return VDO_SUCCESS;
349 	}
350 
351 	result = vdo_allocate(size, char, what, new_ptr);
352 	if (result != VDO_SUCCESS)
353 		return result;
354 
355 	if (ptr != NULL) {
356 		if (old_size < size)
357 			size = old_size;
358 
359 		memcpy(*((void **) new_ptr), ptr, size);
360 		vdo_free(ptr);
361 	}
362 
363 	return VDO_SUCCESS;
364 }
365 
vdo_duplicate_string(const char * string,const char * what,char ** new_string)366 int vdo_duplicate_string(const char *string, const char *what, char **new_string)
367 {
368 	int result;
369 	u8 *dup;
370 
371 	result = vdo_allocate(strlen(string) + 1, u8, what, &dup);
372 	if (result != VDO_SUCCESS)
373 		return result;
374 
375 	memcpy(dup, string, strlen(string) + 1);
376 	*new_string = dup;
377 	return VDO_SUCCESS;
378 }
379 
vdo_memory_init(void)380 void vdo_memory_init(void)
381 {
382 	spin_lock_init(&memory_stats.lock);
383 	vdo_initialize_thread_registry(&allocating_threads);
384 }
385 
vdo_memory_exit(void)386 void vdo_memory_exit(void)
387 {
388 	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
389 			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
390 			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
391 	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
392 			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
393 			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
394 	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
395 }
396 
/*
 * Fetch a consistent snapshot of the tracked memory usage.
 *
 * @bytes_used: Set to the current total of kmalloc and vmalloc bytes
 * @peak_bytes_used: Set to the peak total recorded so far
 */
void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
{
	unsigned long irq_flags;
	size_t current_bytes;
	size_t peak_bytes;

	/* Read both values under the lock so they describe the same moment. */
	spin_lock_irqsave(&memory_stats.lock, irq_flags);
	current_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
	peak_bytes = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, irq_flags);

	*bytes_used = current_bytes;
	*peak_bytes_used = peak_bytes;
}
406 
407 /*
408  * Report stats on any allocated memory that we're tracking. Not all allocation types are
409  * guaranteed to be tracked in bytes (e.g., bios).
410  */
vdo_report_memory_usage(void)411 void vdo_report_memory_usage(void)
412 {
413 	unsigned long flags;
414 	u64 kmalloc_blocks;
415 	u64 kmalloc_bytes;
416 	u64 vmalloc_blocks;
417 	u64 vmalloc_bytes;
418 	u64 peak_usage;
419 	u64 total_bytes;
420 
421 	spin_lock_irqsave(&memory_stats.lock, flags);
422 	kmalloc_blocks = memory_stats.kmalloc_blocks;
423 	kmalloc_bytes = memory_stats.kmalloc_bytes;
424 	vmalloc_blocks = memory_stats.vmalloc_blocks;
425 	vmalloc_bytes = memory_stats.vmalloc_bytes;
426 	peak_usage = memory_stats.peak_bytes;
427 	spin_unlock_irqrestore(&memory_stats.lock, flags);
428 	total_bytes = kmalloc_bytes + vmalloc_bytes;
429 	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
430 	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
431 		     (unsigned long long) kmalloc_bytes,
432 		     (unsigned long long) kmalloc_blocks);
433 	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
434 		     (unsigned long long) vmalloc_bytes,
435 		     (unsigned long long) vmalloc_blocks);
436 	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
437 		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
438 }
439