1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <unistd.h>
6 #include <stdio.h>
7 #include <signal.h>
8 #include <sys/sysinfo.h>
9 #include <string.h>
10 #include <sys/wait.h>
11 #include <sys/mman.h>
12 
13 #include "../kselftest.h"
14 #include "cgroup_util.h"
15 
/*
 * Read one unsigned decimal number from the start of the file at @path.
 *
 * On success the parsed number is stored in *@value and 0 is returned.
 * Returns -1 if the file cannot be opened or does not begin with a number.
 */
static int read_int(const char *path, size_t *value)
{
	FILE *file;
	int ret = 0;

	file = fopen(path, "r");
	if (!file)
		return -1;
	/* %zu is the conversion that matches a size_t destination. */
	if (fscanf(file, "%zu", value) != 1)
		ret = -1;
	fclose(file);
	return ret;
}
29 
/*
 * Write @value, followed by a newline, to /proc/sys/vm/min_free_kbytes.
 *
 * Returns the (positive) character count reported by fprintf() on success,
 * or -1 if the file cannot be opened or the write fails.
 */
static int set_min_free_kb(size_t value)
{
	FILE *file;
	int ret;

	file = fopen("/proc/sys/vm/min_free_kbytes", "w");
	if (!file)
		return -1;
	/* %zu is the conversion that matches a size_t argument. */
	ret = fprintf(file, "%zu\n", value);
	/*
	 * stdio buffers the output, so the data may only reach the file when
	 * fclose() flushes the stream; a failure there must not be reported
	 * as success. fclose() is evaluated first so the stream is always
	 * released.
	 */
	if (fclose(file) != 0 || ret < 0)
		return -1;
	return ret;
}
42 
/*
 * Fetch the number stored in /proc/sys/vm/min_free_kbytes into *@value.
 * The return value is whatever read_int() reports.
 */
static int read_min_free_kb(size_t *value)
{
	static const char min_free_path[] = "/proc/sys/vm/min_free_kbytes";

	return read_int(min_free_path, value);
}
47 
/*
 * Fetch the number stored in the zswap debugfs file "stored_pages" into
 * *@value. The return value is whatever read_int() reports.
 */
static int get_zswap_stored_pages(size_t *value)
{
	static const char stored_pages_path[] =
		"/sys/kernel/debug/zswap/stored_pages";

	return read_int(stored_pages_path, value);
}
52 
/*
 * Look up the "zswpwb" entry in @cg's memory.stat and return the value
 * reported by cg_read_key_long().
 */
static long get_cg_wb_count(const char *cg)
{
	long wb_count = cg_read_key_long(cg, "memory.stat", "zswpwb");

	return wb_count;
}
57 
/*
 * Look up the "zswpout " entry in @cgroup's memory.stat and return the value
 * reported by cg_read_key_long().
 */
static long get_zswpout(const char *cgroup)
{
	long zswpout = cg_read_key_long(cgroup, "memory.stat", "zswpout ");

	return zswpout;
}
62 
/*
 * Allocate a buffer, write to it at a fixed stride, then read it back.
 *
 * @cgroup: unused.
 * @arg:    the allocation size in bytes, smuggled through the pointer value.
 *
 * Returns 0 if every byte written reads back unchanged, -1 if the allocation
 * fails or a byte differs.
 */
static int allocate_and_read_bytes(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	char *mem = malloc(size);
	int ret = 0;

	if (!mem)
		return -1;
	/* Index with size_t: an int would overflow for sizes above INT_MAX. */
	for (size_t i = 0; i < size; i += 4095)
		mem[i] = 'a';

	/* Go through the allocated memory to (z)swap in and out pages */
	for (size_t i = 0; i < size; i += 4095) {
		if (mem[i] != 'a')
			ret = -1;
	}

	free(mem);
	return ret;
}
83 
/*
 * Allocate a buffer, write to it at a fixed stride, and free it again.
 *
 * @cgroup: unused.
 * @arg:    the allocation size in bytes, smuggled through the pointer value.
 *
 * Returns 0 on success, -1 if the allocation fails.
 */
static int allocate_bytes(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	char *mem = malloc(size);

	if (!mem)
		return -1;
	/* Index with size_t: an int would overflow for sizes above INT_MAX. */
	for (size_t i = 0; i < size; i += 4095)
		mem[i] = 'a';
	free(mem);
	return 0;
}
96 
/*
 * Create the cgroup @name below @root and write "1M" to its memory.max.
 *
 * Returns the string obtained from cg_name() on success; the callers in this
 * file release it with free(). Returns NULL if any step fails, after
 * destroying the cgroup again when only the memory.max write failed.
 */
static char *setup_test_group_1M(const char *root, const char *name)
{
	char *path = cg_name(root, name);

	if (!path)
		return NULL;

	if (cg_create(path) == 0) {
		if (cg_write(path, "memory.max", "1M") == 0)
			return path;
		/* The limit could not be applied: remove the new group. */
		cg_destroy(path);
	}

	free(path);
	return NULL;
}
114 
115 /*
116  * Sanity test to check that pages are written into zswap.
117  */
test_zswap_usage(const char * root)118 static int test_zswap_usage(const char *root)
119 {
120 	long zswpout_before, zswpout_after;
121 	int ret = KSFT_FAIL;
122 	char *test_group;
123 
124 	test_group = cg_name(root, "no_shrink_test");
125 	if (!test_group)
126 		goto out;
127 	if (cg_create(test_group))
128 		goto out;
129 	if (cg_write(test_group, "memory.max", "1M"))
130 		goto out;
131 
132 	zswpout_before = get_zswpout(test_group);
133 	if (zswpout_before < 0) {
134 		ksft_print_msg("Failed to get zswpout\n");
135 		goto out;
136 	}
137 
138 	/* Allocate more than memory.max to push memory into zswap */
139 	if (cg_run(test_group, allocate_bytes, (void *)MB(4)))
140 		goto out;
141 
142 	/* Verify that pages come into zswap */
143 	zswpout_after = get_zswpout(test_group);
144 	if (zswpout_after <= zswpout_before) {
145 		ksft_print_msg("zswpout does not increase after test program\n");
146 		goto out;
147 	}
148 	ret = KSFT_PASS;
149 
150 out:
151 	cg_destroy(test_group);
152 	free(test_group);
153 	return ret;
154 }
155 
156 /*
157  * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
158  * the cgroup.
159  */
test_swapin_nozswap(const char * root)160 static int test_swapin_nozswap(const char *root)
161 {
162 	int ret = KSFT_FAIL;
163 	char *test_group;
164 	long swap_peak, zswpout;
165 
166 	test_group = cg_name(root, "no_zswap_test");
167 	if (!test_group)
168 		goto out;
169 	if (cg_create(test_group))
170 		goto out;
171 	if (cg_write(test_group, "memory.max", "8M"))
172 		goto out;
173 	if (cg_write(test_group, "memory.zswap.max", "0"))
174 		goto out;
175 
176 	/* Allocate and read more than memory.max to trigger swapin */
177 	if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
178 		goto out;
179 
180 	/* Verify that pages are swapped out, but no zswap happened */
181 	swap_peak = cg_read_long(test_group, "memory.swap.peak");
182 	if (swap_peak < 0) {
183 		ksft_print_msg("failed to get cgroup's swap_peak\n");
184 		goto out;
185 	}
186 
187 	if (swap_peak < MB(24)) {
188 		ksft_print_msg("at least 24MB of memory should be swapped out\n");
189 		goto out;
190 	}
191 
192 	zswpout = get_zswpout(test_group);
193 	if (zswpout < 0) {
194 		ksft_print_msg("failed to get zswpout\n");
195 		goto out;
196 	}
197 
198 	if (zswpout > 0) {
199 		ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
200 		goto out;
201 	}
202 
203 	ret = KSFT_PASS;
204 
205 out:
206 	cg_destroy(test_group);
207 	free(test_group);
208 	return ret;
209 }
210 
211 /* Simple test to verify the (z)swapin code paths */
test_zswapin(const char * root)212 static int test_zswapin(const char *root)
213 {
214 	int ret = KSFT_FAIL;
215 	char *test_group;
216 	long zswpin;
217 
218 	test_group = cg_name(root, "zswapin_test");
219 	if (!test_group)
220 		goto out;
221 	if (cg_create(test_group))
222 		goto out;
223 	if (cg_write(test_group, "memory.max", "8M"))
224 		goto out;
225 	if (cg_write(test_group, "memory.zswap.max", "max"))
226 		goto out;
227 
228 	/* Allocate and read more than memory.max to trigger (z)swap in */
229 	if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
230 		goto out;
231 
232 	zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
233 	if (zswpin < 0) {
234 		ksft_print_msg("failed to get zswpin\n");
235 		goto out;
236 	}
237 
238 	if (zswpin < MB(24) / PAGE_SIZE) {
239 		ksft_print_msg("at least 24MB should be brought back from zswap\n");
240 		goto out;
241 	}
242 
243 	ret = KSFT_PASS;
244 
245 out:
246 	cg_destroy(test_group);
247 	free(test_group);
248 	return ret;
249 }
250 
/*
 * Attempt writeback with the following steps:
 * 1. Allocate memory.
 * 2. Reclaim memory equal to the amount that was allocated in step 1.
 *    This will move it into zswap.
 * 3. Save current zswap usage.
 * 4. Move the memory allocated in step 1 back in from zswap.
 * 5. Set zswap.max to half the amount that was recorded in step 3.
 * 6. Attempt to reclaim memory equal to the amount that was allocated,
 *    this will either trigger writeback if it's enabled, or reclamation
 *    will fail if writeback is disabled as there isn't enough zswap space.
 *
 * @cgroup: cgroup whose memory.reclaim / memory.zswap.* files are used.
 * @arg:    points to a bool telling whether writeback is enabled.
 *
 * Returns 0 on the expected outcome: with writeback enabled, the result of
 * the final memory.reclaim write; with writeback disabled, 0 only if that
 * write returned -EAGAIN. Any earlier failure returns -1.
 */
static int attempt_writeback(const char *cgroup, void *arg)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	size_t memsize = MB(4);
	bool wb_enabled = *(bool *) arg;
	long zswap_usage;
	int ret = -1;
	char *mem;

	/* sysconf() reports failure as -1, which must not size the VLA below. */
	if (pagesize <= 0)
		return ret;

	char buf[pagesize];

	mem = malloc(memsize);
	if (!mem)
		return ret;

	/*
	 * Fill half of each page with increasing data, and keep other
	 * half empty, this will result in data that is still compressible
	 * and ends up in zswap, with material zswap usage.
	 */
	for (long i = 0; i < pagesize; i++)
		buf[i] = i < pagesize/2 ? (char) i : 0;

	for (size_t i = 0; i < memsize; i += pagesize)
		memcpy(&mem[i], buf, pagesize);

	/* Try and reclaim allocated memory */
	if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
		ksft_print_msg("Failed to reclaim all of the requested memory\n");
		goto out;
	}

	/*
	 * A failed read must not be halved and written to memory.zswap.max
	 * below as if it were a real usage figure.
	 */
	zswap_usage = cg_read_long(cgroup, "memory.zswap.current");
	if (zswap_usage < 0) {
		ksft_print_msg("Failed to read memory.zswap.current\n");
		goto out;
	}

	/* zswpin */
	for (size_t i = 0; i < memsize; i += pagesize) {
		if (memcmp(&mem[i], buf, pagesize)) {
			ksft_print_msg("invalid memory\n");
			goto out;
		}
	}

	if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2))
		goto out;

	/*
	 * If writeback is enabled, trying to reclaim memory now will trigger a
	 * writeback as zswap.max is half of what was needed when reclaim ran the first time.
	 * If writeback is disabled, memory reclaim will fail as zswap is limited and
	 * it can't writeback to swap.
	 */
	ret = cg_write_numeric(cgroup, "memory.reclaim", memsize);
	if (!wb_enabled)
		ret = (ret == -EAGAIN) ? 0 : -1;

out:
	free(mem);
	return ret;
}
321 
test_zswap_writeback_one(const char * cgroup,bool wb)322 static int test_zswap_writeback_one(const char *cgroup, bool wb)
323 {
324 	long zswpwb_before, zswpwb_after;
325 
326 	zswpwb_before = get_cg_wb_count(cgroup);
327 	if (zswpwb_before != 0) {
328 		ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before);
329 		return -1;
330 	}
331 
332 	if (cg_run(cgroup, attempt_writeback, (void *) &wb))
333 		return -1;
334 
335 	/* Verify that zswap writeback occurred only if writeback was enabled */
336 	zswpwb_after = get_cg_wb_count(cgroup);
337 	if (zswpwb_after < 0)
338 		return -1;
339 
340 	if (wb != !!zswpwb_after) {
341 		ksft_print_msg("zswpwb_after is %ld while wb is %s",
342 				zswpwb_after, wb ? "enabled" : "disabled");
343 		return -1;
344 	}
345 
346 	return 0;
347 }
348 
349 /* Test to verify the zswap writeback path */
test_zswap_writeback(const char * root,bool wb)350 static int test_zswap_writeback(const char *root, bool wb)
351 {
352 	int ret = KSFT_FAIL;
353 	char *test_group, *test_group_child = NULL;
354 
355 	if (cg_read_strcmp(root, "memory.zswap.writeback", "1"))
356 		return KSFT_SKIP;
357 
358 	test_group = cg_name(root, "zswap_writeback_test");
359 	if (!test_group)
360 		goto out;
361 	if (cg_create(test_group))
362 		goto out;
363 	if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0"))
364 		goto out;
365 
366 	if (test_zswap_writeback_one(test_group, wb))
367 		goto out;
368 
369 	/* Reset memory.zswap.max to max (modified by attempt_writeback), and
370 	 * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1.
371 	 * Thus, the parent's setting shall be what's in effect. */
372 	if (cg_write(test_group, "memory.zswap.max", "max"))
373 		goto out;
374 	if (cg_write(test_group, "cgroup.subtree_control", "+memory"))
375 		goto out;
376 
377 	test_group_child = cg_name(test_group, "zswap_writeback_test_child");
378 	if (!test_group_child)
379 		goto out;
380 	if (cg_create(test_group_child))
381 		goto out;
382 	if (cg_write(test_group_child, "memory.zswap.writeback", "1"))
383 		goto out;
384 
385 	if (test_zswap_writeback_one(test_group_child, wb))
386 		goto out;
387 
388 	ret = KSFT_PASS;
389 
390 out:
391 	if (test_group_child) {
392 		cg_destroy(test_group_child);
393 		free(test_group_child);
394 	}
395 	cg_destroy(test_group);
396 	free(test_group);
397 	return ret;
398 }
399 
/* Table entry point: run test_zswap_writeback() with writeback turned on. */
static int test_zswap_writeback_enabled(const char *root)
{
	const bool writeback = true;

	return test_zswap_writeback(root, writeback);
}
404 
/* Table entry point: run test_zswap_writeback() with writeback turned off. */
static int test_zswap_writeback_disabled(const char *root)
{
	const bool writeback = false;

	return test_zswap_writeback(root, writeback);
}
409 
410 /*
411  * When trying to store a memcg page in zswap, if the memcg hits its memory
412  * limit in zswap, writeback should affect only the zswapped pages of that
413  * memcg.
414  */
test_no_invasive_cgroup_shrink(const char * root)415 static int test_no_invasive_cgroup_shrink(const char *root)
416 {
417 	int ret = KSFT_FAIL;
418 	size_t control_allocation_size = MB(10);
419 	char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL;
420 
421 	wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
422 	if (!wb_group)
423 		return KSFT_FAIL;
424 	if (cg_write(wb_group, "memory.zswap.max", "10K"))
425 		goto out;
426 	control_group = setup_test_group_1M(root, "per_memcg_wb_test2");
427 	if (!control_group)
428 		goto out;
429 
430 	/* Push some test_group2 memory into zswap */
431 	if (cg_enter_current(control_group))
432 		goto out;
433 	control_allocation = malloc(control_allocation_size);
434 	for (int i = 0; i < control_allocation_size; i += 4095)
435 		control_allocation[i] = 'a';
436 	if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1)
437 		goto out;
438 
439 	/* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */
440 	if (cg_run(wb_group, allocate_bytes, (void *)MB(10)))
441 		goto out;
442 
443 	/* Verify that only zswapped memory from gwb_group has been written back */
444 	if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0)
445 		ret = KSFT_PASS;
446 out:
447 	cg_enter_current(root);
448 	if (control_group) {
449 		cg_destroy(control_group);
450 		free(control_group);
451 	}
452 	cg_destroy(wb_group);
453 	free(wb_group);
454 	if (control_allocation)
455 		free(control_allocation);
456 	return ret;
457 }
458 
/*
 * Arguments handed to no_kmem_bypass_child(). test_no_kmem_bypass() places
 * this in a MAP_SHARED mapping so it can observe the child's write.
 */
struct no_kmem_bypass_child_args {
	/* Number of bytes the child allocates. */
	size_t target_alloc_bytes;
	/* Set to true by the child once its allocation attempt is over. */
	size_t child_allocated;
};

/*
 * Allocate values->target_alloc_bytes, write to the buffer at a fixed
 * stride, flag completion and then block in pause().
 *
 * @cgroup: unused.
 * @arg:    points to a struct no_kmem_bypass_child_args.
 *
 * child_allocated is set on both the success and the failure path so that a
 * waiting parent is released either way. Returns -1 if the allocation fails;
 * otherwise returns 0 once pause() has returned.
 */
static int no_kmem_bypass_child(const char *cgroup, void *arg)
{
	struct no_kmem_bypass_child_args *values = arg;
	char *allocation;

	allocation = malloc(values->target_alloc_bytes);
	if (!allocation) {
		values->child_allocated = true;
		return -1;
	}
	/* size_t index to match the type of the byte-count bound. */
	for (size_t i = 0; i < values->target_alloc_bytes; i += 4095)
		allocation[i] = 'a';
	values->child_allocated = true;
	pause();
	free(allocation);
	return 0;
}
481 
482 /*
483  * When pages owned by a memcg are pushed to zswap by kswapd, they should be
484  * charged to that cgroup. This wasn't the case before commit
485  * cd08d80ecdac("mm: correctly charge compressed memory to its memcg").
486  *
487  * The test first allocates memory in a memcg, then raises min_free_kbytes to
488  * a very high value so that the allocation falls below low wm, then makes
489  * another allocation to trigger kswapd that should push the memcg-owned pages
490  * to zswap and verifies that the zswap pages are correctly charged.
491  *
492  * To be run on a VM with at most 4G of memory.
493  */
test_no_kmem_bypass(const char * root)494 static int test_no_kmem_bypass(const char *root)
495 {
496 	size_t min_free_kb_high, min_free_kb_low, min_free_kb_original;
497 	struct no_kmem_bypass_child_args *values;
498 	size_t trigger_allocation_size;
499 	int wait_child_iteration = 0;
500 	long stored_pages_threshold;
501 	struct sysinfo sys_info;
502 	int ret = KSFT_FAIL;
503 	int child_status;
504 	char *test_group = NULL;
505 	pid_t child_pid;
506 
507 	/* Read sys info and compute test values accordingly */
508 	if (sysinfo(&sys_info) != 0)
509 		return KSFT_FAIL;
510 	if (sys_info.totalram > 5000000000)
511 		return KSFT_SKIP;
512 	values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ |
513 			PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
514 	if (values == MAP_FAILED)
515 		return KSFT_FAIL;
516 	if (read_min_free_kb(&min_free_kb_original))
517 		return KSFT_FAIL;
518 	min_free_kb_high = sys_info.totalram / 2000;
519 	min_free_kb_low = sys_info.totalram / 500000;
520 	values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) +
521 		sys_info.totalram * 5 / 100;
522 	stored_pages_threshold = sys_info.totalram / 5 / 4096;
523 	trigger_allocation_size = sys_info.totalram / 20;
524 
525 	/* Set up test memcg */
526 	test_group = cg_name(root, "kmem_bypass_test");
527 	if (!test_group)
528 		goto out;
529 
530 	/* Spawn memcg child and wait for it to allocate */
531 	set_min_free_kb(min_free_kb_low);
532 	if (cg_create(test_group))
533 		goto out;
534 	values->child_allocated = false;
535 	child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values);
536 	if (child_pid < 0)
537 		goto out;
538 	while (!values->child_allocated && wait_child_iteration++ < 10000)
539 		usleep(1000);
540 
541 	/* Try to wakeup kswapd and let it push child memory to zswap */
542 	set_min_free_kb(min_free_kb_high);
543 	for (int i = 0; i < 20; i++) {
544 		size_t stored_pages;
545 		char *trigger_allocation = malloc(trigger_allocation_size);
546 
547 		if (!trigger_allocation)
548 			break;
549 		for (int i = 0; i < trigger_allocation_size; i += 4095)
550 			trigger_allocation[i] = 'b';
551 		usleep(100000);
552 		free(trigger_allocation);
553 		if (get_zswap_stored_pages(&stored_pages))
554 			break;
555 		if (stored_pages < 0)
556 			break;
557 		/* If memory was pushed to zswap, verify it belongs to memcg */
558 		if (stored_pages > stored_pages_threshold) {
559 			int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped ");
560 			int delta = stored_pages * 4096 - zswapped;
561 			int result_ok = delta < stored_pages * 4096 / 4;
562 
563 			ret = result_ok ? KSFT_PASS : KSFT_FAIL;
564 			break;
565 		}
566 	}
567 
568 	kill(child_pid, SIGTERM);
569 	waitpid(child_pid, &child_status, 0);
570 out:
571 	set_min_free_kb(min_free_kb_original);
572 	cg_destroy(test_group);
573 	free(test_group);
574 	return ret;
575 }
576 
577 #define T(x) { x, #x }
578 struct zswap_test {
579 	int (*fn)(const char *root);
580 	const char *name;
581 } tests[] = {
582 	T(test_zswap_usage),
583 	T(test_swapin_nozswap),
584 	T(test_zswapin),
585 	T(test_zswap_writeback_enabled),
586 	T(test_zswap_writeback_disabled),
587 	T(test_no_kmem_bypass),
588 	T(test_no_invasive_cgroup_shrink),
589 };
590 #undef T
591 
/* Report whether /sys/module/zswap exists, i.e. whether zswap is present. */
static bool zswap_configured(void)
{
	if (access("/sys/module/zswap", F_OK) != 0)
		return false;
	return true;
}
596 
main(int argc,char ** argv)597 int main(int argc, char **argv)
598 {
599 	char root[PATH_MAX];
600 	int i, ret = EXIT_SUCCESS;
601 
602 	if (cg_find_unified_root(root, sizeof(root), NULL))
603 		ksft_exit_skip("cgroup v2 isn't mounted\n");
604 
605 	if (!zswap_configured())
606 		ksft_exit_skip("zswap isn't configured\n");
607 
608 	/*
609 	 * Check that memory controller is available:
610 	 * memory is listed in cgroup.controllers
611 	 */
612 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
613 		ksft_exit_skip("memory controller isn't available\n");
614 
615 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
616 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
617 			ksft_exit_skip("Failed to set memory controller\n");
618 
619 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
620 		switch (tests[i].fn(root)) {
621 		case KSFT_PASS:
622 			ksft_test_result_pass("%s\n", tests[i].name);
623 			break;
624 		case KSFT_SKIP:
625 			ksft_test_result_skip("%s\n", tests[i].name);
626 			break;
627 		default:
628 			ret = EXIT_FAILURE;
629 			ksft_test_result_fail("%s\n", tests[i].name);
630 			break;
631 		}
632 	}
633 
634 	return ret;
635 }
636