1/* SPDX-License-Identifier: GPL-2.0 */
2#define _GNU_SOURCE
3
4#include <linux/limits.h>
5#include <linux/oom.h>
6#include <fcntl.h>
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <sys/stat.h>
11#include <sys/types.h>
12#include <unistd.h>
13#include <sys/socket.h>
14#include <sys/wait.h>
15#include <arpa/inet.h>
16#include <netinet/in.h>
17#include <netdb.h>
18#include <errno.h>
19#include <sys/mman.h>
20
21#include "../kselftest.h"
22#include "cgroup_util.h"
23
/*
 * Set once in main() from the cgroup2 mount options in /proc:
 * whether the hierarchy was mounted with memory_localevents /
 * memory_recursiveprot. Individual tests consult these to adjust
 * their expectations.
 */
static bool has_localevents;
static bool has_recursiveprot;
26
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller. It verifies that "memory" appears in a
 * child's cgroup.controllers if and only if the parent enabled it
 * via cgroup.subtree_control.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	/* The child must list "memory" among its available controllers */
	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* Without subtree_control, "memory" must NOT be listed */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/*
	 * The labels deliberately fall through: each stage undoes one more
	 * setup step, and the second hierarchy's labels chain into the
	 * first hierarchy's cleanup.
	 */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
92
93static int alloc_anon_50M_check(const char *cgroup, void *arg)
94{
95	size_t size = MB(50);
96	char *buf, *ptr;
97	long anon, current;
98	int ret = -1;
99
100	buf = malloc(size);
101	if (buf == NULL) {
102		fprintf(stderr, "malloc() failed\n");
103		return -1;
104	}
105
106	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
107		*ptr = 0;
108
109	current = cg_read_long(cgroup, "memory.current");
110	if (current < size)
111		goto cleanup;
112
113	if (!values_close(size, current, 3))
114		goto cleanup;
115
116	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
117	if (anon < 0)
118		goto cleanup;
119
120	if (!values_close(anon, current, 3))
121		goto cleanup;
122
123	ret = 0;
124cleanup:
125	free(buf);
126	return ret;
127}
128
/*
 * cg_run() payload: create 50M of pagecache in a temp file, then
 * verify that memory.current and the "file" counter in memory.stat
 * roughly match. Returns 0 on success, -1 on failure.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Check for a cg_read_long() error explicitly instead of relying on
	 * the implicit signed->unsigned conversion in "current < size".
	 */
	if (current < 0 || (size_t)current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
160
161/*
162 * This test create a memory cgroup, allocates
163 * some anonymous memory and some pagecache
164 * and check memory.current and some memory.stat values.
165 */
166static int test_memcg_current(const char *root)
167{
168	int ret = KSFT_FAIL;
169	long current;
170	char *memcg;
171
172	memcg = cg_name(root, "memcg_test");
173	if (!memcg)
174		goto cleanup;
175
176	if (cg_create(memcg))
177		goto cleanup;
178
179	current = cg_read_long(memcg, "memory.current");
180	if (current != 0)
181		goto cleanup;
182
183	if (cg_run(memcg, alloc_anon_50M_check, NULL))
184		goto cleanup;
185
186	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
187		goto cleanup;
188
189	ret = KSFT_PASS;
190
191cleanup:
192	cg_destroy(memcg);
193	free(memcg);
194
195	return ret;
196}
197
/*
 * Populate 50M of pagecache through @arg (a temp file descriptor) and
 * then linger until the parent process goes away, keeping the charge
 * alive for the duration of the test.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int parent = getppid();
	int fd = (long)arg;

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	/* Stay resident until our parent exits (ppid changes on reparent) */
	while (getppid() == parent)
		sleep(1);

	return 0;
}
211
212static int alloc_anon_noexit(const char *cgroup, void *arg)
213{
214	int ppid = getppid();
215	size_t size = (unsigned long)arg;
216	char *buf, *ptr;
217
218	buf = malloc(size);
219	if (buf == NULL) {
220		fprintf(stderr, "malloc() failed\n");
221		return -1;
222	}
223
224	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
225		*ptr = 0;
226
227	while (getppid() == ppid)
228		sleep(1);
229
230	free(buf);
231	return 0;
232}
233
/*
 * Poll cgroup.procs until it becomes empty, i.e. until the OOM killer
 * has asynchronously reaped every process in @cgroup. Give up and
 * report failure after roughly one second (10 x 100ms).
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempt;

	for (attempt = 0; attempt < 10; attempt++) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
250
251static bool reclaim_until(const char *memcg, long goal);
252
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};	/* A, A/B and A/G (pressure source) */
	char *children[4] = {NULL};	/* A/B/C, A/B/D, A/B/E, A/B/F */
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];			/* memory.current of each child */
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	/* Disable swap so protection is exercised purely via reclaim */
	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* children[3] (F) stays unpopulated on purpose */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* Wait (up to ~6s) for the three 50M pagecache populations to land */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create memory pressure in A through the sibling A/G */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	/* F was never populated, so nothing may be charged to it */
	if (c[3] != 0)
		goto cleanup;

	/*
	 * Over-commit A: with memory.min this must fail (protected
	 * pagecache cannot be reclaimed); with memory.low it must succeed.
	 */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	/* memory.reclaim must ignore the target's own protection */
	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		/* Protected C and D must have recorded low events... */
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		/*
		 * ...while unprotected E and unpopulated F must not.
		 * NOTE(review): has_recursiveprot is probed in main() but not
		 * consulted here; with the memory_recursiveprot mount option
		 * parental protection is distributed to children, which might
		 * produce low events in E — confirm against the target kernel.
		 */
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
453
/* Hard-protection flavour: run the protection test against memory.min. */
static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}
458
/* Soft-protection flavour: run the protection test against memory.low. */
static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}
463
/*
 * Helper for the memory.high/memory.max tests: with a 30M limit in
 * place, try to create 50M of pagecache and verify that the actual
 * usage is capped at roughly 30M.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	long limit_high, limit_max, usage;
	size_t size = MB(50);
	int fd, ret = -1;

	/* The caller must have set one of the two limits to 30M */
	limit_high = cg_read_long(cgroup, "memory.high");
	limit_max = cg_read_long(cgroup, "memory.max");
	if (limit_high != MB(30) && limit_max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	usage = cg_read_long(cgroup, "memory.current");
	if (!values_close(usage, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
494
495/*
496 * This test checks that memory.high limits the amount of
497 * memory which can be consumed by either anonymous memory
498 * or pagecache.
499 */
500static int test_memcg_high(const char *root)
501{
502	int ret = KSFT_FAIL;
503	char *memcg;
504	long high;
505
506	memcg = cg_name(root, "memcg_test");
507	if (!memcg)
508		goto cleanup;
509
510	if (cg_create(memcg))
511		goto cleanup;
512
513	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
514		goto cleanup;
515
516	if (cg_write(memcg, "memory.swap.max", "0"))
517		goto cleanup;
518
519	if (cg_write(memcg, "memory.high", "30M"))
520		goto cleanup;
521
522	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
523		goto cleanup;
524
525	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
526		goto cleanup;
527
528	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
529		goto cleanup;
530
531	high = cg_read_key_long(memcg, "memory.events", "high ");
532	if (high <= 0)
533		goto cleanup;
534
535	ret = KSFT_PASS;
536
537cleanup:
538	cg_destroy(memcg);
539	free(memcg);
540
541	return ret;
542}
543
/*
 * Map and mlock() @arg bytes of anonymous memory in one shot.
 * mlock() faults the whole range in within a single kernel entry,
 * which is what the memory.high synchronous-throttling test needs.
 * Returns 0 on success (mlock() failure is deliberately ignored),
 * -1 if the mapping itself fails.
 */
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/*
	 * Pass fd = -1: mmap(2) requires it for portable MAP_ANON
	 * mappings (the fd argument is ignored on Linux, but some
	 * implementations reject anything else).
	 */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	/* Return value intentionally unchecked, as in the original */
	mlock(buf, size);
	munmap(buf, size);
	return 0;
}
558
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Snapshot the event counters before the allocation */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* Arm the memory.events notification before starting the allocator */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	/* 200M in a single mlocked mapping: one kernel entry, far over high */
	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	/* Block until memory.events changes */
	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/* "high" events must have fired; "max" events must not have moved */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
619
620/*
621 * This test checks that memory.max limits the amount of
622 * memory which can be consumed by either anonymous memory
623 * or pagecache.
624 */
625static int test_memcg_max(const char *root)
626{
627	int ret = KSFT_FAIL;
628	char *memcg;
629	long current, max;
630
631	memcg = cg_name(root, "memcg_test");
632	if (!memcg)
633		goto cleanup;
634
635	if (cg_create(memcg))
636		goto cleanup;
637
638	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
639		goto cleanup;
640
641	if (cg_write(memcg, "memory.swap.max", "0"))
642		goto cleanup;
643
644	if (cg_write(memcg, "memory.max", "30M"))
645		goto cleanup;
646
647	/* Should be killed by OOM killer */
648	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
649		goto cleanup;
650
651	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
652		goto cleanup;
653
654	current = cg_read_long(memcg, "memory.current");
655	if (current > MB(30) || !current)
656		goto cleanup;
657
658	max = cg_read_key_long(memcg, "memory.events", "max ");
659	if (max <= 0)
660		goto cleanup;
661
662	ret = KSFT_PASS;
663
664cleanup:
665	cg_destroy(memcg);
666	free(memcg);
667
668	return ret;
669}
670
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * Returns false if the usage was already below the goal (nothing was
 * reclaimed), or if memory.reclaim misbehaves — reports success while
 * the usage does not drop, or fails with anything but -EAGAIN — even
 * if the usage goal ends up satisfied.
 *
 * Assumes writing to memory.reclaim is the only source of change in
 * memory.current (no concurrent allocations or reclaim).
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool did_reclaim = false;
	long usage, delta;
	char request[64];
	int attempt, rc;

	for (attempt = 0; attempt < 5; attempt++) {
		usage = cg_read_long(memcg, "memory.current");

		if (usage < goal || values_close(usage, goal, 3))
			break;
		/* memory.reclaim claimed success but usage did not drop */
		if (did_reclaim)
			return false;

		delta = usage - goal;
		snprintf(request, sizeof(request), "%ld", delta);
		rc = cg_write(memcg, "memory.reclaim", request);
		if (!rc)
			did_reclaim = true;
		else if (rc != -EAGAIN)
			return false;
	}
	return did_reclaim;
}
712
713/*
714 * This test checks that memory.reclaim reclaims the given
715 * amount of memory (from both anon and file, if possible).
716 */
717static int test_memcg_reclaim(const char *root)
718{
719	int ret = KSFT_FAIL, fd, retries;
720	char *memcg;
721	long current, expected_usage;
722
723	memcg = cg_name(root, "memcg_test");
724	if (!memcg)
725		goto cleanup;
726
727	if (cg_create(memcg))
728		goto cleanup;
729
730	current = cg_read_long(memcg, "memory.current");
731	if (current != 0)
732		goto cleanup;
733
734	fd = get_temp_fd();
735	if (fd < 0)
736		goto cleanup;
737
738	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
739
740	/*
741	 * If swap is enabled, try to reclaim from both anon and file, else try
742	 * to reclaim from file only.
743	 */
744	if (is_swap_enabled()) {
745		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
746		expected_usage = MB(100);
747	} else
748		expected_usage = MB(50);
749
750	/*
751	 * Wait until current usage reaches the expected usage (or we run out of
752	 * retries).
753	 */
754	retries = 5;
755	while (!values_close(cg_read_long(memcg, "memory.current"),
756			    expected_usage, 10)) {
757		if (retries--) {
758			sleep(1);
759			continue;
760		} else {
761			fprintf(stderr,
762				"failed to allocate %ld for memcg reclaim test\n",
763				expected_usage);
764			goto cleanup;
765		}
766	}
767
768	/*
769	 * Reclaim until current reaches 30M, this makes sure we hit both anon
770	 * and file if swap is enabled.
771	 */
772	if (!reclaim_until(memcg, MB(30)))
773		goto cleanup;
774
775	ret = KSFT_PASS;
776cleanup:
777	cg_destroy(memcg);
778	free(memcg);
779	close(fd);
780
781	return ret;
782}
783
784static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
785{
786	long mem_max = (long)arg;
787	size_t size = MB(50);
788	char *buf, *ptr;
789	long mem_current, swap_current;
790	int ret = -1;
791
792	buf = malloc(size);
793	if (buf == NULL) {
794		fprintf(stderr, "malloc() failed\n");
795		return -1;
796	}
797
798	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
799		*ptr = 0;
800
801	mem_current = cg_read_long(cgroup, "memory.current");
802	if (!mem_current || !values_close(mem_current, mem_max, 3))
803		goto cleanup;
804
805	swap_current = cg_read_long(cgroup, "memory.swap.current");
806	if (!swap_current ||
807	    !values_close(mem_current + swap_current, size, 3))
808		goto cleanup;
809
810	ret = 0;
811cleanup:
812	free(buf);
813	return ret;
814}
815
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	/* The whole test is meaningless without swap */
	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A fresh cgroup must not have any swap charged yet */
	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	/* Both limits must default to "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/*
	 * 50M of anon fits under memory.max=30M + memory.swap.max=30M;
	 * the helper verifies the resident/swapped split.
	 */
	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	/* Hitting memory.max must have been recorded */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
878
879/*
880 * This test disables swapping and tries to allocate anonymous memory
881 * up to OOM. Then it checks for oom and oom_kill events in
882 * memory.events.
883 */
884static int test_memcg_oom_events(const char *root)
885{
886	int ret = KSFT_FAIL;
887	char *memcg;
888
889	memcg = cg_name(root, "memcg_test");
890	if (!memcg)
891		goto cleanup;
892
893	if (cg_create(memcg))
894		goto cleanup;
895
896	if (cg_write(memcg, "memory.max", "30M"))
897		goto cleanup;
898
899	if (cg_write(memcg, "memory.swap.max", "0"))
900		goto cleanup;
901
902	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
903		goto cleanup;
904
905	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
906		goto cleanup;
907
908	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
909		goto cleanup;
910
911	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
912		goto cleanup;
913
914	ret = KSFT_PASS;
915
916cleanup:
917	cg_destroy(memcg);
918	free(memcg);
919
920	return ret;
921}
922
/*
 * Arguments handed to the forked tcp_server(): the port to listen on
 * and a pipe through which the server reports its bind status back to
 * the parent.
 */
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};
927
/*
 * Forked TCP server used by test_memcg_sock(): bind an IPv6 socket to
 * srv_args->port, report the bind status to the parent over the ctl
 * pipe, then stream data to the first accepted client until the
 * connection drops.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* We only write to the control pipe; close the read end */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/* Tell the parent why bind failed (e.g. EADDRINUSE) */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* Report success (0) so the parent can start the client */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	/* Stream 1M chunks until the client closes the connection */
	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			/* ECONNRESET means the client finished: success */
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
984
/*
 * TCP client side of test_memcg_sock(): connect to localhost:@port and
 * keep reading while checking that the cgroup's socket memory counter
 * ("sock" in memory.stat) tracks memory.current. Returns KSFT_PASS on
 * success, KSFT_FAIL or a getaddrinfo()/connect() error otherwise.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	/* Baseline usage, so socket buffers can be measured as a delta */
	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * Use %hu: the port is unsigned and may exceed SHRT_MAX (it is
	 * picked from [1000, 60999]); %hd would render such ports as a
	 * negative number that also overflows the 6-byte buffer.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1035
/*
 * This test checks socket memory accounting.
 * The test forks a TCP server listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with check whether
 * memory.current and memory.stat.sock are similar.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Retry with a fresh random port if bind() hits EADDRINUSE */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		/* Parent keeps only the read end of the status pipe */
		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		/* err == 0: server bound and is listening */
		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		/* Reap the failed server before retrying */
		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	/* The server must have exited cleanly (ECONNRESET path) */
	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/* With all sockets closed, the "sock" counter must drop to zero */
	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1108
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	/* Group-kill at the leaf: 50M cap, no swap, oom.group enabled */
	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	/* Background residents in parent and child, then an OOM trigger */
	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* oom.group must have wiped out every process in the leaf */
	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
1181
1182/*
1183 * This test disables swapping and tries to allocate anonymous memory
1184 * up to OOM with memory.group.oom set. Then it checks that all
1185 * processes in the parent and leaf were killed.
1186 */
1187static int test_memcg_oom_group_parent_events(const char *root)
1188{
1189	int ret = KSFT_FAIL;
1190	char *parent, *child;
1191
1192	parent = cg_name(root, "memcg_test_0");
1193	child = cg_name(root, "memcg_test_0/memcg_test_1");
1194
1195	if (!parent || !child)
1196		goto cleanup;
1197
1198	if (cg_create(parent))
1199		goto cleanup;
1200
1201	if (cg_create(child))
1202		goto cleanup;
1203
1204	if (cg_write(parent, "memory.max", "80M"))
1205		goto cleanup;
1206
1207	if (cg_write(parent, "memory.swap.max", "0"))
1208		goto cleanup;
1209
1210	if (cg_write(parent, "memory.oom.group", "1"))
1211		goto cleanup;
1212
1213	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1214	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1215	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1216
1217	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1218		goto cleanup;
1219
1220	if (cg_test_proc_killed(child))
1221		goto cleanup;
1222	if (cg_test_proc_killed(parent))
1223		goto cleanup;
1224
1225	ret = KSFT_PASS;
1226
1227cleanup:
1228	if (child)
1229		cg_destroy(child);
1230	if (parent)
1231		cg_destroy(parent);
1232	free(child);
1233	free(parent);
1234
1235	return ret;
1236}
1237
1238/*
1239 * This test disables swapping and tries to allocate anonymous memory
1240 * up to OOM with memory.group.oom set. Then it checks that all
1241 * processes were killed except those set with OOM_SCORE_ADJ_MIN
1242 */
1243static int test_memcg_oom_group_score_events(const char *root)
1244{
1245	int ret = KSFT_FAIL;
1246	char *memcg;
1247	int safe_pid;
1248
1249	memcg = cg_name(root, "memcg_test_0");
1250
1251	if (!memcg)
1252		goto cleanup;
1253
1254	if (cg_create(memcg))
1255		goto cleanup;
1256
1257	if (cg_write(memcg, "memory.max", "50M"))
1258		goto cleanup;
1259
1260	if (cg_write(memcg, "memory.swap.max", "0"))
1261		goto cleanup;
1262
1263	if (cg_write(memcg, "memory.oom.group", "1"))
1264		goto cleanup;
1265
1266	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1267	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1268		goto cleanup;
1269
1270	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1271	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1272		goto cleanup;
1273
1274	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1275		goto cleanup;
1276
1277	if (kill(safe_pid, SIGKILL))
1278		goto cleanup;
1279
1280	ret = KSFT_PASS;
1281
1282cleanup:
1283	if (memcg)
1284		cg_destroy(memcg);
1285	free(memcg);
1286
1287	return ret;
1288}
1289
/* Pair each test function with its printable name for the runner in main() */
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T
1311
/*
 * Locate the cgroup v2 hierarchy, verify the memory controller is
 * usable, record relevant mount options, then run every registered
 * test and report per-test kselftest results.
 */
int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	/* Delegate the memory controller to children if not done already */
	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	/* Record cgroup2 mount options some tests depend on */
	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	/* Run every test; the overall exit status fails if any test fails */
	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}
1358