1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3#include <sched.h>
4#include <sys/mount.h>
5#include <sys/stat.h>
6#include <sys/types.h>
7#include <linux/limits.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <linux/sched.h>
11#include <fcntl.h>
12#include <unistd.h>
13#include <ftw.h>
14
15#include "cgroup_helpers.h"
16#include "bpf_util.h"
17
18/*
 * To avoid relying on the system setup, when setup_cgroup_environment() is
 * called we create a new mount namespace, and cgroup namespace. The cgroupv2
21 * root is mounted at CGROUP_MOUNT_PATH. Unfortunately, most people don't
22 * have cgroupv2 enabled at this point in time. It's easier to create our
23 * own mount namespace and manage it ourselves. We assume /mnt exists.
24 *
25 * Related cgroupv1 helpers are named *classid*(), since we only use the
26 * net_cls controller for tagging net_cls.classid. We assume the default
27 * mount under /sys/fs/cgroup/net_cls, which should be the case for the
28 * vast majority of users.
29 */
30
/* Upper bound on the directory descriptors nftw() may keep open at once. */
#define WALK_FD_LIMIT			16

/* Mount point used for cgroup2 and the default cgroup1 locations. */
#define CGROUP_MOUNT_PATH		"/mnt"
#define CGROUP_MOUNT_DFLT		"/sys/fs/cgroup"
#define NETCLS_MOUNT_PATH		CGROUP_MOUNT_DFLT "/net_cls"

/* Work directory name prefix; a pid is appended to it by the macros below. */
#define CGROUP_WORK_DIR			"/cgroup-test-work-dir"

/*
 * Path formatting helpers.
 *
 * @buf must be a char array (not a pointer), because its capacity is taken
 * with sizeof(buf). Each macro evaluates to the snprintf() return value.
 */

/* <cgroup2 mount><workdir prefix><pid><path> */
#define format_cgroup_path_pid(buf, path, pid)				\
	snprintf(buf, sizeof(buf), "%s%s%d%s",				\
		 CGROUP_MOUNT_PATH, CGROUP_WORK_DIR, (pid), (path))

/* Same as above, using the calling process' pid. */
#define format_cgroup_path(buf, path)					\
	format_cgroup_path_pid(buf, path, getpid())

/* Same as above, using the parent process' pid. */
#define format_parent_cgroup_path(buf, path)				\
	format_cgroup_path_pid(buf, path, getppid())

/* <net_cls mount><workdir prefix><pid> */
#define format_classid_path_pid(buf, pid)				\
	snprintf(buf, sizeof(buf), "%s%s%d",				\
		 NETCLS_MOUNT_PATH, CGROUP_WORK_DIR, (pid))

/* Same as above, using the calling process' pid. */
#define format_classid_path(buf)					\
	format_classid_path_pid(buf, getpid())

/* Per-thread flag: set once setup_cgroup_environment() got past the mount. */
static __thread bool cgroup_workdir_mounted;

static void __cleanup_cgroup_environment(void);
58
/*
 * __enable_controllers() - Enable cgroup v2 controllers below a cgroup
 * @cgroup_path: Absolute path of the cgroup directory
 * @controllers: Controller names separated by spaces, or NULL to enable
 *               everything listed in the cgroup's cgroup.controllers file
 *
 * One "+<controller>" entry per controller is written to the cgroup's
 * cgroup.subtree_control file.
 *
 * Returns 0 on success (also when there is nothing to enable), 1 on failure
 * after logging the error.
 */
static int __enable_controllers(const char *cgroup_path, const char *controllers)
{
	/*
	 * cgroup.controllers is terminated by a newline; treating it as a
	 * separator keeps it out of the last controller name.
	 */
	static const char delim[] = " \n";
	char path[PATH_MAX + 1];
	char enable[PATH_MAX + 1];
	char *c, *c2;
	int fd, cfd;
	ssize_t len;

	/* If no controllers are passed, enable all available controllers */
	if (!controllers) {
		snprintf(path, sizeof(path), "%s/cgroup.controllers",
			 cgroup_path);
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			log_err("Opening cgroup.controllers: %s", path);
			return 1;
		}
		len = read(fd, enable, sizeof(enable) - 1);
		if (len < 0) {
			close(fd);
			log_err("Reading cgroup.controllers: %s", path);
			return 1;
		} else if (len == 0) { /* No controllers to enable */
			close(fd);
			return 0;
		}
		enable[len] = 0;
		close(fd);
	} else {
		bpf_strlcpy(enable, controllers, sizeof(enable));
	}

	snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path);
	cfd = open(path, O_RDWR);
	if (cfd < 0) {
		log_err("Opening cgroup.subtree_control: %s", path);
		return 1;
	}

	/* Enable the controllers one at a time so a failure names the culprit */
	for (c = strtok_r(enable, delim, &c2); c;
	     c = strtok_r(NULL, delim, &c2)) {
		if (dprintf(cfd, "+%s\n", c) <= 0) {
			log_err("Enabling controller %s: %s", c, path);
			close(cfd);
			return 1;
		}
	}
	close(cfd);
	return 0;
}
108
/**
 * enable_controllers() - Turn on cgroup v2 controllers for a test cgroup
 * @relative_path: Path of the cgroup, relative to this process' workdir
 * @controllers: Controllers to enable, in cgroup.controllers format
 *
 * The cgroup is looked up below the calling process' cgroup work directory.
 * Passing NULL for @controllers enables every available controller.
 *
 * Returns 0 on success and 1 on failure.
 */
int enable_controllers(const char *relative_path, const char *controllers)
{
	char abs_path[PATH_MAX + 1];

	format_cgroup_path(abs_path, relative_path);

	return __enable_controllers(abs_path, controllers);
}
127
/*
 * __write_cgroup_file() - Write a string to a file inside a cgroup directory
 * @cgroup_path: Absolute path of the cgroup directory
 * @file: Name of the file within that directory
 * @buf: NUL-terminated string to write
 *
 * Returns 0 on success, 1 if the file cannot be opened or nothing could be
 * written (an error is logged in both cases).
 */
static int __write_cgroup_file(const char *cgroup_path, const char *file,
			       const char *buf)
{
	char target[PATH_MAX + 1];
	int rc = 0;
	int fd;

	snprintf(target, sizeof(target), "%s/%s", cgroup_path, file);

	fd = open(target, O_RDWR);
	if (fd < 0) {
		log_err("Opening %s", target);
		return 1;
	}

	if (dprintf(fd, "%s", buf) <= 0) {
		log_err("Writing to %s", target);
		rc = 1;
	}

	close(fd);
	return rc;
}
149
/**
 * write_cgroup_file() - Write a string to a cgroup control file
 * @relative_path: Path of the cgroup, relative to this process' workdir
 * @file: Name of the cgroupfs file to write to
 * @buf: NUL-terminated string to write
 *
 * The file is looked up in the given cgroup's directory below the calling
 * process' work directory.
 *
 * Returns 0 on success and 1 on failure.
 */
int write_cgroup_file(const char *relative_path, const char *file,
		      const char *buf)
{
	/* Smaller than PATH_MAX to leave room for the "/<file>" suffix */
	char abs_path[PATH_MAX - 24];

	format_cgroup_path(abs_path, relative_path);

	return __write_cgroup_file(abs_path, file, buf);
}
168
/**
 * write_cgroup_file_parent() - Write a string to a cgroup control file that
 *                              lives in the parent process' workdir
 * @relative_path: Path of the cgroup, relative to the parent process' workdir
 * @file: Name of the cgroupfs file to write to
 * @buf: NUL-terminated string to write
 *
 * Identical to write_cgroup_file(), except that the work directory is the
 * one named after getppid() rather than getpid().
 *
 * Returns 0 on success and 1 on failure.
 */
int write_cgroup_file_parent(const char *relative_path, const char *file,
			     const char *buf)
{
	/* Smaller than PATH_MAX to leave room for the "/<file>" suffix */
	char abs_path[PATH_MAX - 24];

	format_parent_cgroup_path(abs_path, relative_path);

	return __write_cgroup_file(abs_path, file, buf);
}
189
190/**
191 * setup_cgroup_environment() - Setup the cgroup environment
192 *
193 * After calling this function, cleanup_cgroup_environment should be called
194 * once testing is complete.
195 *
196 * This function will print an error to stderr and return 1 if it is unable
197 * to setup the cgroup environment. If setup is successful, 0 is returned.
198 */
199int setup_cgroup_environment(void)
200{
201	char cgroup_workdir[PATH_MAX - 24];
202
203	format_cgroup_path(cgroup_workdir, "");
204
205	if (mkdir(CGROUP_MOUNT_PATH, 0777) && errno != EEXIST) {
206		log_err("mkdir mount");
207		return 1;
208	}
209
210	if (unshare(CLONE_NEWNS)) {
211		log_err("unshare");
212		return 1;
213	}
214
215	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
216		log_err("mount fakeroot");
217		return 1;
218	}
219
220	if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) {
221		log_err("mount cgroup2");
222		return 1;
223	}
224	cgroup_workdir_mounted = true;
225
226	/* Cleanup existing failed runs, now that the environment is setup */
227	__cleanup_cgroup_environment();
228
229	if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) {
230		log_err("mkdir cgroup work dir");
231		return 1;
232	}
233
234	/* Enable all available controllers to increase test coverage */
235	if (__enable_controllers(CGROUP_MOUNT_PATH, NULL) ||
236	    __enable_controllers(cgroup_workdir, NULL))
237		return 1;
238
239	return 0;
240}
241
242static int nftwfunc(const char *filename, const struct stat *statptr,
243		    int fileflags, struct FTW *pfwt)
244{
245	if ((fileflags & FTW_D) && rmdir(filename))
246		log_err("Removing cgroup: %s", filename);
247	return 0;
248}
249
/*
 * join_cgroup_from_top() - Move the calling process into a cgroup
 * @cgroup_path: Absolute path of the cgroup directory
 *
 * Writes the caller's pid to the cgroup's cgroup.procs file.
 *
 * Returns 0 on success, 1 on failure after logging the error.
 */
static int join_cgroup_from_top(const char *cgroup_path)
{
	char procs_file[PATH_MAX + 1];
	int fd, written;

	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs",
		 cgroup_path);

	fd = open(procs_file, O_WRONLY);
	if (fd < 0) {
		log_err("Opening Cgroup Procs: %s", procs_file);
		return 1;
	}

	written = dprintf(fd, "%d\n", getpid());
	if (written < 0)
		log_err("Joining Cgroup");

	close(fd);
	return written < 0 ? 1 : 0;
}
273
/**
 * join_cgroup() - Move the calling process into a test cgroup
 * @relative_path: Path of the cgroup to join, relative to the workdir
 *
 * The cgroup must already exist below this process' cgroup work directory.
 * For example, passing "/my-cgroup" puts the caller into
 * "/cgroup-test-work-dir<pid>/my-cgroup", where <pid> is the caller's pid.
 *
 * Returns 0 on success and 1 on failure.
 */
int join_cgroup(const char *relative_path)
{
	char abs_path[PATH_MAX + 1];

	format_cgroup_path(abs_path, relative_path);

	return join_cgroup_from_top(abs_path);
}
292
293/**
294 * join_root_cgroup() - Join the root cgroup
295 *
296 * This function joins the root cgroup.
297 *
298 * On success, it returns 0, otherwise on failure it returns 1.
299 */
300int join_root_cgroup(void)
301{
302	return join_cgroup_from_top(CGROUP_MOUNT_PATH);
303}
304
/**
 * join_parent_cgroup() - Move the calling process into a cgroup that lives
 *                        in the parent process' workdir
 * @relative_path: Path of the cgroup to join, relative to the parent
 *                 process' workdir
 *
 * Works like join_cgroup(), but the work directory is the one named after
 * getppid().
 *
 * Returns 0 on success and 1 on failure.
 */
int join_parent_cgroup(const char *relative_path)
{
	char abs_path[PATH_MAX + 1];

	format_parent_cgroup_path(abs_path, relative_path);

	return join_cgroup_from_top(abs_path);
}
320
321/**
322 * __cleanup_cgroup_environment() - Delete temporary cgroups
323 *
324 * This is a helper for cleanup_cgroup_environment() that is responsible for
325 * deletion of all temporary cgroups that have been created during the test.
326 */
327static void __cleanup_cgroup_environment(void)
328{
329	char cgroup_workdir[PATH_MAX + 1];
330
331	format_cgroup_path(cgroup_workdir, "");
332	join_cgroup_from_top(CGROUP_MOUNT_PATH);
333	nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT);
334}
335
336/**
337 * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment
338 *
339 * This is an idempotent function to delete all temporary cgroups that
340 * have been created during the test and unmount the cgroup testing work
341 * directory.
342 *
343 * At call time, it moves the calling process to the root cgroup, and then
344 * runs the deletion process. It is idempotent, and should not fail, unless
345 * a process is lingering.
346 *
347 * On failure, it will print an error to stderr, and try to continue.
348 */
349void cleanup_cgroup_environment(void)
350{
351	__cleanup_cgroup_environment();
352	if (cgroup_workdir_mounted && umount(CGROUP_MOUNT_PATH))
353		log_err("umount cgroup2");
354	cgroup_workdir_mounted = false;
355}
356
357/**
358 * get_root_cgroup() - Get the FD of the root cgroup
359 *
360 * On success, it returns the file descriptor. On failure, it returns -1.
361 * If there is a failure, it prints the error to stderr.
362 */
363int get_root_cgroup(void)
364{
365	int fd;
366
367	fd = open(CGROUP_MOUNT_PATH, O_RDONLY);
368	if (fd < 0) {
369		log_err("Opening root cgroup");
370		return -1;
371	}
372	return fd;
373}
374
/**
 * remove_cgroup() - Delete a test cgroup
 * @relative_path: Path of the cgroup to remove, relative to the workdir
 *
 * The cgroup must exist below this process' cgroup work directory and is
 * expected to have neither children nor live processes.
 *
 * Nothing is returned; a failure is only reported on stderr.
 */
void remove_cgroup(const char *relative_path)
{
	char abs_path[PATH_MAX + 1];
	int err;

	format_cgroup_path(abs_path, relative_path);

	err = rmdir(abs_path);
	if (err)
		log_err("rmdiring cgroup %s .. %s", relative_path, abs_path);
}
393
/**
 * create_and_get_cgroup() - Create a test cgroup and open it
 * @relative_path: Path of the cgroup to create, relative to the workdir
 *
 * Creates the cgroup below this process' cgroup work directory. An already
 * existing cgroup is not an error, which makes the call idempotent.
 *
 * Returns a read-only file descriptor for the cgroup directory on success.
 * On failure the error is printed to stderr and -1 is returned.
 */
int create_and_get_cgroup(const char *relative_path)
{
	char abs_path[PATH_MAX + 1];
	int cg_fd;

	format_cgroup_path(abs_path, relative_path);

	if (mkdir(abs_path, 0777) != 0 && errno != EEXIST) {
		log_err("mkdiring cgroup %s .. %s", relative_path, abs_path);
		return -1;
	}

	cg_fd = open(abs_path, O_RDONLY);
	if (cg_fd < 0) {
		log_err("Opening Cgroup");
		return -1;
	}

	return cg_fd;
}
423
424/**
425 * get_cgroup_id_from_path - Get cgroup id for a particular cgroup path
426 * @cgroup_workdir: The absolute cgroup path
427 *
428 * On success, it returns the cgroup id. On failure it returns 0,
429 * which is an invalid cgroup id.
430 * If there is a failure, it prints the error to stderr.
431 */
432unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir)
433{
434	int dirfd, err, flags, mount_id, fhsize;
435	union {
436		unsigned long long cgid;
437		unsigned char raw_bytes[8];
438	} id;
439	struct file_handle *fhp, *fhp2;
440	unsigned long long ret = 0;
441
442	dirfd = AT_FDCWD;
443	flags = 0;
444	fhsize = sizeof(*fhp);
445	fhp = calloc(1, fhsize);
446	if (!fhp) {
447		log_err("calloc");
448		return 0;
449	}
450	err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags);
451	if (err >= 0 || fhp->handle_bytes != 8) {
452		log_err("name_to_handle_at");
453		goto free_mem;
454	}
455
456	fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
457	fhp2 = realloc(fhp, fhsize);
458	if (!fhp2) {
459		log_err("realloc");
460		goto free_mem;
461	}
462	err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags);
463	fhp = fhp2;
464	if (err < 0) {
465		log_err("name_to_handle_at");
466		goto free_mem;
467	}
468
469	memcpy(id.raw_bytes, fhp->f_handle, 8);
470	ret = id.cgid;
471
472free_mem:
473	free(fhp);
474	return ret;
475}
476
/**
 * get_cgroup_id() - Look up the cgroup id of a test cgroup
 * @relative_path: Path of the cgroup, relative to the workdir
 *
 * Returns the cgroup id on success and 0 on failure.
 */
unsigned long long get_cgroup_id(const char *relative_path)
{
	char abs_path[PATH_MAX + 1];

	format_cgroup_path(abs_path, relative_path);

	return get_cgroup_id_from_path(abs_path);
}
484
/**
 * cgroup_setup_and_join() - Set up the environment, create a cgroup, join it
 * @path: Path of the cgroup to create and join, relative to the workdir
 *
 * Convenience wrapper around setup_cgroup_environment(),
 * create_and_get_cgroup() and join_cgroup(). If a step after the setup
 * fails, the environment is cleaned up again before returning.
 *
 * Returns the file descriptor of the new cgroup on success; the caller owns
 * it and must close() it. On failure a message is printed to stderr and a
 * negative value is returned.
 */
int cgroup_setup_and_join(const char *path)
{
	int cg_fd;

	if (setup_cgroup_environment()) {
		fprintf(stderr, "Failed to setup cgroup environment\n");
		return -EINVAL;
	}

	cg_fd = create_and_get_cgroup(path);
	if (cg_fd < 0) {
		fprintf(stderr, "Failed to create test cgroup\n");
		cleanup_cgroup_environment();
		return cg_fd;
	}

	if (join_cgroup(path)) {
		fprintf(stderr, "Failed to join cgroup\n");
		/* The descriptor is not handed to the caller: do not leak it */
		close(cg_fd);
		cleanup_cgroup_environment();
		return -EINVAL;
	}

	return cg_fd;
}
507
508/**
509 * setup_classid_environment() - Setup the cgroupv1 net_cls environment
510 *
511 * After calling this function, cleanup_classid_environment should be called
512 * once testing is complete.
513 *
514 * This function will print an error to stderr and return 1 if it is unable
515 * to setup the cgroup environment. If setup is successful, 0 is returned.
516 */
517int setup_classid_environment(void)
518{
519	char cgroup_workdir[PATH_MAX + 1];
520
521	format_classid_path(cgroup_workdir);
522
523	if (mount("tmpfs", CGROUP_MOUNT_DFLT, "tmpfs", 0, NULL) &&
524	    errno != EBUSY) {
525		log_err("mount cgroup base");
526		return 1;
527	}
528
529	if (mkdir(NETCLS_MOUNT_PATH, 0777) && errno != EEXIST) {
530		log_err("mkdir cgroup net_cls");
531		return 1;
532	}
533
534	if (mount("net_cls", NETCLS_MOUNT_PATH, "cgroup", 0, "net_cls")) {
535		if (errno != EBUSY) {
536			log_err("mount cgroup net_cls");
537			return 1;
538		}
539
540		if (rmdir(NETCLS_MOUNT_PATH)) {
541			log_err("rmdir cgroup net_cls");
542			return 1;
543		}
544		if (umount(CGROUP_MOUNT_DFLT)) {
545			log_err("umount cgroup base");
546			return 1;
547		}
548	}
549
550	cleanup_classid_environment();
551
552	if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) {
553		log_err("mkdir cgroup work dir");
554		return 1;
555	}
556
557	return 0;
558}
559
/**
 * set_classid() - Set the net_cls classid of the cgroupv1 work directory
 *
 * Writes the calling process' pid to the work directory's net_cls.classid
 * file, so that sockets can be tagged with it later on. Using the pid as
 * the classid keeps the value unique per process.
 *
 * Returns 0 on success. On failure the error is printed to stderr and 1 is
 * returned.
 */
int set_classid(void)
{
	char workdir[PATH_MAX - 42];
	char classid_file[PATH_MAX + 1];
	int fd, written;

	format_classid_path(workdir);
	snprintf(classid_file, sizeof(classid_file), "%s/net_cls.classid",
		 workdir);

	fd = open(classid_file, O_WRONLY);
	if (fd < 0) {
		log_err("Opening cgroup classid: %s", classid_file);
		return 1;
	}

	written = dprintf(fd, "%u\n", getpid());
	if (written < 0)
		log_err("Setting cgroup classid");

	close(fd);
	return written < 0 ? 1 : 0;
}
595
/**
 * join_classid() - Move the calling process into the net_cls work directory
 *
 * The cgroupv1 work directory must already exist. Joining it causes the
 * process' sockets to be tagged with the net_cls classid set on it.
 *
 * Returns 0 on success and 1 on failure.
 */
int join_classid(void)
{
	char workdir[PATH_MAX + 1];

	format_classid_path(workdir);

	return join_cgroup_from_top(workdir);
}
612
613/**
614 * cleanup_classid_environment() - Cleanup the cgroupv1 net_cls environment
615 *
616 * At call time, it moves the calling process to the root cgroup, and then
617 * runs the deletion process.
618 *
619 * On failure, it will print an error to stderr, and try to continue.
620 */
621void cleanup_classid_environment(void)
622{
623	char cgroup_workdir[PATH_MAX + 1];
624
625	format_classid_path(cgroup_workdir);
626	join_cgroup_from_top(NETCLS_MOUNT_PATH);
627	nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT);
628}
629
/**
 * get_classid_cgroup_id - Look up the cgroup id of the net_cls work directory
 *
 * Returns the cgroup id on success and 0 on failure.
 */
unsigned long long get_classid_cgroup_id(void)
{
	char workdir[PATH_MAX + 1];

	format_classid_path(workdir);

	return get_cgroup_id_from_path(workdir);
}
640
/**
 * get_cgroup1_hierarchy_id - Retrieves the ID of a cgroup1 hierarchy from the
 *                            cgroup1 subsys name.
 * @subsys_name: The cgroup1 subsys name, which can be retrieved from
 * /proc/self/cgroup. It can be a named cgroup like "name=systemd", a
 * controller name like "net_cls", or multi-controllers like
 * "net_cls,net_prio".
 *
 * A hierarchy matches when @subsys_name equals its whole controller list or
 * any single controller in that list.
 *
 * Returns the hierarchy ID of the first matching line, or -1 if
 * @subsys_name is NULL, /proc/self/cgroup cannot be opened, or nothing
 * matches.
 */
int get_cgroup1_hierarchy_id(const char *subsys_name)
{
	bool found = false;
	char line[1024];
	FILE *file;
	int id = -1;

	if (!subsys_name)
		return -1;

	file = fopen("/proc/self/cgroup", "r");
	if (!file) {
		log_err("fopen /proc/self/cgroup");
		return -1;
	}

	while (!found && fgets(line, sizeof(line), file)) {
		char *subsys, *end, *name, *save;

		/*
		 * Each line reads "hierarchy-ID:controller-list:cgroup-path".
		 * The fields are split by hand because strtok_r() would skip
		 * an empty controller list and hand back the path instead.
		 */
		subsys = strchr(line, ':');
		if (!subsys)
			continue;
		*subsys++ = '\0';

		end = strchr(subsys, ':');
		if (!end)
			continue;
		*end = '\0';

		/* An empty controller list is not a cgroup1 hierarchy */
		if (!*subsys)
			continue;

		id = strtol(line, NULL, 10);

		if (!strcmp(subsys, subsys_name)) {
			found = true;
			break;
		}

		/* Multiple subsystems may share one single mount point */
		for (name = strtok_r(subsys, ",", &save); name;
		     name = strtok_r(NULL, ",", &save)) {
			if (!strcmp(name, subsys_name)) {
				found = true;
				break;
			}
		}
	}

	fclose(file);
	return found ? id : -1;
}
692
/**
 * open_classid() - Open the cgroupv1 net_cls work directory
 *
 * The work directory must already exist.
 *
 * Returns a read-only file descriptor on success and -1 on failure.
 */
int open_classid(void)
{
	char workdir[PATH_MAX + 1];
	int fd;

	format_classid_path(workdir);
	fd = open(workdir, O_RDONLY);

	return fd;
}
708