1#include <signal.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <unistd.h>
5#include <errno.h>
6#include <fcntl.h>
7#include <string.h>
8#include <stddef.h>
9#include <sys/sysmacros.h>
10#include <sys/types.h>
11#include <sys/wait.h>
12#include <sys/socket.h>
13#include <sys/stat.h>
14#include <sys/mman.h>
15#include <sys/syscall.h>
16#include <sys/user.h>
17#include <sys/ioctl.h>
18#include <sys/ptrace.h>
19#include <sys/mount.h>
20#include <linux/limits.h>
21#include <linux/filter.h>
22#include <linux/seccomp.h>
23
/* Number of elements in a true array (must not be used on a pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
25
/*
 * Invoke the raw seccomp system call.
 *
 * errno is cleared before the call, and the syscall() result is returned
 * narrowed to int.
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return (int)rc;
}
31
/*
 * Pass the file descriptor @fd to the peer of @sock as an SCM_RIGHTS
 * control message, accompanied by a single data byte ('c').
 *
 * Returns 0 on success, or -1 (after reporting via perror) if sendmsg()
 * fails.
 */
static int send_fd(int sock, int fd)
{
	/*
	 * The union forces the control buffer to be suitably aligned for a
	 * struct cmsghdr; a bare char array carries no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = {0};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* Copy rather than store through a cast pointer into the payload. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}
62
/*
 * Receive a file descriptor sent over @sock as an SCM_RIGHTS control
 * message.
 *
 * Returns the received descriptor, or -1 if recvmsg() fails or the message
 * does not carry exactly one SCM_RIGHTS descriptor (for example because the
 * peer closed the socket or sent plain data only).
 */
static int recv_fd(int sock)
{
	/*
	 * The union forces the control buffer to be suitably aligned for a
	 * struct cmsghdr; a bare char array carries no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = {0};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	int fd;

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no control data arrived, so it
	 * must be checked before the payload is touched.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg ||
	    cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	/* Copy rather than load through a cast pointer into the payload. */
	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));

	return fd;
}
89
/*
 * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for system
 * call number @nr and SECCOMP_RET_ALLOW for everything else.
 *
 * @flags is passed straight through to seccomp(SECCOMP_SET_MODE_FILTER),
 * whose return value is returned unchanged.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter insns[] = {
		/* Load the syscall number into the accumulator. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Match: fall through to the notify; otherwise skip it. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog program;

	program.len = (unsigned short)(sizeof(insns) / sizeof(insns[0]));
	program.filter = insns;

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &program);
}
107
/*
 * Read a path string from the open /proc/<pid>/mem descriptor @mem at
 * address @addr into @buf, reading at most @len bytes.
 *
 * Returns 0 if the bytes read contain a NUL terminator, 1 if they do not
 * (the buffer must then not be used as a C string), or -1 if lseek() or
 * read() fails.
 */
static int read_tracee_path(int mem, unsigned long long addr, char *buf,
			    size_t len)
{
	ssize_t n;

	if (lseek(mem, (off_t)addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, len);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/* Only the first n bytes were filled in; look for the NUL there. */
	return memchr(buf, '\0', (size_t)n) ? 0 : 1;
}

/*
 * Return nonzero if @path starts with "/tmp/" and contains no ".."
 * component. This is a purely textual check: symlinks are not resolved.
 */
static int path_within_tmp(const char *path)
{
	const char *p;

	if (strncmp(path, "/tmp/", 5) != 0)
		return 0;

	for (p = strstr(path, ".."); p; p = strstr(p + 1, "..")) {
		/* p > path here because the string starts with "/tmp/". */
		if (p[-1] == '/' && (p[2] == '/' || p[2] == '\0'))
			return 0;
	}

	return 1;
}

/*
 * Handle one seccomp notification for a trapped mount call.
 *
 * @req:      notification received from the listener
 * @resp:     response to fill in; defaults to error -EPERM, val 0
 * @listener: seccomp notification fd, used to confirm @req is still valid
 *
 * Bind mounts whose source and target both pass path_within_tmp() are
 * performed on the caller's behalf and resp->error is set to 0.
 *
 * Returns 0 when @resp is ready to be sent (allowed or denied), or -1 on a
 * failure that prevents a response from being produced.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem, src_state, dst_state;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to the task by its pid, but
	 * since the task that made the syscall may have died, we need to
	 * confirm that the pid is still valid after we open its
	 * /proc/pid/mem file. We can ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer
	 * args before we decide to allow the syscall.
	 */
	src_state = read_tracee_path(mem, req->data.args[0], source,
				     sizeof(source));
	if (src_state < 0)
		goto out;

	dst_state = read_tracee_path(mem, req->data.args[1], target,
				     sizeof(target));
	if (dst_state < 0)
		goto out;

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't
	 * very interesting, because we could do unprivileged bind mounts
	 * with user namespaces already, but you get the idea.
	 *
	 * Unterminated strings are denied outright so that neither the
	 * policy check nor mount() ever reads past the bytes we fetched.
	 */
	if (src_state == 0 && dst_state == 0 &&
	    path_within_tmp(source) && path_within_tmp(target)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
204
/*
 * Sample driver for seccomp user notification.
 *
 * Forks a worker that installs a filter trapping mount, drops to uid 1000,
 * hands the listener fd to the parent and then attempts two mounts. The
 * parent forks a tracer that services the notifications with handle_req(),
 * waits for the worker, and cleans up /tmp/foo.
 *
 * Returns 0 if the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		/* The worker only sends; it has no use for the other end. */
		close(sk_pair[0]);

		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * The parent only receives. Drop our copy of the sending end so that
	 * the receive below sees end-of-file, rather than blocking forever,
	 * if the worker dies before sending the listener.
	 */
	close(sk_pair[1]);
	sk_pair[1] = -1;

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		close(listener);
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped, so its pid may be reused by an
	 * unrelated process. Make sure the cleanup path never signals it.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through cleanup so the tracer is not left running. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	if (tracer > 0) {
		kill(tracer, SIGKILL);
		waitpid(tracer, NULL, 0);
	}
	if (worker > 0) {
		kill(worker, SIGKILL);
		waitpid(worker, NULL, 0);
	}

close_pair:
	close(sk_pair[0]);
	if (sk_pair[1] >= 0)
		close(sk_pair[1]);
	return ret;
}
380