1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * syscall_numbering.c - test calling the x86-64 kernel with various
4 * valid and invalid system call numbers.
5 *
6 * Copyright (c) 2018 Andrew Lutomirski
7 */
8
9#define _GNU_SOURCE
10
11#include <stdlib.h>
12#include <stdio.h>
13#include <stdbool.h>
14#include <errno.h>
15#include <unistd.h>
16#include <string.h>
17#include <fcntl.h>
18#include <limits.h>
19#include <signal.h>
20#include <sysexits.h>
21
22#include <sys/ptrace.h>
23#include <sys/user.h>
24#include <sys/wait.h>
25#include <sys/mman.h>
26
27#include <linux/ptrace.h>
28
/*
 * System call numbers used by the tests. None of them includes X32_BIT;
 * callers OR it in when they want the x32 flavour of a call.
 */
enum {
	/* Numbers that are common to both ABIs */
	SYS_READ	=   0,
	SYS_WRITE	=   1,
	SYS_GETPID	=  39,

	/* Numbers that exist for x64 only */
	X64_IOCTL	=  16,
	X64_READV	=  19,
	X64_WRITEV	=  20,

	/* Numbers that exist for x32 only */
	X32_IOCTL	= 514,
	X32_READV	= 515,
	X32_WRITEV	= 516,
};

/* Bit set in the system call number to request an x32 system call */
enum { X32_BIT = 0x40000000 };
43
/* Descriptor for /dev/null, opened by main(); -1 while not open */
static int nullfd = -1;
/* Whether the running kernel accepts x32 system calls; set by test_x32() */
static bool with_x32 = false;
46
/*
 * The passes of the traced test run, in execution order. Each one selects
 * what the tracer does to a system call issued by probe_syscall().
 */
enum ptrace_pass {
	PTP_NOTHING = 0,	/* Stop the tracee, but don't read registers */
	PTP_GETREGS,		/* Read the registers, write nothing back */
	PTP_WRITEBACK,		/* Read the registers, write them back as-is */
	PTP_FUZZRET,		/* Preload rax with MODIFIED_BY_PTRACE */
	PTP_FUZZHIGH,		/* As above, plus set the top half of orig_rax */
	PTP_INTNUM,		/* As above, but sign-extend orig_rax from int */
	PTP_DONE		/* End marker: all passes have been run */
};
56
57static const char * const ptrace_pass_name[] =
58{
59	[PTP_NOTHING]	= "just stop, no data read",
60	[PTP_GETREGS]	= "only getregs",
61	[PTP_WRITEBACK]	= "getregs, unmodified setregs",
62	[PTP_FUZZRET]	= "modifying the default return",
63	[PTP_FUZZHIGH]	= "clobbering the top 32 bits",
64	[PTP_INTNUM]	= "sign-extending the syscall number",
65};
66
67/*
68 * Shared memory block between tracer and test
69 */
70struct shared {
71	unsigned int nerr;	/* Total error count */
72	unsigned int indent;	/* Message indentation level */
73	enum ptrace_pass ptrace_pass;
74	bool probing_syscall;	/* In probe_syscall() */
75};
76static volatile struct shared *sh;
77
78static inline unsigned int offset(void)
79{
80	unsigned int level = sh ? sh->indent : 0;
81
82	return 8 + level * 4;
83}
84
/*
 * Print a message preceded by "[TAG]", left-justified in a column whose
 * width is given by offset(). The tag is stringified, so it is passed as
 * a bare word.
 */
#define msg(tag, format, ...) \
	printf("%-*s" format, offset(), "[" #tag "]", ## __VA_ARGS__)

/* Shorthands for the tags that have no side effects */
#define run(format, ...)	msg(RUN,  format, ## __VA_ARGS__)
#define info(format, ...)	msg(INFO, format, ## __VA_ARGS__)
#define ok(format, ...)		msg(OK,   format, ## __VA_ARGS__)
91
/* Print a [FAIL] message and add one to the shared failure count */
#define fail(format, ...)				\
	do {						\
		msg(FAIL, format, ## __VA_ARGS__);	\
		sh->nerr += 1;				\
	} while (0)
97
/*
 * Report a problem that prevents the test from running at all, then exit
 * with EX_OSERR. The message is printed at indentation level zero.
 *
 * main() invokes this both before the shared block has been mapped
 * (sh == NULL) and when mapping it failed (sh == MAP_FAILED), so sh is
 * only written through when it points at real memory. A MAP_FAILED
 * pointer is reset to NULL so that offset(), which msg() calls, does not
 * dereference it either.
 */
#define crit(fmt, ...)					\
	do {						\
		if (sh == MAP_FAILED)			\
			sh = NULL;			\
		if (sh)					\
			sh->indent = 0;			\
		msg(FAIL, fmt, ## __VA_ARGS__);		\
		msg(SKIP, "Unable to run test\n");	\
		exit(EX_OSERR);				\
	} while (0)
105
/*
 * Sentinel the tracer stores in rax to mark a ptrace-modified return value.
 * Parenthesized so the negative literal stays intact inside any expression.
 */
#define MODIFIED_BY_PTRACE	(-9999)
108
109/*
110 * Directly invokes the given syscall with nullfd as the first argument
111 * and the rest zero. Avoids involving glibc wrappers in case they ever
112 * end up intercepting some system calls for some reason, or modify
113 * the system call number itself.
114 */
115static long long probe_syscall(int msb, int lsb)
116{
117	register long long arg1 asm("rdi") = nullfd;
118	register long long arg2 asm("rsi") = 0;
119	register long long arg3 asm("rdx") = 0;
120	register long long arg4 asm("r10") = 0;
121	register long long arg5 asm("r8")  = 0;
122	register long long arg6 asm("r9")  = 0;
123	long long nr = ((long long)msb << 32) | (unsigned int)lsb;
124	long long ret;
125
126	/*
127	 * We pass in an extra copy of the extended system call number
128	 * in %rbx, so we can examine it from the ptrace handler without
129	 * worrying about it being possibly modified. This is to test
130	 * the validity of struct user regs.orig_rax a.k.a.
131	 * struct pt_regs.orig_ax.
132	 */
133	sh->probing_syscall = true;
134	asm volatile("syscall"
135		     : "=a" (ret)
136		     : "a" (nr), "b" (nr),
137		       "r" (arg1), "r" (arg2), "r" (arg3),
138		       "r" (arg4), "r" (arg5), "r" (arg6)
139		     : "rcx", "r11", "memory", "cc");
140	sh->probing_syscall = false;
141
142	return ret;
143}
144
145static const char *syscall_str(int msb, int start, int end)
146{
147	static char buf[64];
148	const char * const type = (start & X32_BIT) ? "x32" : "x64";
149	int lsb = start;
150
151	/*
152	 * Improve readability by stripping the x32 bit, but round
153	 * toward zero so we don't display -1 as -1073741825.
154	 */
155	if (lsb < 0)
156		lsb |= X32_BIT;
157	else
158		lsb &= ~X32_BIT;
159
160	if (start == end)
161		snprintf(buf, sizeof buf, "%s syscall %d:%d",
162			 type, msb, lsb);
163	else
164		snprintf(buf, sizeof buf, "%s syscalls %d:%d..%d",
165			 type, msb, lsb, lsb + (end-start));
166
167	return buf;
168}
169
170static unsigned int _check_for(int msb, int start, int end, long long expect,
171			       const char *expect_str)
172{
173	unsigned int err = 0;
174
175	sh->indent++;
176	if (start != end)
177		sh->indent++;
178
179	for (int nr = start; nr <= end; nr++) {
180		long long ret = probe_syscall(msb, nr);
181
182		if (ret != expect) {
183			fail("%s returned %lld, but it should have returned %s\n",
184			       syscall_str(msb, nr, nr),
185			       ret, expect_str);
186			err++;
187		}
188	}
189
190	if (start != end)
191		sh->indent--;
192
193	if (err) {
194		if (start != end)
195			fail("%s had %u failure%s\n",
196			     syscall_str(msb, start, end),
197			     err, err == 1 ? "s" : "");
198	} else {
199		ok("%s returned %s as expected\n",
200		   syscall_str(msb, start, end), expect_str);
201	}
202
203	sh->indent--;
204
205	return err;
206}
207
/*
 * Wrapper around _check_for() that also passes the expected value as it
 * was spelled in the source, for use in messages.
 */
#define check_for(msb, start, end, expect) \
	_check_for((msb), (start), (end), (expect), #expect)
210
/*
 * Check that a single system call returns 0.
 * Returns true if the check FAILED, false if the call behaved as expected.
 */
static bool check_zero(int msb, int nr)
{
	const unsigned int failures = check_for(msb, nr, nr, 0);

	return failures != 0;
}
215
/*
 * Check that a single system call returns -ENOSYS.
 * Returns true if the check FAILED, false if the call behaved as expected.
 */
static bool check_enosys(int msb, int nr)
{
	const unsigned int failures = check_for(msb, nr, nr, -ENOSYS);

	return failures != 0;
}
220
221/*
222 * Anyone diagnosing a failure will want to know whether the kernel
223 * supports x32. Tell them. This can also be used to conditionalize
224 * tests based on existence or nonexistence of x32.
225 */
226static bool test_x32(void)
227{
228	long long ret;
229	pid_t mypid = getpid();
230
231	run("Checking for x32 by calling x32 getpid()\n");
232	ret = probe_syscall(0, SYS_GETPID | X32_BIT);
233
234	sh->indent++;
235	if (ret == mypid) {
236		info("x32 is supported\n");
237		with_x32 = true;
238	} else if (ret == -ENOSYS) {
239		info("x32 is not supported\n");
240		with_x32 = false;
241	} else {
242		fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, (long long)mypid);
243		with_x32 = false;
244	}
245	sh->indent--;
246	return with_x32;
247}
248
249static void test_syscalls_common(int msb)
250{
251	enum ptrace_pass pass = sh->ptrace_pass;
252
253	run("Checking some common syscalls as 64 bit\n");
254	check_zero(msb, SYS_READ);
255	check_zero(msb, SYS_WRITE);
256
257	run("Checking some 64-bit only syscalls as 64 bit\n");
258	check_zero(msb, X64_READV);
259	check_zero(msb, X64_WRITEV);
260
261	run("Checking out of range system calls\n");
262	check_for(msb, -64, -2, -ENOSYS);
263	if (pass >= PTP_FUZZRET)
264		check_for(msb, -1, -1, MODIFIED_BY_PTRACE);
265	else
266		check_for(msb, -1, -1, -ENOSYS);
267	check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS);
268	check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS);
269	check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS);
270}
271
272static void test_syscalls_with_x32(int msb)
273{
274	/*
275	 * Syscalls 512-547 are "x32" syscalls.  They are
276	 * intended to be called with the x32 (0x40000000) bit
277	 * set.  Calling them without the x32 bit set is
278	 * nonsense and should not work.
279	 */
280	run("Checking x32 syscalls as 64 bit\n");
281	check_for(msb, 512, 547, -ENOSYS);
282
283	run("Checking some common syscalls as x32\n");
284	check_zero(msb, SYS_READ   | X32_BIT);
285	check_zero(msb, SYS_WRITE  | X32_BIT);
286
287	run("Checking some x32 syscalls as x32\n");
288	check_zero(msb, X32_READV  | X32_BIT);
289	check_zero(msb, X32_WRITEV | X32_BIT);
290
291	run("Checking some 64-bit syscalls as x32\n");
292	check_enosys(msb, X64_IOCTL  | X32_BIT);
293	check_enosys(msb, X64_READV  | X32_BIT);
294	check_enosys(msb, X64_WRITEV | X32_BIT);
295}
296
297static void test_syscalls_without_x32(int msb)
298{
299	run("Checking for absence of x32 system calls\n");
300	check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS);
301}
302
303static void test_syscall_numbering(void)
304{
305	static const int msbs[] = {
306		0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX,
307		INT_MIN, INT_MIN+1
308	};
309
310	sh->indent++;
311
312	/*
313	 * The MSB is supposed to be ignored, so we loop over a few
314	 * to test that out.
315	 */
316	for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) {
317		int msb = msbs[i];
318		run("Checking system calls with msb = %d (0x%x)\n",
319		    msb, msb);
320
321		sh->indent++;
322
323		test_syscalls_common(msb);
324		if (with_x32)
325			test_syscalls_with_x32(msb);
326		else
327			test_syscalls_without_x32(msb);
328
329		sh->indent--;
330	}
331
332	sh->indent--;
333}
334
335static void syscall_numbering_tracee(void)
336{
337	enum ptrace_pass pass;
338
339	if (ptrace(PTRACE_TRACEME, 0, 0, 0)) {
340		crit("Failed to request tracing\n");
341		return;
342	}
343	raise(SIGSTOP);
344
345	for (sh->ptrace_pass = pass = PTP_NOTHING; pass < PTP_DONE;
346	     sh->ptrace_pass = ++pass) {
347		run("Running tests under ptrace: %s\n", ptrace_pass_name[pass]);
348		test_syscall_numbering();
349	}
350}
351
352static void mess_with_syscall(pid_t testpid, enum ptrace_pass pass)
353{
354	struct user_regs_struct regs;
355
356	sh->probing_syscall = false; /* Do this on entry only */
357
358	/* For these, don't even getregs */
359	if (pass == PTP_NOTHING || pass == PTP_DONE)
360		return;
361
362	ptrace(PTRACE_GETREGS, testpid, NULL, &regs);
363
364	if (regs.orig_rax != regs.rbx) {
365		fail("orig_rax %#llx doesn't match syscall number %#llx\n",
366		     (unsigned long long)regs.orig_rax,
367		     (unsigned long long)regs.rbx);
368	}
369
370	switch (pass) {
371	case PTP_GETREGS:
372		/* Just read, no writeback */
373		return;
374	case PTP_WRITEBACK:
375		/* Write back the same register state verbatim */
376		break;
377	case PTP_FUZZRET:
378		regs.rax = MODIFIED_BY_PTRACE;
379		break;
380	case PTP_FUZZHIGH:
381		regs.rax = MODIFIED_BY_PTRACE;
382		regs.orig_rax = regs.orig_rax | 0xffffffff00000000ULL;
383		break;
384	case PTP_INTNUM:
385		regs.rax = MODIFIED_BY_PTRACE;
386		regs.orig_rax = (int)regs.orig_rax;
387		break;
388	default:
389		crit("invalid ptrace_pass\n");
390		break;
391	}
392
393	ptrace(PTRACE_SETREGS, testpid, NULL, &regs);
394}
395
396static void syscall_numbering_tracer(pid_t testpid)
397{
398	int wstatus;
399
400	do {
401		pid_t wpid = waitpid(testpid, &wstatus, 0);
402		if (wpid < 0 && errno != EINTR)
403			break;
404		if (wpid != testpid)
405			continue;
406		if (!WIFSTOPPED(wstatus))
407			break;	/* Thread exited? */
408
409		if (sh->probing_syscall && WSTOPSIG(wstatus) == SIGTRAP)
410			mess_with_syscall(testpid, sh->ptrace_pass);
411	} while (sh->ptrace_pass != PTP_DONE &&
412		 !ptrace(PTRACE_SYSCALL, testpid, NULL, NULL));
413
414	ptrace(PTRACE_DETACH, testpid, NULL, NULL);
415
416	/* Wait for the child process to terminate */
417	while (waitpid(testpid, &wstatus, 0) != testpid || !WIFEXITED(wstatus))
418		/* wait some more */;
419}
420
/*
 * Fork a child that runs the tests under ptrace. The child never returns
 * from here; the calling process carries on as the tracer and returns once
 * syscall_numbering_tracer() is done.
 */
static void test_traced_syscall_numbering(void)
{
	const pid_t child = fork();

	if (child == 0) {
		/* Child: run the tests, then leave without returning */
		syscall_numbering_tracee();
		_exit(0);
	}

	if (child < 0) {
		crit("Unable to launch tracer process\n");
		return;
	}

	/* Parent: trace the child we just created */
	syscall_numbering_tracer(child);
}
437
438int main(void)
439{
440	unsigned int nerr;
441
442	/*
443	 * It is quite likely to get a segfault on a failure, so make
444	 * sure the message gets out by setting stdout to nonbuffered.
445	 */
446	setvbuf(stdout, NULL, _IONBF, 0);
447
448	/*
449	 * Harmless file descriptor to work on...
450	 */
451	nullfd = open("/dev/null", O_RDWR);
452	if (nullfd < 0) {
453		crit("Unable to open /dev/null: %s\n", strerror(errno));
454	}
455
456	/*
457	 * Set up a block of shared memory...
458	 */
459	sh = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ|PROT_WRITE,
460		  MAP_ANONYMOUS|MAP_SHARED, 0, 0);
461	if (sh == MAP_FAILED) {
462		crit("Unable to allocated shared memory block: %s\n",
463		     strerror(errno));
464	}
465
466	with_x32 = test_x32();
467
468	run("Running tests without ptrace...\n");
469	test_syscall_numbering();
470
471	test_traced_syscall_numbering();
472
473	nerr = sh->nerr;
474	if (!nerr) {
475		ok("All system calls succeeded or failed as expected\n");
476		return 0;
477	} else {
478		fail("A total of %u system call%s had incorrect behavior\n",
479		     nerr, nerr != 1 ? "s" : "");
480		return 1;
481	}
482}
483