1/*
2 * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16/*
17 * Fork and exec tiny 1 page executable which precisely controls its VM.
18 * Test /proc/$PID/maps
19 * Test /proc/$PID/smaps
20 * Test /proc/$PID/smaps_rollup
21 * Test /proc/$PID/statm
22 *
23 * FIXME require CONFIG_TMPFS which can be disabled
24 * FIXME test other values from "smaps"
25 * FIXME support other archs
26 */
27#undef NDEBUG
28#include <assert.h>
29#include <errno.h>
30#include <sched.h>
31#include <signal.h>
32#include <stdbool.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <string.h>
36#include <stdlib.h>
37#include <sys/mount.h>
38#include <sys/types.h>
39#include <sys/stat.h>
40#include <sys/wait.h>
41#include <fcntl.h>
42#include <unistd.h>
43#include <sys/syscall.h>
44#include <sys/uio.h>
45#include <linux/kdev_t.h>
46#include <sys/time.h>
47#include <sys/resource.h>
48
49#include "../kselftest.h"
50
/*
 * Issue the execveat system call through syscall() and return its raw
 * result; all five arguments are passed through untouched.
 */
static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
{
	const long rv = syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);

	return rv;
}
55
56static void make_private_tmp(void)
57{
58	if (unshare(CLONE_NEWNS) == -1) {
59		if (errno == ENOSYS || errno == EPERM) {
60			exit(4);
61		}
62		exit(1);
63	}
64	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
65		exit(1);
66	}
67	if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) {
68		exit(1);
69	}
70}
71
/* PID of the forked test child; -1 until main() stores fork()'s result. */
static pid_t pid = -1;

/* atexit() hook: send SIGTERM to the child, if a child PID was recorded. */
static void ate(void)
{
	if (pid <= 0) {
		return;
	}
	kill(pid, SIGTERM);
}
79
/*
 * ELF64 file header, in on-disk field order.  make_exe() fills one in and
 * writes it verbatim as the first bytes of the test executable.
 */
struct elf64_hdr {
	uint8_t  e_ident[16];			/* magic and identification bytes */
	uint16_t e_type, e_machine;
	uint32_t e_version;
	uint64_t e_entry, e_phoff, e_shoff;	/* entry point, header table offsets */
	uint32_t e_flags;
	uint16_t e_ehsize, e_phentsize, e_phnum;
	uint16_t e_shentsize, e_shnum, e_shstrndx;
};
96
/*
 * ELF64 program header, in on-disk field order.  make_exe() writes exactly
 * one of these right after the file header.
 */
struct elf64_phdr {
	uint32_t p_type, p_flags;
	uint64_t p_offset;		/* segment start within the file */
	uint64_t p_vaddr, p_paddr;	/* virtual / physical address */
	uint64_t p_filesz, p_memsz;	/* size in the file / in memory */
	uint64_t p_align;
};
107
108#ifdef __x86_64__
/* Fixed layout of the test executable and of the expected maps line. */
#define PAGE_SIZE 4096
#define VADDR (1UL << 32)	/* virtual address the single segment is loaded at */
#define MAPS_OFFSET 73		/* column where the expected maps line puts the name */

/*
 * Machine-code emitters.  Each one expands to a comma-separated list of
 * byte values intended for a uint8_t array initializer.
 */

/* Byte number n (0 = least significant) of x. */
#define imm_byte(x, n)	(((x) >> (8 * (n))) & 0xff)

/* x as 4, respectively 8, little-endian bytes. */
#define imm_le32(x)	\
	imm_byte(x, 0), imm_byte(x, 1), imm_byte(x, 2), imm_byte(x, 3)
#define imm_le64(x)	\
	imm_le32(x),	\
	imm_byte(x, 4), imm_byte(x, 5), imm_byte(x, 6), imm_byte(x, 7)

/*
 * The SYSCALL instruction.  Being an object-like macro named "syscall", it
 * also replaces every later use of that identifier in this file.
 */
#define syscall		0x0f, 0x05

/* mov rdi, imm64 / mov rsi, imm64 / mov eax, imm32 */
#define mov_rdi(x)	0x48, 0xbf, imm_le64(x)
#define mov_rsi(x)	0x48, 0xbe, imm_le64(x)
#define mov_eax(x)	0xb8, imm_le32(x)
126
/*
 * Machine code placed after the ELF headers by make_exe(); e_entry points at
 * its first byte.  It performs three system calls, selected by the number
 * loaded into eax: 11 (munmap), 1 (write) and 34 (pause), then loops on the
 * last one forever.
 */
static const uint8_t payload[] = {
	/* Casually unmap stack, vDSO and everything else. */
	/*
	 * munmap: the range starts one page above VADDR and its length is
	 * chosen so that it ends 4096 bytes below 1 << 47.
	 */
	mov_rdi(VADDR + 4096),
	mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
	mov_eax(11),
	syscall,

	/* Ping parent. */
	/*
	 * write(0, &c, 1);
	 * rsi is loaded with the address of the next instruction, so the byte
	 * sent comes from the code itself; the parent only counts it.
	 */
	0x31, 0xff,					/* xor edi, edi */
	0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00,	/* lea rsi, [rip] */
	0xba, 0x01, 0x00, 0x00, 0x00,			/* mov edx, 1 */
	mov_eax(1),
	syscall,

	/* 1: pause(); */
	mov_eax(34),
	syscall,

	/*
	 * Displacement 0xf7 is -9: back over this jmp (2 bytes), the syscall
	 * (2 bytes) and the mov_eax (5 bytes), i.e. to label 1.
	 */
	0xeb, 0xf7,	/* jmp 1b */
};
149
150static int make_exe(const uint8_t *payload, size_t len)
151{
152	struct elf64_hdr h;
153	struct elf64_phdr ph;
154
155	struct iovec iov[3] = {
156		{&h, sizeof(struct elf64_hdr)},
157		{&ph, sizeof(struct elf64_phdr)},
158		{(void *)payload, len},
159	};
160	int fd, fd1;
161	char buf[64];
162
163	memset(&h, 0, sizeof(h));
164	h.e_ident[0] = 0x7f;
165	h.e_ident[1] = 'E';
166	h.e_ident[2] = 'L';
167	h.e_ident[3] = 'F';
168	h.e_ident[4] = 2;
169	h.e_ident[5] = 1;
170	h.e_ident[6] = 1;
171	h.e_ident[7] = 0;
172	h.e_type = 2;
173	h.e_machine = 0x3e;
174	h.e_version = 1;
175	h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
176	h.e_phoff = sizeof(struct elf64_hdr);
177	h.e_shoff = 0;
178	h.e_flags = 0;
179	h.e_ehsize = sizeof(struct elf64_hdr);
180	h.e_phentsize = sizeof(struct elf64_phdr);
181	h.e_phnum = 1;
182	h.e_shentsize = 0;
183	h.e_shnum = 0;
184	h.e_shstrndx = 0;
185
186	memset(&ph, 0, sizeof(ph));
187	ph.p_type = 1;
188	ph.p_flags = (1<<2)|1;
189	ph.p_offset = 0;
190	ph.p_vaddr = VADDR;
191	ph.p_paddr = 0;
192	ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
193	ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
194	ph.p_align = 4096;
195
196	fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
197	if (fd == -1) {
198		exit(1);
199	}
200
201	if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) {
202		exit(1);
203	}
204
205	/* Avoid ETXTBSY on exec. */
206	snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
207	fd1 = open(buf, O_RDONLY|O_CLOEXEC);
208	close(fd);
209
210	return fd1;
211}
212#endif
213
/*
 * 0: vsyscall VMA doesn't exist	vsyscall=none
 * 1: vsyscall VMA is --xp		vsyscall=xonly
 * 2: vsyscall VMA is r-xp		vsyscall=emulate
 */
/*
 * Detected mode, one of the values above.  The probing child updates it
 * before each access that may fault and its SIGSEGV handler reads it back,
 * hence volatile; the parent then copies the child's exit status into it.
 */
static volatile int g_vsyscall;
/* Expected [vsyscall] line for the detected mode; main() picks one of the strings below. */
static const char *str_vsyscall;

/* Mode 0: no vsyscall line at all. */
static const char str_vsyscall_0[] = "";
/* Mode 1: execute-only mapping. */
static const char str_vsyscall_1[] =
"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0                  [vsyscall]\n";
/* Mode 2: readable and executable mapping. */
static const char str_vsyscall_2[] =
"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]\n";
227
228#ifdef __x86_64__
229static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___)
230{
231	_exit(g_vsyscall);
232}
233
234/*
235 * vsyscall page can't be unmapped, probe it directly.
236 */
237static void vsyscall(void)
238{
239	pid_t pid;
240	int wstatus;
241
242	pid = fork();
243	if (pid < 0) {
244		fprintf(stderr, "fork, errno %d\n", errno);
245		exit(1);
246	}
247	if (pid == 0) {
248		struct rlimit rlim = {0, 0};
249		(void)setrlimit(RLIMIT_CORE, &rlim);
250
251		/* Hide "segfault at ffffffffff600000" messages. */
252		struct sigaction act;
253		memset(&act, 0, sizeof(struct sigaction));
254		act.sa_flags = SA_SIGINFO;
255		act.sa_sigaction = sigaction_SIGSEGV;
256		(void)sigaction(SIGSEGV, &act, NULL);
257
258		g_vsyscall = 0;
259		/* gettimeofday(NULL, NULL); */
260		uint64_t rax = 0xffffffffff600000;
261		asm volatile (
262			"call *%[rax]"
263			: [rax] "+a" (rax)
264			: "D" (NULL), "S" (NULL)
265			: "rcx", "r11"
266		);
267
268		g_vsyscall = 1;
269		*(volatile int *)0xffffffffff600000UL;
270
271		g_vsyscall = 2;
272		exit(g_vsyscall);
273	}
274	waitpid(pid, &wstatus, 0);
275	if (WIFEXITED(wstatus)) {
276		g_vsyscall = WEXITSTATUS(wstatus);
277	} else {
278		fprintf(stderr, "error: wstatus %08x\n", wstatus);
279		exit(1);
280	}
281}
282
283int main(void)
284{
285	int pipefd[2];
286	int exec_fd;
287
288	vsyscall();
289	switch (g_vsyscall) {
290	case 0:
291		str_vsyscall = str_vsyscall_0;
292		break;
293	case 1:
294		str_vsyscall = str_vsyscall_1;
295		break;
296	case 2:
297		str_vsyscall = str_vsyscall_2;
298		break;
299	default:
300		abort();
301	}
302
303	atexit(ate);
304
305	make_private_tmp();
306
307	/* Reserve fd 0 for 1-byte pipe ping from child. */
308	close(0);
309	if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
310		return 1;
311	}
312
313	exec_fd = make_exe(payload, sizeof(payload));
314
315	if (pipe(pipefd) == -1) {
316		return 1;
317	}
318	if (dup2(pipefd[1], 0) != 0) {
319		return 1;
320	}
321
322	pid = fork();
323	if (pid == -1) {
324		return 1;
325	}
326	if (pid == 0) {
327		sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
328		return 1;
329	}
330
331	char _;
332	if (read(pipefd[0], &_, 1) != 1) {
333		return 1;
334	}
335
336	struct stat st;
337	if (fstat(exec_fd, &st) == -1) {
338		return 1;
339	}
340
341	/* Generate "head -n1 /proc/$PID/maps" */
342	char buf0[256];
343	memset(buf0, ' ', sizeof(buf0));
344	int len = snprintf(buf0, sizeof(buf0),
345			"%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
346			VADDR, VADDR + PAGE_SIZE,
347			MAJOR(st.st_dev), MINOR(st.st_dev),
348			(unsigned long long)st.st_ino);
349	buf0[len] = ' ';
350	snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
351		 "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);
352
353	/* Test /proc/$PID/maps */
354	{
355		const size_t len = strlen(buf0) + strlen(str_vsyscall);
356		char buf[256];
357		ssize_t rv;
358		int fd;
359
360		snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
361		fd = open(buf, O_RDONLY);
362		if (fd == -1) {
363			return 1;
364		}
365		rv = read(fd, buf, sizeof(buf));
366		assert(rv == len);
367		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
368		if (g_vsyscall > 0) {
369			assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0);
370		}
371	}
372
373	/* Test /proc/$PID/smaps */
374	{
375		char buf[4096];
376		ssize_t rv;
377		int fd;
378
379		snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
380		fd = open(buf, O_RDONLY);
381		if (fd == -1) {
382			return 1;
383		}
384		rv = read(fd, buf, sizeof(buf));
385		assert(0 <= rv && rv <= sizeof(buf));
386
387		assert(rv >= strlen(buf0));
388		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
389
390#define RSS1 "Rss:                   4 kB\n"
391#define RSS2 "Rss:                   0 kB\n"
392#define PSS1 "Pss:                   4 kB\n"
393#define PSS2 "Pss:                   0 kB\n"
394		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
395		       memmem(buf, rv, RSS2, strlen(RSS2)));
396		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
397		       memmem(buf, rv, PSS2, strlen(PSS2)));
398
399		static const char *S[] = {
400			"Size:                  4 kB\n",
401			"KernelPageSize:        4 kB\n",
402			"MMUPageSize:           4 kB\n",
403			"Anonymous:             0 kB\n",
404			"AnonHugePages:         0 kB\n",
405			"Shared_Hugetlb:        0 kB\n",
406			"Private_Hugetlb:       0 kB\n",
407			"Locked:                0 kB\n",
408		};
409		int i;
410
411		for (i = 0; i < ARRAY_SIZE(S); i++) {
412			assert(memmem(buf, rv, S[i], strlen(S[i])));
413		}
414
415		if (g_vsyscall > 0) {
416			assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall)));
417		}
418	}
419
420	/* Test /proc/$PID/smaps_rollup */
421	{
422		char bufr[256];
423		memset(bufr, ' ', sizeof(bufr));
424		len = snprintf(bufr, sizeof(bufr),
425				"%08lx-%08lx ---p 00000000 00:00 0",
426				VADDR, VADDR + PAGE_SIZE);
427		bufr[len] = ' ';
428		snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
429			 "[rollup]\n");
430
431		char buf[1024];
432		ssize_t rv;
433		int fd;
434
435		snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
436		fd = open(buf, O_RDONLY);
437		if (fd == -1) {
438			return 1;
439		}
440		rv = read(fd, buf, sizeof(buf));
441		assert(0 <= rv && rv <= sizeof(buf));
442
443		assert(rv >= strlen(bufr));
444		assert(memcmp(buf, bufr, strlen(bufr)) == 0);
445
446		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
447		       memmem(buf, rv, RSS2, strlen(RSS2)));
448		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
449		       memmem(buf, rv, PSS2, strlen(PSS2)));
450
451		static const char *S[] = {
452			"Anonymous:             0 kB\n",
453			"AnonHugePages:         0 kB\n",
454			"Shared_Hugetlb:        0 kB\n",
455			"Private_Hugetlb:       0 kB\n",
456			"Locked:                0 kB\n",
457		};
458		int i;
459
460		for (i = 0; i < ARRAY_SIZE(S); i++) {
461			assert(memmem(buf, rv, S[i], strlen(S[i])));
462		}
463	}
464
465	/* Test /proc/$PID/statm */
466	{
467		char buf[64];
468		ssize_t rv;
469		int fd;
470
471		snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
472		fd = open(buf, O_RDONLY);
473		if (fd == -1) {
474			return 1;
475		}
476		rv = read(fd, buf, sizeof(buf));
477		assert(rv == 7 * 2);
478
479		assert(buf[0] == '1');	/* ->total_vm */
480		assert(buf[1] == ' ');
481		assert(buf[2] == '0' || buf[2] == '1');	/* rss */
482		assert(buf[3] == ' ');
483		assert(buf[4] == '0' || buf[2] == '1');	/* file rss */
484		assert(buf[5] == ' ');
485		assert(buf[6] == '1');	/* ELF executable segments */
486		assert(buf[7] == ' ');
487		assert(buf[8] == '0');
488		assert(buf[9] == ' ');
489		assert(buf[10] == '0');	/* ->data_vm + ->stack_vm */
490		assert(buf[11] == ' ');
491		assert(buf[12] == '0');
492		assert(buf[13] == '\n');
493	}
494
495	return 0;
496}
497#else
/*
 * Fallback entry point, compiled when __x86_64__ is not defined: the test
 * itself only exists for x86_64, so nothing is run here.
 */
int main(void)
{
	/*
	 * Same status make_private_tmp() exits with when unshare() is
	 * unavailable; presumably the kselftest "skip" code -- confirm
	 * against kselftest.h.
	 */
	return 4;
}
502#endif
503