1#define _GNU_SOURCE
2#include <ctype.h>
3#include <errno.h>
4#include <fcntl.h>
5#include <limits.h>
6#include <dirent.h>
7#include <signal.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <stdbool.h>
11#include <string.h>
12#include <unistd.h>
13
14#include <linux/mman.h>
15#include <sys/mman.h>
16#include <sys/wait.h>
17#include <sys/types.h>
18#include <sys/stat.h>
19#include <sys/sysmacros.h>
20#include <sys/vfs.h>
21
22#include "linux/magic.h"
23
24#include "vm_util.h"
25#include "thp_settings.h"
26
/* Fixed virtual address (1 GiB) at which every test mapping is requested. */
#define BASE_ADDR ((void *)(1UL << 30))
/* PMD-sized huge page size in bytes; set in main() from read_pmd_pagesize(). */
static unsigned long hpage_pmd_size;
/* Base page size in bytes; set in main() from getpagesize(). */
static unsigned long page_size;
/* Number of base pages per PMD-sized huge page (hpage_pmd_size / page_size). */
static int hpage_pmd_nr;
/* Page order given with the -s option; stays 0 when the option is absent. */
static int anon_order;

/* smaps file of the running process, parsed by check_swap(). */
#define PID_SMAPS "/proc/self/smaps"
/* Name of the scratch file created inside the user-supplied directory. */
#define TEST_FILE "collapse_test_file"

/* Size of the line and pattern buffers used by check_swap(). */
#define MAX_LINE_LENGTH 500
37
/* Kind of memory backing a test mapping. */
enum vma_type {
	VMA_ANON,
	VMA_FILE,	/* file in a directory that is not on tmpfs */
	VMA_SHMEM,	/* file in a directory that is on tmpfs */
};
43
/*
 * Per-memory-type operations used by the tests.  One instance exists for
 * each of anonymous, file-backed and shmem memory.
 */
struct mem_ops {
	/* Create a mapping spanning nr_hpages PMD-sized huge pages. */
	void *(*setup_area)(int nr_hpages);
	/* Tear down a mapping of size bytes created by setup_area(). */
	void (*cleanup_area)(void *p, unsigned long size);
	/* Populate the byte range [start, end) relative to p. */
	void (*fault)(void *p, unsigned long start, unsigned long end);
	/* Check whether exactly nr_hpages huge pages are mapped at addr. */
	bool (*check_huge)(void *addr, int nr_hpages);
	/* Short label printed in the "Run test" banner. */
	const char *name;
};
51
/*
 * Memory types selected on the command line by parse_test_type().  A NULL
 * pointer means the type was not selected, and TEST() skips it.
 */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
55
/* Describes one way of triggering a collapse (khugepaged or MADV_COLLAPSE). */
struct collapse_context {
	/*
	 * Attempt to collapse nr_hpages huge pages at p and report success
	 * or failure against the expected outcome @expect.
	 */
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	/*
	 * Tests expect a collapse to fail when a max_ptes_* limit is
	 * exceeded only if this is true.
	 */
	bool enforce_pte_scan_limits;
	/* Short label printed in the "Run test" banner. */
	const char *name;
};
62
/*
 * Collapse contexts selected on the command line by parse_test_type().  A
 * NULL pointer means the context was not selected, and TEST() skips it.
 */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
65
/* State describing the file (or memfd) backing the current test mapping. */
struct file_info {
	/* Directory given on the command line. */
	const char *dir;
	/* "<dir>/" TEST_FILE, built by get_finfo(). */
	char path[PATH_MAX];
	/* VMA_SHMEM when dir is on tmpfs, VMA_FILE otherwise. */
	enum vma_type type;
	/* Descriptor opened by file_setup_area() or shmem_setup_area(). */
	int fd;
	/* sysfs read_ahead_kb path of the owning block device (VMA_FILE only). */
	char dev_queue_read_ahead_path[PATH_MAX];
};
73
/* Backing-file state shared by the file and shmem operations. */
static struct file_info finfo;
/* When true, restore_settings_atexit() does nothing (set in forked children). */
static bool skip_settings_restore;
/* Number of failed checks so far; incremented by fail(). */
static int exit_status;
77
/* Finish the current status line with @msg printed in green. */
static void success(const char *msg)
{
	fprintf(stdout, " \e[32m%s\e[0m\n", msg);
}
82
83static void fail(const char *msg)
84{
85	printf(" \e[31m%s\e[0m\n", msg);
86	exit_status++;
87}
88
/* Finish the current status line with @msg printed in yellow. */
static void skip(const char *msg)
{
	fprintf(stdout, " \e[33m%s\e[0m\n", msg);
}
93
94static void restore_settings_atexit(void)
95{
96	if (skip_settings_restore)
97		return;
98
99	printf("Restore THP and khugepaged settings...");
100	thp_restore_settings();
101	success("OK");
102
103	skip_settings_restore = true;
104}
105
106static void restore_settings(int sig)
107{
108	/* exit() will invoke the restore_settings_atexit handler. */
109	exit(sig ? EXIT_FAILURE : exit_status);
110}
111
112static void save_settings(void)
113{
114	printf("Save THP and khugepaged settings...");
115	if (file_ops && finfo.type == VMA_FILE)
116		thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
117	thp_save_settings();
118
119	success("OK");
120
121	atexit(restore_settings_atexit);
122	signal(SIGTERM, restore_settings);
123	signal(SIGINT, restore_settings);
124	signal(SIGHUP, restore_settings);
125	signal(SIGQUIT, restore_settings);
126}
127
128static void get_finfo(const char *dir)
129{
130	struct stat path_stat;
131	struct statfs fs;
132	char buf[1 << 10];
133	char path[PATH_MAX];
134	char *str, *end;
135
136	finfo.dir = dir;
137	stat(finfo.dir, &path_stat);
138	if (!S_ISDIR(path_stat.st_mode)) {
139		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
140		exit(EXIT_FAILURE);
141	}
142	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
143		     finfo.dir) >= sizeof(finfo.path)) {
144		printf("%s: Pathname is too long\n", __func__);
145		exit(EXIT_FAILURE);
146	}
147	if (statfs(finfo.dir, &fs)) {
148		perror("statfs()");
149		exit(EXIT_FAILURE);
150	}
151	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
152	if (finfo.type == VMA_SHMEM)
153		return;
154
155	/* Find owning device's queue/read_ahead_kb control */
156	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
157		     major(path_stat.st_dev), minor(path_stat.st_dev))
158	    >= sizeof(path)) {
159		printf("%s: Pathname is too long\n", __func__);
160		exit(EXIT_FAILURE);
161	}
162	if (read_file(path, buf, sizeof(buf)) < 0) {
163		perror("read_file(read_num)");
164		exit(EXIT_FAILURE);
165	}
166	if (strstr(buf, "DEVTYPE=disk")) {
167		/* Found it */
168		if (snprintf(finfo.dev_queue_read_ahead_path,
169			     sizeof(finfo.dev_queue_read_ahead_path),
170			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
171			     major(path_stat.st_dev), minor(path_stat.st_dev))
172		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
173			printf("%s: Pathname is too long\n", __func__);
174			exit(EXIT_FAILURE);
175		}
176		return;
177	}
178	if (!strstr(buf, "DEVTYPE=partition")) {
179		printf("%s: Unknown device type: %s\n", __func__, path);
180		exit(EXIT_FAILURE);
181	}
182	/*
183	 * Partition of block device - need to find actual device.
184	 * Using naming convention that devnameN is partition of
185	 * device devname.
186	 */
187	str = strstr(buf, "DEVNAME=");
188	if (!str) {
189		printf("%s: Could not read: %s", __func__, path);
190		exit(EXIT_FAILURE);
191	}
192	str += 8;
193	end = str;
194	while (*end) {
195		if (isdigit(*end)) {
196			*end = '\0';
197			if (snprintf(finfo.dev_queue_read_ahead_path,
198				     sizeof(finfo.dev_queue_read_ahead_path),
199				     "/sys/block/%s/queue/read_ahead_kb",
200				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
201				printf("%s: Pathname is too long\n", __func__);
202				exit(EXIT_FAILURE);
203			}
204			return;
205		}
206		++end;
207	}
208	printf("%s: Could not read: %s\n", __func__, path);
209	exit(EXIT_FAILURE);
210}
211
212static bool check_swap(void *addr, unsigned long size)
213{
214	bool swap = false;
215	int ret;
216	FILE *fp;
217	char buffer[MAX_LINE_LENGTH];
218	char addr_pattern[MAX_LINE_LENGTH];
219
220	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
221		       (unsigned long) addr);
222	if (ret >= MAX_LINE_LENGTH) {
223		printf("%s: Pattern is too long\n", __func__);
224		exit(EXIT_FAILURE);
225	}
226
227
228	fp = fopen(PID_SMAPS, "r");
229	if (!fp) {
230		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
231		exit(EXIT_FAILURE);
232	}
233	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
234		goto err_out;
235
236	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
237		       size >> 10);
238	if (ret >= MAX_LINE_LENGTH) {
239		printf("%s: Pattern is too long\n", __func__);
240		exit(EXIT_FAILURE);
241	}
242	/*
243	 * Fetch the Swap: in the same block and check whether it got
244	 * the expected number of hugeepages next.
245	 */
246	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
247		goto err_out;
248
249	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
250		goto err_out;
251
252	swap = true;
253err_out:
254	fclose(fp);
255	return swap;
256}
257
258static void *alloc_mapping(int nr)
259{
260	void *p;
261
262	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
263		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
264	if (p != BASE_ADDR) {
265		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
266		exit(EXIT_FAILURE);
267	}
268
269	return p;
270}
271
272static void fill_memory(int *p, unsigned long start, unsigned long end)
273{
274	int i;
275
276	for (i = start / page_size; i < end / page_size; i++)
277		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
278}
279
280/*
281 * MADV_COLLAPSE is a best-effort request and may fail if an internal
282 * resource is temporarily unavailable, in which case it will set errno to
283 * EAGAIN.  In such a case, immediately reattempt the operation one more
284 * time.
285 */
286static int madvise_collapse_retry(void *p, unsigned long size)
287{
288	bool retry = true;
289	int ret;
290
291retry:
292	ret = madvise(p, size, MADV_COLLAPSE);
293	if (ret && errno == EAGAIN && retry) {
294		retry = false;
295		goto retry;
296	}
297	return ret;
298}
299
300/*
301 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
302 * validate_memory()'able contents.
303 */
304static void *alloc_hpage(struct mem_ops *ops)
305{
306	void *p = ops->setup_area(1);
307
308	ops->fault(p, 0, hpage_pmd_size);
309
310	/*
311	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
312	 * The latter is ineligible for collapse by MADV_COLLAPSE
313	 * while the former might cause MADV_COLLAPSE to race with
314	 * khugepaged on low-load system (like a test machine), which
315	 * would cause MADV_COLLAPSE to fail with EAGAIN.
316	 */
317	printf("Allocate huge page...");
318	if (madvise_collapse_retry(p, hpage_pmd_size)) {
319		perror("madvise(MADV_COLLAPSE)");
320		exit(EXIT_FAILURE);
321	}
322	if (!ops->check_huge(p, 1)) {
323		perror("madvise(MADV_COLLAPSE)");
324		exit(EXIT_FAILURE);
325	}
326	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
327		perror("madvise(MADV_HUGEPAGE)");
328		exit(EXIT_FAILURE);
329	}
330	success("OK");
331	return p;
332}
333
334static void validate_memory(int *p, unsigned long start, unsigned long end)
335{
336	int i;
337
338	for (i = start / page_size; i < end / page_size; i++) {
339		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
340			printf("Page %d is corrupted: %#x\n",
341					i, p[i * page_size / sizeof(*p)]);
342			exit(EXIT_FAILURE);
343		}
344	}
345}
346
/* mem_ops.setup_area for anonymous memory: a plain mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
351
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	(void)munmap(p, size);
}
356
/* mem_ops.fault for writable memory: populate pages by writing markers. */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	int *words = p;

	fill_memory(words, start, end);
}
361
362static bool anon_check_huge(void *addr, int nr_hpages)
363{
364	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
365}
366
367static void *file_setup_area(int nr_hpages)
368{
369	int fd;
370	void *p;
371	unsigned long size;
372
373	unlink(finfo.path);  /* Cleanup from previous failed tests */
374	printf("Creating %s for collapse%s...", finfo.path,
375	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
376	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
377		  777);
378	if (fd < 0) {
379		perror("open()");
380		exit(EXIT_FAILURE);
381	}
382
383	size = nr_hpages * hpage_pmd_size;
384	p = alloc_mapping(nr_hpages);
385	fill_memory(p, 0, size);
386	write(fd, p, size);
387	close(fd);
388	munmap(p, size);
389	success("OK");
390
391	printf("Opening %s read only for collapse...", finfo.path);
392	finfo.fd = open(finfo.path, O_RDONLY, 777);
393	if (finfo.fd < 0) {
394		perror("open()");
395		exit(EXIT_FAILURE);
396	}
397	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
398		 MAP_PRIVATE, finfo.fd, 0);
399	if (p == MAP_FAILED || p != BASE_ADDR) {
400		perror("mmap()");
401		exit(EXIT_FAILURE);
402	}
403
404	/* Drop page cache */
405	write_file("/proc/sys/vm/drop_caches", "3", 2);
406	success("OK");
407	return p;
408}
409
410static void file_cleanup_area(void *p, unsigned long size)
411{
412	munmap(p, size);
413	close(finfo.fd);
414	unlink(finfo.path);
415}
416
417static void file_fault(void *p, unsigned long start, unsigned long end)
418{
419	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
420		perror("madvise(MADV_POPULATE_READ");
421		exit(EXIT_FAILURE);
422	}
423}
424
425static bool file_check_huge(void *addr, int nr_hpages)
426{
427	switch (finfo.type) {
428	case VMA_FILE:
429		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
430	case VMA_SHMEM:
431		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
432	default:
433		exit(EXIT_FAILURE);
434		return false;
435	}
436}
437
438static void *shmem_setup_area(int nr_hpages)
439{
440	void *p;
441	unsigned long size = nr_hpages * hpage_pmd_size;
442
443	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
444	if (finfo.fd < 0)  {
445		perror("memfd_create()");
446		exit(EXIT_FAILURE);
447	}
448	if (ftruncate(finfo.fd, size)) {
449		perror("ftruncate()");
450		exit(EXIT_FAILURE);
451	}
452	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
453		 0);
454	if (p != BASE_ADDR) {
455		perror("mmap()");
456		exit(EXIT_FAILURE);
457	}
458	return p;
459}
460
461static void shmem_cleanup_area(void *p, unsigned long size)
462{
463	munmap(p, size);
464	close(finfo.fd);
465}
466
467static bool shmem_check_huge(void *addr, int nr_hpages)
468{
469	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
470}
471
472static struct mem_ops __anon_ops = {
473	.setup_area = &anon_setup_area,
474	.cleanup_area = &anon_cleanup_area,
475	.fault = &anon_fault,
476	.check_huge = &anon_check_huge,
477	.name = "anon",
478};
479
480static struct mem_ops __file_ops = {
481	.setup_area = &file_setup_area,
482	.cleanup_area = &file_cleanup_area,
483	.fault = &file_fault,
484	.check_huge = &file_check_huge,
485	.name = "file",
486};
487
488static struct mem_ops __shmem_ops = {
489	.setup_area = &shmem_setup_area,
490	.cleanup_area = &shmem_cleanup_area,
491	.fault = &anon_fault,
492	.check_huge = &shmem_check_huge,
493	.name = "shmem",
494};
495
496static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
497			       struct mem_ops *ops, bool expect)
498{
499	int ret;
500	struct thp_settings settings = *thp_current_settings();
501
502	printf("%s...", msg);
503
504	/*
505	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
506	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
507	 */
508	settings.thp_enabled = THP_NEVER;
509	settings.shmem_enabled = SHMEM_NEVER;
510	thp_push_settings(&settings);
511
512	/* Clear VM_NOHUGEPAGE */
513	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
514	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
515	if (((bool)ret) == expect)
516		fail("Fail: Bad return value");
517	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
518		fail("Fail: check_huge()");
519	else
520		success("OK");
521
522	thp_pop_settings();
523}
524
525static void madvise_collapse(const char *msg, char *p, int nr_hpages,
526			     struct mem_ops *ops, bool expect)
527{
528	/* Sanity check */
529	if (!ops->check_huge(p, 0)) {
530		printf("Unexpected huge page\n");
531		exit(EXIT_FAILURE);
532	}
533	__madvise_collapse(msg, p, nr_hpages, ops, expect);
534}
535
536#define TICK 500000
537static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
538			  struct mem_ops *ops)
539{
540	int full_scans;
541	int timeout = 6; /* 3 seconds */
542
543	/* Sanity check */
544	if (!ops->check_huge(p, 0)) {
545		printf("Unexpected huge page\n");
546		exit(EXIT_FAILURE);
547	}
548
549	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
550
551	/* Wait until the second full_scan completed */
552	full_scans = thp_read_num("khugepaged/full_scans") + 2;
553
554	printf("%s...", msg);
555	while (timeout--) {
556		if (ops->check_huge(p, nr_hpages))
557			break;
558		if (thp_read_num("khugepaged/full_scans") >= full_scans)
559			break;
560		printf(".");
561		usleep(TICK);
562	}
563
564	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
565
566	return timeout == -1;
567}
568
569static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
570				struct mem_ops *ops, bool expect)
571{
572	if (wait_for_scan(msg, p, nr_hpages, ops)) {
573		if (expect)
574			fail("Timeout");
575		else
576			success("OK");
577		return;
578	}
579
580	/*
581	 * For file and shmem memory, khugepaged only retracts pte entries after
582	 * putting the new hugepage in the page cache. The hugepage must be
583	 * subsequently refaulted to install the pmd mapping for the mm.
584	 */
585	if (ops != &__anon_ops)
586		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
587
588	if (ops->check_huge(p, expect ? nr_hpages : 0))
589		success("OK");
590	else
591		fail("Fail");
592}
593
594static struct collapse_context __khugepaged_context = {
595	.collapse = &khugepaged_collapse,
596	.enforce_pte_scan_limits = true,
597	.name = "khugepaged",
598};
599
600static struct collapse_context __madvise_context = {
601	.collapse = &madvise_collapse,
602	.enforce_pte_scan_limits = false,
603	.name = "madvise",
604};
605
606static bool is_tmpfs(struct mem_ops *ops)
607{
608	return ops == &__file_ops && finfo.type == VMA_SHMEM;
609}
610
611static bool is_anon(struct mem_ops *ops)
612{
613	return ops == &__anon_ops;
614}
615
616static void alloc_at_fault(void)
617{
618	struct thp_settings settings = *thp_current_settings();
619	char *p;
620
621	settings.thp_enabled = THP_ALWAYS;
622	thp_push_settings(&settings);
623
624	p = alloc_mapping(1);
625	*p = 1;
626	printf("Allocate huge page on fault...");
627	if (check_huge_anon(p, 1, hpage_pmd_size))
628		success("OK");
629	else
630		fail("Fail");
631
632	thp_pop_settings();
633
634	madvise(p, page_size, MADV_DONTNEED);
635	printf("Split huge PMD on MADV_DONTNEED...");
636	if (check_huge_anon(p, 0, hpage_pmd_size))
637		success("OK");
638	else
639		fail("Fail");
640	munmap(p, hpage_pmd_size);
641}
642
643static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
644{
645	void *p;
646	int nr_hpages = 4;
647	unsigned long size = nr_hpages * hpage_pmd_size;
648
649	p = ops->setup_area(nr_hpages);
650	ops->fault(p, 0, size);
651	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
652		    ops, true);
653	validate_memory(p, 0, size);
654	ops->cleanup_area(p, size);
655}
656
657static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
658{
659	void *p;
660
661	p = ops->setup_area(1);
662	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
663	ops->cleanup_area(p, hpage_pmd_size);
664}
665
666static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
667{
668	void *p;
669
670	p = ops->setup_area(1);
671	ops->fault(p, 0, page_size);
672	c->collapse("Collapse PTE table with single PTE entry present", p,
673		    1, ops, true);
674	ops->cleanup_area(p, hpage_pmd_size);
675}
676
677static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
678{
679	int max_ptes_none = hpage_pmd_nr / 2;
680	struct thp_settings settings = *thp_current_settings();
681	void *p;
682	int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;
683
684	settings.khugepaged.max_ptes_none = max_ptes_none;
685	thp_push_settings(&settings);
686
687	p = ops->setup_area(1);
688
689	if (is_tmpfs(ops)) {
690		/* shmem pages always in the page cache */
691		printf("tmpfs...");
692		skip("Skip");
693		goto skip;
694	}
695
696	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
697	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
698		    ops, !c->enforce_pte_scan_limits);
699	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
700
701	if (c->enforce_pte_scan_limits) {
702		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
703		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
704			    true);
705		validate_memory(p, 0,
706				(hpage_pmd_nr - max_ptes_none) * page_size);
707	}
708skip:
709	ops->cleanup_area(p, hpage_pmd_size);
710	thp_pop_settings();
711}
712
713static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
714{
715	void *p;
716
717	p = ops->setup_area(1);
718	ops->fault(p, 0, hpage_pmd_size);
719
720	printf("Swapout one page...");
721	if (madvise(p, page_size, MADV_PAGEOUT)) {
722		perror("madvise(MADV_PAGEOUT)");
723		exit(EXIT_FAILURE);
724	}
725	if (check_swap(p, page_size)) {
726		success("OK");
727	} else {
728		fail("Fail");
729		goto out;
730	}
731
732	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
733		    true);
734	validate_memory(p, 0, hpage_pmd_size);
735out:
736	ops->cleanup_area(p, hpage_pmd_size);
737}
738
739static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
740{
741	int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
742	void *p;
743
744	p = ops->setup_area(1);
745	ops->fault(p, 0, hpage_pmd_size);
746
747	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
748	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
749		perror("madvise(MADV_PAGEOUT)");
750		exit(EXIT_FAILURE);
751	}
752	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
753		success("OK");
754	} else {
755		fail("Fail");
756		goto out;
757	}
758
759	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
760		    !c->enforce_pte_scan_limits);
761	validate_memory(p, 0, hpage_pmd_size);
762
763	if (c->enforce_pte_scan_limits) {
764		ops->fault(p, 0, hpage_pmd_size);
765		printf("Swapout %d of %d pages...", max_ptes_swap,
766		       hpage_pmd_nr);
767		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
768			perror("madvise(MADV_PAGEOUT)");
769			exit(EXIT_FAILURE);
770		}
771		if (check_swap(p, max_ptes_swap * page_size)) {
772			success("OK");
773		} else {
774			fail("Fail");
775			goto out;
776		}
777
778		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
779			    1, ops, true);
780		validate_memory(p, 0, hpage_pmd_size);
781	}
782out:
783	ops->cleanup_area(p, hpage_pmd_size);
784}
785
786static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
787{
788	void *p;
789
790	p = alloc_hpage(ops);
791
792	if (is_tmpfs(ops)) {
793		/* MADV_DONTNEED won't evict tmpfs pages */
794		printf("tmpfs...");
795		skip("Skip");
796		goto skip;
797	}
798
799	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
800	printf("Split huge page leaving single PTE mapping compound page...");
801	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
802	if (ops->check_huge(p, 0))
803		success("OK");
804	else
805		fail("Fail");
806
807	c->collapse("Collapse PTE table with single PTE mapping compound page",
808		    p, 1, ops, true);
809	validate_memory(p, 0, page_size);
810skip:
811	ops->cleanup_area(p, hpage_pmd_size);
812}
813
814static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
815{
816	void *p;
817
818	p = alloc_hpage(ops);
819	printf("Split huge page leaving single PTE page table full of compound pages...");
820	madvise(p, page_size, MADV_NOHUGEPAGE);
821	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
822	if (ops->check_huge(p, 0))
823		success("OK");
824	else
825		fail("Fail");
826
827	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
828		    true);
829	validate_memory(p, 0, hpage_pmd_size);
830	ops->cleanup_area(p, hpage_pmd_size);
831}
832
/*
 * Build an area whose base pages each come from a different huge page,
 * then expect it to be collapsed.
 *
 * The construction runs hpage_pmd_nr rounds.  Each round faults a huge page
 * at BASE_ADDR and uses two mremap() calls to grow, by one base page, a
 * region that ends up directly below BASE_ADDR.  The statement order inside
 * the loop is significant.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
				i + 1, hpage_pmd_nr);

		/* Get a fresh huge page at BASE_ADDR; failing to is fatal. */
		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		/*
		 * Move the i pages collected so far plus the new huge page to
		 * a staging address two huge pages above BASE_ADDR, shrinking
		 * the region to i + 1 base pages in the process.
		 * NOTE(review): presumably this keeps only the first base page
		 * of the new huge page - confirm.
		 */
		p = mremap(BASE_ADDR - i * page_size,
				i * page_size + hpage_pmd_size,
				(i + 1) * page_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		/*
		 * Move the i + 1 pages back so that they start i + 1 pages
		 * below BASE_ADDR, growing the region by hpage_pmd_size so it
		 * again extends over the range at BASE_ADDR.
		 */
		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
				(i + 1) * page_size,
				(i + 1) * page_size + hpage_pmd_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	/* Drop the range at BASE_ADDR; p is the region built below it. */
	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	/* The constructed region must not be reported as one huge page. */
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
885
886static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
887{
888	int wstatus;
889	void *p;
890
891	p = ops->setup_area(1);
892
893	printf("Allocate small page...");
894	ops->fault(p, 0, page_size);
895	if (ops->check_huge(p, 0))
896		success("OK");
897	else
898		fail("Fail");
899
900	printf("Share small page over fork()...");
901	if (!fork()) {
902		/* Do not touch settings on child exit */
903		skip_settings_restore = true;
904		exit_status = 0;
905
906		if (ops->check_huge(p, 0))
907			success("OK");
908		else
909			fail("Fail");
910
911		ops->fault(p, page_size, 2 * page_size);
912		c->collapse("Collapse PTE table with single page shared with parent process",
913			    p, 1, ops, true);
914
915		validate_memory(p, 0, page_size);
916		ops->cleanup_area(p, hpage_pmd_size);
917		exit(exit_status);
918	}
919
920	wait(&wstatus);
921	exit_status += WEXITSTATUS(wstatus);
922
923	printf("Check if parent still has small page...");
924	if (ops->check_huge(p, 0))
925		success("OK");
926	else
927		fail("Fail");
928	validate_memory(p, 0, page_size);
929	ops->cleanup_area(p, hpage_pmd_size);
930}
931
932static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
933{
934	int wstatus;
935	void *p;
936
937	p = alloc_hpage(ops);
938	printf("Share huge page over fork()...");
939	if (!fork()) {
940		/* Do not touch settings on child exit */
941		skip_settings_restore = true;
942		exit_status = 0;
943
944		if (ops->check_huge(p, 1))
945			success("OK");
946		else
947			fail("Fail");
948
949		printf("Split huge page PMD in child process...");
950		madvise(p, page_size, MADV_NOHUGEPAGE);
951		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
952		if (ops->check_huge(p, 0))
953			success("OK");
954		else
955			fail("Fail");
956		ops->fault(p, 0, page_size);
957
958		thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
959		c->collapse("Collapse PTE table full of compound pages in child",
960			    p, 1, ops, true);
961		thp_write_num("khugepaged/max_ptes_shared",
962			  thp_current_settings()->khugepaged.max_ptes_shared);
963
964		validate_memory(p, 0, hpage_pmd_size);
965		ops->cleanup_area(p, hpage_pmd_size);
966		exit(exit_status);
967	}
968
969	wait(&wstatus);
970	exit_status += WEXITSTATUS(wstatus);
971
972	printf("Check if parent still has huge page...");
973	if (ops->check_huge(p, 1))
974		success("OK");
975	else
976		fail("Fail");
977	validate_memory(p, 0, hpage_pmd_size);
978	ops->cleanup_area(p, hpage_pmd_size);
979}
980
981static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
982{
983	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
984	int wstatus;
985	void *p;
986
987	p = alloc_hpage(ops);
988	printf("Share huge page over fork()...");
989	if (!fork()) {
990		/* Do not touch settings on child exit */
991		skip_settings_restore = true;
992		exit_status = 0;
993
994		if (ops->check_huge(p, 1))
995			success("OK");
996		else
997			fail("Fail");
998
999		printf("Trigger CoW on page %d of %d...",
1000				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1001		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1002		if (ops->check_huge(p, 0))
1003			success("OK");
1004		else
1005			fail("Fail");
1006
1007		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1008			    1, ops, !c->enforce_pte_scan_limits);
1009
1010		if (c->enforce_pte_scan_limits) {
1011			printf("Trigger CoW on page %d of %d...",
1012			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1013			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1014				    page_size);
1015			if (ops->check_huge(p, 0))
1016				success("OK");
1017			else
1018				fail("Fail");
1019
1020			c->collapse("Collapse with max_ptes_shared PTEs shared",
1021				    p, 1, ops, true);
1022		}
1023
1024		validate_memory(p, 0, hpage_pmd_size);
1025		ops->cleanup_area(p, hpage_pmd_size);
1026		exit(exit_status);
1027	}
1028
1029	wait(&wstatus);
1030	exit_status += WEXITSTATUS(wstatus);
1031
1032	printf("Check if parent still has huge page...");
1033	if (ops->check_huge(p, 1))
1034		success("OK");
1035	else
1036		fail("Fail");
1037	validate_memory(p, 0, hpage_pmd_size);
1038	ops->cleanup_area(p, hpage_pmd_size);
1039}
1040
1041static void madvise_collapse_existing_thps(struct collapse_context *c,
1042					   struct mem_ops *ops)
1043{
1044	void *p;
1045
1046	p = ops->setup_area(1);
1047	ops->fault(p, 0, hpage_pmd_size);
1048	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1049	validate_memory(p, 0, hpage_pmd_size);
1050
1051	/* c->collapse() will find a hugepage and complain - call directly. */
1052	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1053	validate_memory(p, 0, hpage_pmd_size);
1054	ops->cleanup_area(p, hpage_pmd_size);
1055}
1056
1057/*
1058 * Test race with khugepaged where page tables have been retracted and
1059 * pmd cleared.
1060 */
1061static void madvise_retracted_page_tables(struct collapse_context *c,
1062					  struct mem_ops *ops)
1063{
1064	void *p;
1065	int nr_hpages = 1;
1066	unsigned long size = nr_hpages * hpage_pmd_size;
1067
1068	p = ops->setup_area(nr_hpages);
1069	ops->fault(p, 0, size);
1070
1071	/* Let khugepaged collapse and leave pmd cleared */
1072	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1073			  ops)) {
1074		fail("Timeout");
1075		return;
1076	}
1077	success("OK");
1078	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1079		    true);
1080	validate_memory(p, 0, size);
1081	ops->cleanup_area(p, size);
1082}
1083
/* Print the command-line help to stderr and exit with status 1. */
static void usage(void)
{
	fputs("\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n"
	      "\t<test type>\t: <context>:<mem_type>\n"
	      "\t<context>\t: [all|khugepaged|madvise]\n"
	      "\t<mem_type>\t: [all|anon|file|shmem]\n"
	      "\n\t\"file,all\" mem_type requires [dir] argument\n"
	      "\n\t\"file,all\" mem_type requires kernel built with\n"
	      "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"
	      "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"
	      "\tmounted with huge=madvise option for khugepaged tests to work\n"
	      "\n\tSupported Options:\n"
	      "\t\t-h: This help message.\n"
	      "\t\t-s: mTHP size, expressed as page order.\n"
	      "\t\t    Defaults to 0. Use this size for anon allocations.\n",
	      stderr);
	exit(1);
}
1101
1102static void parse_test_type(int argc, char **argv)
1103{
1104	int opt;
1105	char *buf;
1106	const char *token;
1107
1108	while ((opt = getopt(argc, argv, "s:h")) != -1) {
1109		switch (opt) {
1110		case 's':
1111			anon_order = atoi(optarg);
1112			break;
1113		case 'h':
1114		default:
1115			usage();
1116		}
1117	}
1118
1119	argv += optind;
1120	argc -= optind;
1121
1122	if (argc == 0) {
1123		/* Backwards compatibility */
1124		khugepaged_context =  &__khugepaged_context;
1125		madvise_context =  &__madvise_context;
1126		anon_ops = &__anon_ops;
1127		return;
1128	}
1129
1130	buf = strdup(argv[0]);
1131	token = strsep(&buf, ":");
1132
1133	if (!strcmp(token, "all")) {
1134		khugepaged_context =  &__khugepaged_context;
1135		madvise_context =  &__madvise_context;
1136	} else if (!strcmp(token, "khugepaged")) {
1137		khugepaged_context =  &__khugepaged_context;
1138	} else if (!strcmp(token, "madvise")) {
1139		madvise_context =  &__madvise_context;
1140	} else {
1141		usage();
1142	}
1143
1144	if (!buf)
1145		usage();
1146
1147	if (!strcmp(buf, "all")) {
1148		file_ops =  &__file_ops;
1149		anon_ops = &__anon_ops;
1150		shmem_ops = &__shmem_ops;
1151	} else if (!strcmp(buf, "anon")) {
1152		anon_ops = &__anon_ops;
1153	} else if (!strcmp(buf, "file")) {
1154		file_ops =  &__file_ops;
1155	} else if (!strcmp(buf, "shmem")) {
1156		shmem_ops = &__shmem_ops;
1157	} else {
1158		usage();
1159	}
1160
1161	if (!file_ops)
1162		return;
1163
1164	if (argc != 2)
1165		usage();
1166
1167	get_finfo(argv[1]);
1168}
1169
1170int main(int argc, char **argv)
1171{
1172	int hpage_pmd_order;
1173	struct thp_settings default_settings = {
1174		.thp_enabled = THP_MADVISE,
1175		.thp_defrag = THP_DEFRAG_ALWAYS,
1176		.shmem_enabled = SHMEM_ADVISE,
1177		.use_zero_page = 0,
1178		.khugepaged = {
1179			.defrag = 1,
1180			.alloc_sleep_millisecs = 10,
1181			.scan_sleep_millisecs = 10,
1182		},
1183		/*
1184		 * When testing file-backed memory, the collapse path
1185		 * looks at how many pages are found in the page cache, not
1186		 * what pages are mapped. Disable read ahead optimization so
1187		 * pages don't find their way into the page cache unless
1188		 * we mem_ops->fault() them in.
1189		 */
1190		.read_ahead_kb = 0,
1191	};
1192
1193	parse_test_type(argc, argv);
1194
1195	setbuf(stdout, NULL);
1196
1197	page_size = getpagesize();
1198	hpage_pmd_size = read_pmd_pagesize();
1199	if (!hpage_pmd_size) {
1200		printf("Reading PMD pagesize failed");
1201		exit(EXIT_FAILURE);
1202	}
1203	hpage_pmd_nr = hpage_pmd_size / page_size;
1204	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
1205
1206	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1207	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1208	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1209	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1210	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
1211	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
1212
1213	save_settings();
1214	thp_push_settings(&default_settings);
1215
1216	alloc_at_fault();
1217
1218#define TEST(t, c, o) do { \
1219	if (c && o) { \
1220		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1221		t(c, o); \
1222	} \
1223	} while (0)
1224
1225	TEST(collapse_full, khugepaged_context, anon_ops);
1226	TEST(collapse_full, khugepaged_context, file_ops);
1227	TEST(collapse_full, khugepaged_context, shmem_ops);
1228	TEST(collapse_full, madvise_context, anon_ops);
1229	TEST(collapse_full, madvise_context, file_ops);
1230	TEST(collapse_full, madvise_context, shmem_ops);
1231
1232	TEST(collapse_empty, khugepaged_context, anon_ops);
1233	TEST(collapse_empty, madvise_context, anon_ops);
1234
1235	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1236	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1237	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1238	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1239	TEST(collapse_single_pte_entry, madvise_context, file_ops);
1240	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1241
1242	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1243	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1244	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1245	TEST(collapse_max_ptes_none, madvise_context, file_ops);
1246
1247	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1248	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1249	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1250	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1251
1252	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1253	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1254	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1255	TEST(collapse_full_of_compound, madvise_context, anon_ops);
1256	TEST(collapse_full_of_compound, madvise_context, file_ops);
1257	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1258
1259	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1260	TEST(collapse_compound_extreme, madvise_context, anon_ops);
1261
1262	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1263	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1264
1265	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1266	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1267
1268	TEST(collapse_fork, khugepaged_context, anon_ops);
1269	TEST(collapse_fork, madvise_context, anon_ops);
1270
1271	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1272	TEST(collapse_fork_compound, madvise_context, anon_ops);
1273
1274	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1275	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1276
1277	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1278	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1279	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1280
1281	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1282	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1283
1284	restore_settings(0);
1285}
1286