// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

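/* Return the THP allocation order for a given size: log2(size / pagesize). */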
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

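/*
 * Detect all THP sizes supported by the kernel, including the PMD size, and
 * store them in sizes[]. Returns the number of detected sizes.
 */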
static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

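/* Check whether the kernel has the huge (PMD-sized) zeropage enabled. */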
static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

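/* Test whether each page in the range [addr, addr + size) is swapped out. */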
static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

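/*
 * Pipes for a simple parent/child handshake: each side writes a single byte
 * once it is ready and the peer busy-reads until that byte arrives.
 */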
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

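/*
 * Fork a child that records the current content (directly, or through a R/O
 * vmsplice() pin), modify the pages in the parent, and let the child verify
 * that it still observes the old content.
 */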
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

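/*
 * Take a R/O vmsplice() pin in the parent (before or after fork()) and unmap
 * the pages, let the child modify them, and verify that the parent still
 * reads the old content through the pipe.
 */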
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
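/*
 * Register the memory as an io_uring fixed buffer (R/W longterm pin), trigger
 * a COW-relevant event (fork(), or R/O-protecting the pages), modify the
 * memory, and verify that the modification is observable through the pin by
 * writing the fixed buffer to a file.
 */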
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *tmp, buf;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

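/*
 * Prepare the mapping according to the test variant, take a longterm R/O pin
 * via the gup_test debugfs interface, modify the memory through the page
 * tables, and verify that the modification is observable through the pin.
 */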
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not supported by the kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

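/*
 * Allocate twice the THP size so we can THP-align the area, populate a THP,
 * and transform it according to the thp_run variant (PTE-map it, swap it
 * out, partially remap or share it, ...) before running the test.
 */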
static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

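/*
 * Per anon test case: base page + swapped-out base page, six variants per
 * THP size, the two PMD-only variants (PMD-mapped THP and its swapped-out
 * counterpart), and one run per hugetlb size.
 */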
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

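/*
 * PTE-map a THP and collapse it via MADV_COLLAPSE, either before or after
 * COW-sharing (parts of) it with a child, then verify that modifications in
 * the parent do not leak into the child.
 */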
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

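/*
 * For the non-anon tests, "mem" is a writable private mapping and "smem" is
 * a second, R/O mapping of the same underlying page(s).
 */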
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}