/*
 * Copyright (c) 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * Portions of this software were developed by Mark Johnston under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Test behavior when a mapping of a shared shadow vm object is
 * invalidated by COW from another mapping.  In particular, when
 * minherit(INHERIT_SHARE) is applied to a COW mapping, a subsequently
 * forked child process will share the parent's shadow object.  Thus,
 * pages already mapped into one sharing process may be written from
 * another, triggering a copy into the shadow object.  The VM system
 * expects that a fully shadowed page is unmapped, but at one point the
 * use of a shared shadow object could break this invariant.
 *
 * This is a regression test for an issue isolated by rlibby@FreeBSD.org
 * from a problem detected by jeff@FreeBSD.org using stress2's
 * collapse.sh.  The issue became CVE-2021-29626.
 *
 * This file is written as an ATF test suite but may be compiled as a
 * standalone program with -DSTANDALONE (and optionally -DDEBUG).
 */
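
/*
 * For example, the standalone variant might be built and run like this
 * (an illustrative invocation; the source file name is assumed, and the
 * eager-COW pass needs root in order to raise the mlock() limit):
 *
 *	cc -DSTANDALONE -o shared_shadow_inval shared_shadow_inval_test.c
 *	./shared_shadow_inval
 */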

#include <sys/param.h>
#include <sys/mman.h>
#include <sys/procctl.h>
#include <sys/resource.h>
#include <sys/sysctl.h>
#include <sys/wait.h>

#include <machine/atomic.h>

#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#ifdef STANDALONE
#define	ATF_REQUIRE(x)	do {		\
	if (!(x))			\
		errx(1, "%s", #x);	\
} while (0)
#else
#include <atf-c.h>
#endif

#ifdef DEBUG
#define	dprintf(...)	printf(__VA_ARGS__)
#else
#define	dprintf(...)
#endif

#define	DEPTH	5

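/*
 * Flags selecting variations of a single test run: FLAG_COLLAPSE makes
 * P1 (and P4) exit as soon as P3 has written, so that the shadow chain
 * collapses while P2 is still using the region; FLAG_BLOCK_XFER forks
 * P4 to defeat the COW optimization that transfers a page between
 * objects instead of copying it; FLAG_FULLMOD makes P3 modify the
 * entire region rather than only the first half.
 */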
#define	FLAG_COLLAPSE		0x1
#define	FLAG_BLOCK_XFER		0x2
#define	FLAG_FULLMOD		0x4
#define	FLAG_MASK		(FLAG_COLLAPSE | FLAG_BLOCK_XFER | FLAG_FULLMOD)

struct shared_state {
	void *p;			/* Test region shared by P1-P4. */
	size_t len;			/* Size of the test region. */
	size_t modlen;			/* Number of bytes P3 modifies. */
	size_t pagesize;		/* Base page size. */
	bool collapse;			/* FLAG_COLLAPSE was given. */
	bool block_xfer;		/* FLAG_BLOCK_XFER was given. */
	bool lazy_cow;			/* Use lazy rather than eager COW. */
	bool okay;			/* Set by P2 on success. */
	volatile bool exiting[DEPTH];	/* Which processes have exited. */
	volatile bool exit;		/* Tell everyone to stop and exit. */
	volatile bool p3_did_write;	/* P3 finished writing the region. */
};

/*
 * Program flow.  There are three or four processes that are descendants
 * of the process running the test (P0), where arrows go from parents to
 * children, and thicker arrows indicate sharing a certain memory region
 * without COW semantics:
 *     P0 -> P1 -> P2 => P3
 *             \=> P4
 * The main idea is that P1 maps a memory region, and that region is
 * shared with P2/P3, but with COW semantics.  When P3 modifies the
 * memory, P2 ought to see that modification.  P4 optionally exists to
 * defeat a COW optimization.
 */

#define	child_err(...)	do {						\
	ss->exit = true;						\
	err(1, __VA_ARGS__);						\
} while (0)

#define	child_errx(...)	do {						\
	ss->exit = true;						\
	errx(1, __VA_ARGS__);						\
} while (0)

#define	SLEEP_TIME_US	1000

static void child(struct shared_state *ss, int depth);

static pid_t
child_fork(struct shared_state *ss, int depth)
{
	pid_t pid = fork();
	if (pid == -1)
		child_err("fork");
	else if (pid == 0)
		child(ss, depth);
	return pid;
}

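/* Touch each page of the test region so that it is mapped. */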
static void
child_fault(struct shared_state *ss)
{
	size_t i;

	for (i = 0; i < ss->len; i += ss->pagesize)
		(void)((volatile char *)ss->p)[i];
}

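/*
 * Stamp the first word of each page in the first "len" bytes of the
 * region with "val", then publish the stores with a release fence.
 */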
static void
child_write(struct shared_state *ss, int val, size_t len)
{
	size_t i;

	for (i = 0; i < len; i += ss->pagesize)
		((int *)ss->p)[i / sizeof(int)] = val;
	atomic_thread_fence_rel();
}

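/*
 * Poll until P3 reports that it has written to the region; the acquire
 * fence pairs with the release fence in child_write().
 */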
static void
child_wait_p3_write(struct shared_state *ss)
{
	while (!ss->p3_did_write) {
		if (ss->exit)
			exit(1);
		usleep(SLEEP_TIME_US);
	}
	atomic_thread_fence_acq();
}

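/*
 * Verify that pages below "modlen" contain "newval" and that the
 * remaining pages still contain "oldval".
 */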
static void
child_verify(struct shared_state *ss, int depth, int newval, int oldval)
{
	size_t i;
	int expectval, foundval;

	for (i = 0; i < ss->len; i += ss->pagesize) {
		expectval = i < ss->modlen ? newval : oldval;
		foundval = ((int *)ss->p)[i / sizeof(int)];
		if (foundval == expectval)
			continue;
		child_errx("P%d saw %d but expected %d, %d was the old value",
		    depth, foundval, expectval, oldval);
	}
}

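/* Body of one descendant process; "depth" selects its role (see above). */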
static void
child(struct shared_state *ss, int depth)
{
	pid_t mypid, oldval, pid;

	if (depth < 1 || depth >= DEPTH)
		child_errx("Bad depth %d", depth);
	mypid = getpid();
	dprintf("P%d (pid %d) started\n", depth, mypid);
	switch (depth) {
	case 1:
		/* Shared memory undergoing test. */
		ss->p = mmap(NULL, ss->len, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_ANON, -1, 0);
		if (ss->p == MAP_FAILED)
			child_err("mmap");

		/* P1 stamps the shared memory. */
		child_write(ss, mypid, ss->len);
		if (!ss->lazy_cow) {
			if (mlock(ss->p, ss->len) == -1)
				child_err("mlock");
			if (mprotect(ss->p, ss->len, PROT_READ) == -1)
				child_err("mprotect");
		}
		if (ss->block_xfer) {
			/*
			 * P4 is forked so that its existence blocks a page COW
			 * path where the page is simply transferred between
			 * objects, rather than being copied.
			 */
			child_fork(ss, 4);
		}
		/*
		 * P1 specifies that modifications from its child processes not
		 * be shared with P1.  Child process reads can be serviced from
		 * pages in P1's object, but writes must be COW'd.
		 */
		if (minherit(ss->p, ss->len, INHERIT_COPY) != 0)
			child_err("minherit");
		/* Fork P2. */
		child_fork(ss, depth + 1);
		/* P1 and P4 wait for P3's writes before exiting. */
		child_wait_p3_write(ss);
		child_verify(ss, depth, mypid, mypid);
		if (!ss->collapse) {
			/* Hang around to prevent collapse. */
			while (!ss->exit)
				usleep(SLEEP_TIME_US);
		}
		/* Exit so the P2 -> P1/P4 shadow chain can collapse. */
		break;
	case 2:
		/*
		 * P2 now specifies that modifications from its child processes
		 * be shared.  P2 and P3 will share a shadow object.
		 */
		if (minherit(ss->p, ss->len, INHERIT_SHARE) != 0)
			child_err("minherit");

		/*
		 * P2 faults a page in P1's object before P1 exits and the
		 * shadow chain is collapsed.  This may be redundant if the
		 * (read-only) mappings were copied by fork(), but it doesn't
		 * hurt.
		 */
		child_fault(ss);
		oldval = atomic_load_acq_int(ss->p);

		/* Fork P3. */
		pid = child_fork(ss, depth + 1);
		if (ss->collapse) {
			/* Wait for P1 and P4 to exit, triggering collapse. */
			while (!ss->exiting[1] ||
			    (ss->block_xfer && !ss->exiting[4]))
				usleep(SLEEP_TIME_US);
			/*
			 * This is racy, just guess at how long it may take
			 * them to finish exiting.
			 */
			usleep(100 * 1000);
		}
		/* P2 waits for P3's modification. */
		child_wait_p3_write(ss);
		child_verify(ss, depth, pid, oldval);
		ss->okay = true;
		ss->exit = true;
		break;
	case 3:
		/*
		 * Use mlock()+mprotect() to trigger the COW.  This
		 * exercises a different COW handler than the one used
		 * for lazy faults.
		 */
		if (!ss->lazy_cow) {
			if (mlock(ss->p, ss->len) == -1)
				child_err("mlock");
			if (mprotect(ss->p, ss->len, PROT_READ | PROT_WRITE) ==
			    -1)
				child_err("mprotect");
		}

		/*
		 * P3 writes the memory.  A page is faulted into the shared
		 * P2/P3 shadow object.  P2's mapping of the page in P1's
		 * object must now be shot down, or else P2 will wrongly
		 * continue to have that page mapped.
		 */
		child_write(ss, mypid, ss->modlen);
		ss->p3_did_write = true;
		dprintf("P3 (pid %d) wrote its pid\n", mypid);
		break;
	case 4:
		/* Just hang around until P3 is done writing. */
		oldval = atomic_load_acq_int(ss->p);
		child_wait_p3_write(ss);
		child_verify(ss, depth, oldval, oldval);
		break;
	default:
		child_errx("Bad depth %d", depth);
	}

	dprintf("P%d (pid %d) exiting\n", depth, mypid);
	ss->exiting[depth] = true;
	exit(0);
}

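/*
 * Run a single test configuration: become a reaper so that all
 * descendants can be awaited, map the coordination page, fork P1, and
 * require that P2 eventually reports success via ss->okay.
 */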
static void
do_one_shared_shadow_inval(bool lazy_cow, size_t pagesize, size_t len,
    unsigned int flags)
{
	struct shared_state *ss;
	pid_t pid;
	int status;

	pid = getpid();

	dprintf("P0 (pid %d) %s(collapse=%d, block_xfer=%d, full_mod=%d)\n",
	    pid, __func__, (int)((flags & FLAG_COLLAPSE) != 0),
	    (int)((flags & FLAG_BLOCK_XFER) != 0),
	    (int)((flags & FLAG_FULLMOD) != 0));

	ATF_REQUIRE(procctl(P_PID, pid, PROC_REAP_ACQUIRE, NULL) == 0);

	/* Shared memory for coordination. */
	ss = mmap(NULL, sizeof(*ss), PROT_READ | PROT_WRITE,
	    MAP_SHARED | MAP_ANON, -1, 0);
	ATF_REQUIRE(ss != MAP_FAILED);

	ss->len = len;
	ss->modlen = (flags & FLAG_FULLMOD) ? ss->len : ss->len / 2;
	ss->pagesize = pagesize;
	ss->collapse = (flags & FLAG_COLLAPSE) != 0;
	ss->block_xfer = (flags & FLAG_BLOCK_XFER) != 0;
	ss->lazy_cow = lazy_cow;

	pid = fork();
	ATF_REQUIRE(pid != -1);
	if (pid == 0)
		child(ss, 1);

	/* Wait for all descendants to exit. */
	do {
		pid = wait(&status);
		if (pid != -1)
			ATF_REQUIRE(WIFEXITED(status));
	} while (pid != -1 || errno != ECHILD);

	atomic_thread_fence_acq();
	ATF_REQUIRE(ss->okay);

	ATF_REQUIRE(munmap(ss, sizeof(*ss)) == 0);
	ATF_REQUIRE(procctl(P_PID, getpid(), PROC_REAP_RELEASE, NULL) == 0);
}

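/*
 * Iterate over all flag combinations at several region sizes chosen to
 * straddle the large page size reported by the "hw.pagesizes" sysctl
 * (falling back to 2MB when no large page size is reported).
 */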
static void
do_shared_shadow_inval(bool lazy_cow)
{
	size_t largepagesize, pagesize, pagesizes[MAXPAGESIZES], sysctllen;

	sysctllen = sizeof(pagesizes);
	ATF_REQUIRE(sysctlbyname("hw.pagesizes", pagesizes, &sysctllen, NULL,
	    0) == 0);
	ATF_REQUIRE(sysctllen >= sizeof(size_t));

	pagesize = pagesizes[0];
	largepagesize = MAXPAGESIZES >= 2 &&
	    sysctllen >= 2 * sizeof(size_t) && pagesizes[1] != 0 ?
	    pagesizes[1] : 2 * 1024 * 1024;

	for (unsigned int i = 0; i <= FLAG_MASK; i++) {
		do_one_shared_shadow_inval(lazy_cow, pagesize,
		    pagesize, i);
		do_one_shared_shadow_inval(lazy_cow, pagesize,
		    2 * pagesize, i);
		do_one_shared_shadow_inval(lazy_cow, pagesize,
		    largepagesize - pagesize, i);
		do_one_shared_shadow_inval(lazy_cow, pagesize,
		    largepagesize, i);
		do_one_shared_shadow_inval(lazy_cow, pagesize,
		    largepagesize + pagesize, i);
	}
}

static void
do_shared_shadow_inval_eager(void)
{
	struct rlimit rl;

	rl.rlim_cur = rl.rlim_max = RLIM_INFINITY;
	ATF_REQUIRE(setrlimit(RLIMIT_MEMLOCK, &rl) == 0);

	do_shared_shadow_inval(false);
}

static void
do_shared_shadow_inval_lazy(void)
{
	do_shared_shadow_inval(true);
}

#ifdef STANDALONE
int
main(void)
{
	do_shared_shadow_inval_lazy();
	do_shared_shadow_inval_eager();
	printf("pass\n");
}
#else
ATF_TC_WITHOUT_HEAD(shared_shadow_inval__lazy_cow);
ATF_TC_BODY(shared_shadow_inval__lazy_cow, tc)
{
	do_shared_shadow_inval_lazy();
}

ATF_TC(shared_shadow_inval__eager_cow);
ATF_TC_HEAD(shared_shadow_inval__eager_cow, tc)
{
	/* Needed to raise the mlock() limit. */
	atf_tc_set_md_var(tc, "require.user", "root");
}
ATF_TC_BODY(shared_shadow_inval__eager_cow, tc)
{
	do_shared_shadow_inval_eager();
}

ATF_TP_ADD_TCS(tp)
{
	ATF_TP_ADD_TC(tp, shared_shadow_inval__lazy_cow);
	ATF_TP_ADD_TC(tp, shared_shadow_inval__eager_cow);
	return (atf_no_error());
}
#endif /* !STANDALONE */