1// SPDX-License-Identifier: GPL-2.0
2/*
3 * shstk.c - Intel shadow stack support
4 *
5 * Copyright (c) 2021, Intel Corporation.
6 * Yu-cheng Yu <yu-cheng.yu@intel.com>
7 */
8
9#include <linux/sched.h>
10#include <linux/bitops.h>
11#include <linux/types.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/slab.h>
15#include <linux/uaccess.h>
16#include <linux/sched/signal.h>
17#include <linux/compat.h>
18#include <linux/sizes.h>
19#include <linux/user.h>
20#include <linux/syscalls.h>
21#include <asm/msr.h>
22#include <asm/fpu/xstate.h>
23#include <asm/fpu/types.h>
24#include <asm/shstk.h>
25#include <asm/special_insns.h>
26#include <asm/fpu/api.h>
27#include <asm/prctl.h>
28
/*
 * Number of bytes the shadow stack pointer is moved for each 64-bit value
 * this file writes to the user shadow stack.
 */
#define SS_FRAME_SIZE 8
30
31static bool features_enabled(unsigned long features)
32{
33	return current->thread.features & features;
34}
35
36static void features_set(unsigned long features)
37{
38	current->thread.features |= features;
39}
40
41static void features_clr(unsigned long features)
42{
43	current->thread.features &= ~features;
44}
45
46/*
47 * Create a restore token on the shadow stack.  A token is always 8-byte
48 * and aligned to 8.
49 */
50static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
51{
52	unsigned long addr;
53
54	/* Token must be aligned */
55	if (!IS_ALIGNED(ssp, 8))
56		return -EINVAL;
57
58	addr = ssp - SS_FRAME_SIZE;
59
60	/*
61	 * SSP is aligned, so reserved bits and mode bit are a zero, just mark
62	 * the token 64-bit.
63	 */
64	ssp |= BIT(0);
65
66	if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
67		return -EFAULT;
68
69	if (token_addr)
70		*token_addr = addr;
71
72	return 0;
73}
74
75/*
76 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
77 * itself from attacks. The reasoning is as follows:
78 *
79 * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
80 * INCSSP instruction can increment the shadow stack pointer. It is the
81 * shadow stack analog of an instruction like:
82 *
83 *   addq $0x80, %rsp
84 *
85 * However, there is one important difference between an ADD on %rsp
86 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
87 * memory of the first and last elements that were "popped". It can be
88 * thought of as acting like this:
89 *
90 * READ_ONCE(ssp);       // read+discard top element on stack
91 * ssp += nr_to_pop * 8; // move the shadow stack
92 * READ_ONCE(ssp-8);     // read+discard last popped stack element
93 *
94 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
95 * it would read the memory. Therefore a single page gap will be enough
96 * to prevent any operation from shifting the SSP to an adjacent stack,
97 * since it would have to land in the gap at least once, causing a
98 * fault.
99 */
100static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
101				 unsigned long token_offset, bool set_res_tok)
102{
103	int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
104	struct mm_struct *mm = current->mm;
105	unsigned long mapped_addr, unused;
106
107	if (addr)
108		flags |= MAP_FIXED_NOREPLACE;
109
110	mmap_write_lock(mm);
111	mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
112			      VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
113	mmap_write_unlock(mm);
114
115	if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
116		goto out;
117
118	if (create_rstor_token(mapped_addr + token_offset, NULL)) {
119		vm_munmap(mapped_addr, size);
120		return -EINVAL;
121	}
122
123out:
124	return mapped_addr;
125}
126
127static unsigned long adjust_shstk_size(unsigned long size)
128{
129	if (size)
130		return PAGE_ALIGN(size);
131
132	return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
133}
134
135static void unmap_shadow_stack(u64 base, u64 size)
136{
137	int r;
138
139	r = vm_munmap(base, size);
140
141	/*
142	 * mmap_write_lock_killable() failed with -EINTR. This means
143	 * the process is about to die and have it's MM cleaned up.
144	 * This task shouldn't ever make it back to userspace. In this
145	 * case it is ok to leak a shadow stack, so just exit out.
146	 */
147	if (r == -EINTR)
148		return;
149
150	/*
151	 * For all other types of vm_munmap() failure, either the
152	 * system is out of memory or there is bug.
153	 */
154	WARN_ON_ONCE(r);
155}
156
157static int shstk_setup(void)
158{
159	struct thread_shstk *shstk = &current->thread.shstk;
160	unsigned long addr, size;
161
162	/* Already enabled */
163	if (features_enabled(ARCH_SHSTK_SHSTK))
164		return 0;
165
166	/* Also not supported for 32 bit and x32 */
167	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
168		return -EOPNOTSUPP;
169
170	size = adjust_shstk_size(0);
171	addr = alloc_shstk(0, size, 0, false);
172	if (IS_ERR_VALUE(addr))
173		return PTR_ERR((void *)addr);
174
175	fpregs_lock_and_load();
176	wrmsrl(MSR_IA32_PL3_SSP, addr + size);
177	wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
178	fpregs_unlock();
179
180	shstk->base = addr;
181	shstk->size = size;
182	features_set(ARCH_SHSTK_SHSTK);
183
184	return 0;
185}
186
187void reset_thread_features(void)
188{
189	memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
190	current->thread.features = 0;
191	current->thread.features_locked = 0;
192}
193
194unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
195				       unsigned long stack_size)
196{
197	struct thread_shstk *shstk = &tsk->thread.shstk;
198	unsigned long addr, size;
199
200	/*
201	 * If shadow stack is not enabled on the new thread, skip any
202	 * switch to a new shadow stack.
203	 */
204	if (!features_enabled(ARCH_SHSTK_SHSTK))
205		return 0;
206
207	/*
208	 * For CLONE_VFORK the child will share the parents shadow stack.
209	 * Make sure to clear the internal tracking of the thread shadow
210	 * stack so the freeing logic run for child knows to leave it alone.
211	 */
212	if (clone_flags & CLONE_VFORK) {
213		shstk->base = 0;
214		shstk->size = 0;
215		return 0;
216	}
217
218	/*
219	 * For !CLONE_VM the child will use a copy of the parents shadow
220	 * stack.
221	 */
222	if (!(clone_flags & CLONE_VM))
223		return 0;
224
225	size = adjust_shstk_size(stack_size);
226	addr = alloc_shstk(0, size, 0, false);
227	if (IS_ERR_VALUE(addr))
228		return addr;
229
230	shstk->base = addr;
231	shstk->size = size;
232
233	return addr + size;
234}
235
236static unsigned long get_user_shstk_addr(void)
237{
238	unsigned long long ssp;
239
240	fpregs_lock_and_load();
241
242	rdmsrl(MSR_IA32_PL3_SSP, ssp);
243
244	fpregs_unlock();
245
246	return ssp;
247}
248
/*
 * Bit 63: set by put_shstk_data() on every value it writes to the shadow
 * stack, and required (then stripped) by get_shstk_data() when reading a
 * value back.
 */
#define SHSTK_DATA_BIT BIT(63)
250
251static int put_shstk_data(u64 __user *addr, u64 data)
252{
253	if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
254		return -EINVAL;
255
256	/*
257	 * Mark the high bit so that the sigframe can't be processed as a
258	 * return address.
259	 */
260	if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
261		return -EFAULT;
262	return 0;
263}
264
265static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
266{
267	unsigned long ldata;
268
269	if (unlikely(get_user(ldata, addr)))
270		return -EFAULT;
271
272	if (!(ldata & SHSTK_DATA_BIT))
273		return -EINVAL;
274
275	*data = ldata & ~SHSTK_DATA_BIT;
276
277	return 0;
278}
279
280static int shstk_push_sigframe(unsigned long *ssp)
281{
282	unsigned long target_ssp = *ssp;
283
284	/* Token must be aligned */
285	if (!IS_ALIGNED(target_ssp, 8))
286		return -EINVAL;
287
288	*ssp -= SS_FRAME_SIZE;
289	if (put_shstk_data((void __user *)*ssp, target_ssp))
290		return -EFAULT;
291
292	return 0;
293}
294
295static int shstk_pop_sigframe(unsigned long *ssp)
296{
297	struct vm_area_struct *vma;
298	unsigned long token_addr;
299	bool need_to_check_vma;
300	int err = 1;
301
302	/*
303	 * It is possible for the SSP to be off the end of a shadow stack by 4
304	 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
305	 * before it, it might be this case, so check that the address being
306	 * read is actually shadow stack.
307	 */
308	if (!IS_ALIGNED(*ssp, 8))
309		return -EINVAL;
310
311	need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
312
313	if (need_to_check_vma)
314		mmap_read_lock_killable(current->mm);
315
316	err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
317	if (unlikely(err))
318		goto out_err;
319
320	if (need_to_check_vma) {
321		vma = find_vma(current->mm, *ssp);
322		if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
323			err = -EFAULT;
324			goto out_err;
325		}
326
327		mmap_read_unlock(current->mm);
328	}
329
330	/* Restore SSP aligned? */
331	if (unlikely(!IS_ALIGNED(token_addr, 8)))
332		return -EINVAL;
333
334	/* SSP in userspace? */
335	if (unlikely(token_addr >= TASK_SIZE_MAX))
336		return -EINVAL;
337
338	*ssp = token_addr;
339
340	return 0;
341out_err:
342	if (need_to_check_vma)
343		mmap_read_unlock(current->mm);
344	return err;
345}
346
347int setup_signal_shadow_stack(struct ksignal *ksig)
348{
349	void __user *restorer = ksig->ka.sa.sa_restorer;
350	unsigned long ssp;
351	int err;
352
353	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
354	    !features_enabled(ARCH_SHSTK_SHSTK))
355		return 0;
356
357	if (!restorer)
358		return -EINVAL;
359
360	ssp = get_user_shstk_addr();
361	if (unlikely(!ssp))
362		return -EINVAL;
363
364	err = shstk_push_sigframe(&ssp);
365	if (unlikely(err))
366		return err;
367
368	/* Push restorer address */
369	ssp -= SS_FRAME_SIZE;
370	err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
371	if (unlikely(err))
372		return -EFAULT;
373
374	fpregs_lock_and_load();
375	wrmsrl(MSR_IA32_PL3_SSP, ssp);
376	fpregs_unlock();
377
378	return 0;
379}
380
381int restore_signal_shadow_stack(void)
382{
383	unsigned long ssp;
384	int err;
385
386	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
387	    !features_enabled(ARCH_SHSTK_SHSTK))
388		return 0;
389
390	ssp = get_user_shstk_addr();
391	if (unlikely(!ssp))
392		return -EINVAL;
393
394	err = shstk_pop_sigframe(&ssp);
395	if (unlikely(err))
396		return err;
397
398	fpregs_lock_and_load();
399	wrmsrl(MSR_IA32_PL3_SSP, ssp);
400	fpregs_unlock();
401
402	return 0;
403}
404
405void shstk_free(struct task_struct *tsk)
406{
407	struct thread_shstk *shstk = &tsk->thread.shstk;
408
409	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
410	    !features_enabled(ARCH_SHSTK_SHSTK))
411		return;
412
413	/*
414	 * When fork() with CLONE_VM fails, the child (tsk) already has a
415	 * shadow stack allocated, and exit_thread() calls this function to
416	 * free it.  In this case the parent (current) and the child share
417	 * the same mm struct.
418	 */
419	if (!tsk->mm || tsk->mm != current->mm)
420		return;
421
422	/*
423	 * If shstk->base is NULL, then this task is not managing its
424	 * own shadow stack (CLONE_VFORK). So skip freeing it.
425	 */
426	if (!shstk->base)
427		return;
428
429	/*
430	 * shstk->base is NULL for CLONE_VFORK child tasks, and so is
431	 * normal. But size = 0 on a shstk->base is not normal and
432	 * indicated an attempt to free the thread shadow stack twice.
433	 * Warn about it.
434	 */
435	if (WARN_ON(!shstk->size))
436		return;
437
438	unmap_shadow_stack(shstk->base, shstk->size);
439
440	shstk->size = 0;
441}
442
443static int wrss_control(bool enable)
444{
445	u64 msrval;
446
447	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
448		return -EOPNOTSUPP;
449
450	/*
451	 * Only enable WRSS if shadow stack is enabled. If shadow stack is not
452	 * enabled, WRSS will already be disabled, so don't bother clearing it
453	 * when disabling.
454	 */
455	if (!features_enabled(ARCH_SHSTK_SHSTK))
456		return -EPERM;
457
458	/* Already enabled/disabled? */
459	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
460		return 0;
461
462	fpregs_lock_and_load();
463	rdmsrl(MSR_IA32_U_CET, msrval);
464
465	if (enable) {
466		features_set(ARCH_SHSTK_WRSS);
467		msrval |= CET_WRSS_EN;
468	} else {
469		features_clr(ARCH_SHSTK_WRSS);
470		if (!(msrval & CET_WRSS_EN))
471			goto unlock;
472
473		msrval &= ~CET_WRSS_EN;
474	}
475
476	wrmsrl(MSR_IA32_U_CET, msrval);
477
478unlock:
479	fpregs_unlock();
480
481	return 0;
482}
483
484static int shstk_disable(void)
485{
486	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
487		return -EOPNOTSUPP;
488
489	/* Already disabled? */
490	if (!features_enabled(ARCH_SHSTK_SHSTK))
491		return 0;
492
493	fpregs_lock_and_load();
494	/* Disable WRSS too when disabling shadow stack */
495	wrmsrl(MSR_IA32_U_CET, 0);
496	wrmsrl(MSR_IA32_PL3_SSP, 0);
497	fpregs_unlock();
498
499	shstk_free(current);
500	features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
501
502	return 0;
503}
504
505SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
506{
507	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
508	unsigned long aligned_size;
509
510	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
511		return -EOPNOTSUPP;
512
513	if (flags & ~SHADOW_STACK_SET_TOKEN)
514		return -EINVAL;
515
516	/* If there isn't space for a token */
517	if (set_tok && size < 8)
518		return -ENOSPC;
519
520	if (addr && addr < SZ_4G)
521		return -ERANGE;
522
523	/*
524	 * An overflow would result in attempting to write the restore token
525	 * to the wrong location. Not catastrophic, but just return the right
526	 * error code and block it.
527	 */
528	aligned_size = PAGE_ALIGN(size);
529	if (aligned_size < size)
530		return -EOVERFLOW;
531
532	return alloc_shstk(addr, aligned_size, size, set_tok);
533}
534
535long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
536{
537	unsigned long features = arg2;
538
539	if (option == ARCH_SHSTK_STATUS) {
540		return put_user(task->thread.features, (unsigned long __user *)arg2);
541	}
542
543	if (option == ARCH_SHSTK_LOCK) {
544		task->thread.features_locked |= features;
545		return 0;
546	}
547
548	/* Only allow via ptrace */
549	if (task != current) {
550		if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
551			task->thread.features_locked &= ~features;
552			return 0;
553		}
554		return -EINVAL;
555	}
556
557	/* Do not allow to change locked features */
558	if (features & task->thread.features_locked)
559		return -EPERM;
560
561	/* Only support enabling/disabling one feature at a time. */
562	if (hweight_long(features) > 1)
563		return -EINVAL;
564
565	if (option == ARCH_SHSTK_DISABLE) {
566		if (features & ARCH_SHSTK_WRSS)
567			return wrss_control(false);
568		if (features & ARCH_SHSTK_SHSTK)
569			return shstk_disable();
570		return -EINVAL;
571	}
572
573	/* Handle ARCH_SHSTK_ENABLE */
574	if (features & ARCH_SHSTK_SHSTK)
575		return shstk_setup();
576	if (features & ARCH_SHSTK_WRSS)
577		return wrss_control(true);
578	return -EINVAL;
579}
580