// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2002 Andi Kleen
 *
 * This handles calls from both 32bit and 64bit mode.
 *
 * Lock order:
 *	context.ldt_usr_sem
 *	  mmap_lock
 *	    context.lock
 */

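/*
 * A rough sketch of where the locks above are taken in this file
 * (illustrative only; see the individual functions for details):
 *
 *	write_ldt()
 *	  down_write(&mm->context.ldt_usr_sem)
 *	    map_ldt_struct()	-> pagetable setup nests below
 *	    install_ldt()
 *	      mutex_lock(&mm->context.lock)
 */
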
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

#include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/pgtable_areas.h>

#include <xen/xen.h>

/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)

static inline void *ldt_slot_va(int slot)
{
	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
}
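
/*
 * Illustrative layout of the PTI LDT alias area, assuming the two-slot
 * scheme used below (slot is either 0 or 1; anything else is rejected in
 * load_mm_ldt()):
 *
 *	slot 0: LDT_BASE_ADDR
 *	slot 1: LDT_BASE_ADDR + LDT_SLOT_STRIDE
 */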

void load_mm_ldt(struct mm_struct *mm)
{
	struct ldt_struct *ldt;

	/* READ_ONCE synchronizes with smp_store_release */
	ldt = READ_ONCE(mm->context.ldt);

	/*
	 * Any change to mm->context.ldt is followed by an IPI to all
	 * CPUs with the mm active.  The LDT will not be freed until
	 * after the IPI is handled by all such CPUs.  This means that
	 * if the ldt_struct changes before we return, the values we see
	 * will be safe, and the new values will be loaded before we run
	 * any user code.
	 *
	 * NB: don't try to convert this to use RCU without extreme care.
	 * We would still need IRQs off, because we don't want to change
	 * the local LDT after an IPI loaded a newer value than the one
	 * that we can see.
	 */

	if (unlikely(ldt)) {
		if (static_cpu_has(X86_FEATURE_PTI)) {
			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
				/*
				 * Whoops -- either the new LDT isn't mapped
				 * (if slot == -1) or is mapped into a bogus
				 * slot (if slot > 1).
				 */
				clear_LDT();
				return;
			}

			/*
			 * If page table isolation is enabled, ldt->entries
			 * will not be mapped in the userspace pagetables.
			 * Tell the CPU to access the LDT through the alias
			 * at ldt_slot_va(ldt->slot).
			 */
			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
		} else {
			set_ldt(ldt->entries, ldt->nr_entries);
		}
	} else {
		clear_LDT();
	}
}

void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
	/*
	 * Load the LDT if either the old or new mm had an LDT.
	 *
	 * An mm will never go from having an LDT to not having an LDT.  Two
	 * mms never share an LDT, so we don't gain anything by checking to
	 * see whether the LDT changed.  There's also no guarantee that
	 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
	 * then prev->context.ldt will also be non-NULL.
	 *
	 * If we really cared, we could optimize the case where prev == next
	 * and we're exiting lazy mode.  Most of the time, if this happens,
	 * we don't actually need to reload LDTR, but modify_ldt() is mostly
	 * used by legacy code and emulators where we don't need this level of
	 * performance.
	 *
	 * This uses | instead of || because it generates better code.
	 */
	if (unlikely((unsigned long)prev->context.ldt |
		     (unsigned long)next->context.ldt))
		load_mm_ldt(next);

	DEBUG_LOCKS_WARN_ON(preemptible());
}

static void refresh_ldt_segments(void)
{
#ifdef CONFIG_X86_64
	unsigned short sel;

	/*
	 * Make sure that the cached DS and ES descriptors match the updated
	 * LDT.
	 */
	savesegment(ds, sel);
	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
		loadsegment(ds, sel);

	savesegment(es, sel);
	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
		loadsegment(es, sel);
#endif
}

/* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm)
{
	struct mm_struct *mm = __mm;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
		return;

	load_mm_ldt(mm);

	refresh_ldt_segments();
}

/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
	struct ldt_struct *new_ldt;
	unsigned int alloc_size;

	if (num_entries > LDT_ENTRIES)
		return NULL;

	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
	if (!new_ldt)
		return NULL;

	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
	alloc_size = num_entries * LDT_ENTRY_SIZE;

	/*
	 * Xen is very picky: it requires a page-aligned LDT that has no
	 * trailing nonzero bytes in any page that contains LDT descriptors.
	 * Keep it simple: zero the whole allocation and never allocate less
	 * than PAGE_SIZE.
	 */
	if (alloc_size > PAGE_SIZE)
		new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	else
		new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);

	if (!new_ldt->entries) {
		kfree(new_ldt);
		return NULL;
	}

	/* The new LDT isn't aliased for PTI yet. */
	new_ldt->slot = -1;

	new_ldt->nr_entries = num_entries;
	return new_ldt;
}
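
/*
 * Size sketch (assuming the usual LDT_ENTRIES == 8192, LDT_ENTRY_SIZE == 8
 * and 4K pages): a full LDT is 64KiB, i.e. 16 pages, so it comes from the
 * __vmalloc() path above; anything up to 512 entries fits in the single
 * zeroed-page path.
 */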

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

static void do_sanity_check(struct mm_struct *mm,
			    bool had_kernel_mapping,
			    bool had_user_mapping)
{
	if (mm->context.ldt) {
		/*
		 * We already had an LDT.  The top-level entry should already
		 * have been allocated and synchronized with the usermode
		 * tables.
		 */
		WARN_ON(!had_kernel_mapping);
		if (boot_cpu_has(X86_FEATURE_PTI))
			WARN_ON(!had_user_mapping);
	} else {
		/*
		 * This is the first time we're mapping an LDT for this process.
		 * Sync the pgd to the usermode tables.
		 */
		WARN_ON(had_kernel_mapping);
		if (boot_cpu_has(X86_FEATURE_PTI))
			WARN_ON(had_user_mapping);
	}
}

#ifdef CONFIG_X86_PAE

static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
{
	p4d_t *p4d;
	pud_t *pud;

	if (pgd->pgd == 0)
		return NULL;

	p4d = p4d_offset(pgd, va);
	if (p4d_none(*p4d))
		return NULL;

	pud = pud_offset(p4d, va);
	if (pud_none(*pud))
		return NULL;

	return pmd_offset(pud, va);
}

static void map_ldt_struct_to_user(struct mm_struct *mm)
{
	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	pmd_t *k_pmd, *u_pmd;

	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);

	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
		set_pmd(u_pmd, *k_pmd);
}

static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	bool had_kernel, had_user;
	pmd_t *k_pmd, *u_pmd;

	k_pmd      = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	u_pmd      = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
	had_kernel = (k_pmd->pmd != 0);
	had_user   = (u_pmd->pmd != 0);

	do_sanity_check(mm, had_kernel, had_user);
}

#else /* !CONFIG_X86_PAE */

static void map_ldt_struct_to_user(struct mm_struct *mm)
{
	pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);

	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
		set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}

static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
	pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
	bool had_kernel = (pgd->pgd != 0);
	bool had_user   = (kernel_to_user_pgdp(pgd)->pgd != 0);

	do_sanity_check(mm, had_kernel, had_user);
}

#endif /* CONFIG_X86_PAE */

/*
 * If PTI is enabled, this maps the LDT into the kernelmode and
 * usermode tables for the given mm.
 */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
	unsigned long va;
	bool is_vmalloc;
	spinlock_t *ptl;
	int i, nr_pages;

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return 0;

	/*
	 * Any given ldt_struct should have map_ldt_struct() called at most
	 * once.
	 */
	WARN_ON(ldt->slot != -1);

	/* Check if the current mappings are sane */
	sanity_check_ldt_mapping(mm);

	is_vmalloc = is_vmalloc_addr(ldt->entries);

	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);

	for (i = 0; i < nr_pages; i++) {
		unsigned long offset = i << PAGE_SHIFT;
		const void *src = (char *)ldt->entries + offset;
		unsigned long pfn;
		pgprot_t pte_prot;
		pte_t pte, *ptep;

		va = (unsigned long)ldt_slot_va(slot) + offset;
		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
			page_to_pfn(virt_to_page(src));
		/*
		 * Treat the PTI LDT range as a *userspace* range.
		 * get_locked_pte() will allocate all needed pagetables
		 * and account for them in this mm.
		 */
		ptep = get_locked_pte(mm, va, &ptl);
		if (!ptep)
			return -ENOMEM;
		/*
		 * Map it RO so the easy-to-find address is not a primary
		 * target via some kernel interface which misses a
		 * permission check.
		 */
		pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
		/* Filter out unsupported __PAGE_KERNEL* bits: */
		pgprot_val(pte_prot) &= __supported_pte_mask;
		pte = pfn_pte(pfn, pte_prot);
		set_pte_at(mm, va, ptep, pte);
		pte_unmap_unlock(ptep, ptl);
	}

	/* Propagate LDT mapping to the user page-table */
	map_ldt_struct_to_user(mm);

	ldt->slot = slot;
	return 0;
}

static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
	unsigned long va;
	int i, nr_pages;

	if (!ldt)
		return;

	/* LDT map/unmap is only required for PTI */
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);

	for (i = 0; i < nr_pages; i++) {
		unsigned long offset = i << PAGE_SHIFT;
		spinlock_t *ptl;
		pte_t *ptep;

		va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
		ptep = get_locked_pte(mm, va, &ptl);
		if (!WARN_ON_ONCE(!ptep)) {
			pte_clear(mm, va, ptep);
			pte_unmap_unlock(ptep, ptl);
		}
	}

	va = (unsigned long)ldt_slot_va(ldt->slot);
	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
}

#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
	return 0;
}

static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	struct mmu_gather tlb;
	unsigned long start = LDT_BASE_ADDR;
	unsigned long end = LDT_END_ADDR;

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	/*
	 * Although free_pgd_range() is intended for freeing user
	 * page-tables, it also works out for kernel mappings on x86.
	 * We use tlb_gather_mmu_fullmm() to avoid confusing the
	 * range-tracking logic in __tlb_adjust_range().
	 */
	tlb_gather_mmu_fullmm(&tlb, mm);
	free_pgd_range(&tlb, start, end, start, end);
	tlb_finish_mmu(&tlb);
#endif
}

/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}

static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
	mutex_lock(&mm->context.lock);

	/* Synchronizes with READ_ONCE in load_mm_ldt. */
	smp_store_release(&mm->context.ldt, ldt);

	/* Activate the LDT for all CPUs using current's mm. */
	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);

	mutex_unlock(&mm->context.lock);
}

static void free_ldt_struct(struct ldt_struct *ldt)
{
	if (likely(!ldt))
		return;

	paravirt_free_ldt(ldt->entries, ldt->nr_entries);
	if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
		vfree_atomic(ldt->entries);
	else
		free_page((unsigned long)ldt->entries);
	kfree(ldt);
}

/*
 * Called on fork from arch_dup_mmap(). Just copy the current LDT state;
 * the new task is not running, so nothing can be installed.
 */
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
	struct ldt_struct *new_ldt;
	int retval = 0;

	if (!old_mm)
		return 0;

	mutex_lock(&old_mm->context.lock);
	if (!old_mm->context.ldt)
		goto out_unlock;

	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
	if (!new_ldt) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
	finalize_ldt_struct(new_ldt);

	retval = map_ldt_struct(mm, new_ldt, 0);
	if (retval) {
		free_ldt_pgtables(mm);
		free_ldt_struct(new_ldt);
		goto out_unlock;
	}
	mm->context.ldt = new_ldt;

out_unlock:
	mutex_unlock(&old_mm->context.lock);
	return retval;
}

/*
 * No need to lock the MM as we are the last user
 *
 * 64bit: Don't touch the LDT register - we're already in the next thread.
 */
void destroy_context_ldt(struct mm_struct *mm)
{
	free_ldt_struct(mm->context.ldt);
	mm->context.ldt = NULL;
}

void ldt_arch_exit_mmap(struct mm_struct *mm)
{
	free_ldt_pgtables(mm);
}

static int read_ldt(void __user *ptr, unsigned long bytecount)
{
	struct mm_struct *mm = current->mm;
	unsigned long entries_size;
	int retval;

	down_read(&mm->context.ldt_usr_sem);

	if (!mm->context.ldt) {
		retval = 0;
		goto out_unlock;
	}

	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;

	entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
	if (entries_size > bytecount)
		entries_size = bytecount;

	if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
		retval = -EFAULT;
		goto out_unlock;
	}

	if (entries_size != bytecount) {
		/* Zero-fill the rest and pretend we read bytecount bytes. */
		if (clear_user(ptr + entries_size, bytecount - entries_size)) {
			retval = -EFAULT;
			goto out_unlock;
		}
	}
	retval = bytecount;

out_unlock:
	up_read(&mm->context.ldt_usr_sem);
	return retval;
}
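
/*
 * Worked example of the semantics above (illustrative): with a
 * single-entry LDT (8 bytes with the usual LDT_ENTRY_SIZE) and
 * bytecount == 32, the first 8 bytes are copied from the LDT, bytes
 * 8..31 are zero-filled, and the return value is 32.
 */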

static int read_default_ldt(void __user *ptr, unsigned long bytecount)
{
	/* CHECKME: Can we use _one_ random number ? */
#ifdef CONFIG_X86_32
	unsigned long size = 5 * sizeof(struct desc_struct);
#else
	unsigned long size = 128;
#endif
	if (bytecount > size)
		bytecount = size;
	if (clear_user(ptr, bytecount))
		return -EFAULT;
	return bytecount;
}

static bool allow_16bit_segments(void)
{
	if (!IS_ENABLED(CONFIG_X86_16BIT))
		return false;

#ifdef CONFIG_XEN_PV
	/*
	 * Xen PV does not implement ESPFIX64, which means that 16-bit
	 * segments will not work correctly.  Until either Xen PV implements
	 * ESPFIX64 and can signal this fact to the guest or unless someone
	 * provides compelling evidence that allowing broken 16-bit segments
	 * is worthwhile, disallow 16-bit segments under Xen PV.
	 */
	if (xen_pv_domain()) {
		pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
		return false;
	}
#endif

	return true;
}

static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
	struct mm_struct *mm = current->mm;
	struct ldt_struct *new_ldt, *old_ldt;
	unsigned int old_nr_entries, new_nr_entries;
	struct user_desc ldt_info;
	struct desc_struct ldt;
	int error;

	error = -EINVAL;
	if (bytecount != sizeof(ldt_info))
		goto out;
	error = -EFAULT;
	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
		goto out;

	error = -EINVAL;
	if (ldt_info.entry_number >= LDT_ENTRIES)
		goto out;
	if (ldt_info.contents == 3) {
		if (oldmode)
			goto out;
		if (ldt_info.seg_not_present == 0)
			goto out;
	}

	if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
	    LDT_empty(&ldt_info)) {
		/* The user wants to clear the entry. */
		memset(&ldt, 0, sizeof(ldt));
	} else {
		if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
			error = -EINVAL;
			goto out;
		}

		fill_ldt(&ldt, &ldt_info);
		if (oldmode)
			ldt.avl = 0;
	}

	if (down_write_killable(&mm->context.ldt_usr_sem))
		return -EINTR;

	old_ldt       = mm->context.ldt;
	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
	new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);

	error = -ENOMEM;
	new_ldt = alloc_ldt_struct(new_nr_entries);
	if (!new_ldt)
		goto out_unlock;

	if (old_ldt)
		memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);

	new_ldt->entries[ldt_info.entry_number] = ldt;
	finalize_ldt_struct(new_ldt);

	/*
	 * If we are using PTI, map the new LDT into the userspace pagetables.
	 * If there is already an LDT, use the other slot so that other CPUs
	 * will continue to use the old LDT until install_ldt() switches
	 * them over to the new LDT.
	 */
	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
	if (error) {
		/*
		 * This can only fail for the first LDT setup. If an LDT is
		 * already installed then the PTE page is already
		 * populated. Mop up a half-populated page table.
		 */
		if (!WARN_ON_ONCE(old_ldt))
			free_ldt_pgtables(mm);
		free_ldt_struct(new_ldt);
		goto out_unlock;
	}

	install_ldt(mm, new_ldt);
	unmap_ldt_struct(mm, old_ldt);
	free_ldt_struct(old_ldt);
	error = 0;

out_unlock:
	up_write(&mm->context.ldt_usr_sem);
out:
	return error;
}
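
/*
 * Slot usage sketch for the PTI case (illustrative): the first successful
 * write_ldt() maps the new LDT at slot 0; each subsequent call maps the
 * replacement into the other slot (1, then 0, ...) so that CPUs still
 * pointing at the old LDT keep a valid alias until install_ldt() has
 * switched everyone over.
 */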

SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
		unsigned long , bytecount)
{
	int ret = -ENOSYS;

	switch (func) {
	case 0:
		ret = read_ldt(ptr, bytecount);
		break;
	case 1:
		ret = write_ldt(ptr, bytecount, 1);
		break;
	case 2:
		ret = read_default_ldt(ptr, bytecount);
		break;
	case 0x11:
		ret = write_ldt(ptr, bytecount, 0);
		break;
	}
	/*
	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
	 * return type, but the ABI for sys_modify_ldt() expects
	 * 'int'.  This cast gives us an int-sized value in %rax
	 * for the return code.  The 'unsigned' is necessary so
	 * the compiler does not try to sign-extend the negative
	 * return codes into the high half of the register when
	 * taking the value from int->long.
	 */
	return (unsigned int)ret;
}
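
/*
 * Userspace usage sketch (illustrative, not part of the kernel): write one
 * descriptor with func 0x11 and read the table back with func 0, assuming
 * glibc's syscall(2) wrapper and the definitions in <asm/ldt.h>:
 *
 *	#include <asm/ldt.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct user_desc d = {
 *		.entry_number	= 0,
 *		.base_addr	= 0,
 *		.limit		= 0xfffff,
 *		.seg_32bit	= 1,
 *		.contents	= MODIFY_LDT_CONTENTS_DATA,
 *		.limit_in_pages	= 1,
 *		.useable	= 1,
 *	};
 *	unsigned char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
 *
 *	syscall(SYS_modify_ldt, 0x11, &d, sizeof(d));	// write_ldt(), new mode
 *	syscall(SYS_modify_ldt, 0, buf, sizeof(buf));	// read_ldt()
 */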