1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * AMD SVM-SEV Host Support.
4 *
5 * Copyright (C) 2023 Advanced Micro Devices, Inc.
6 *
7 * Author: Ashish Kalra <ashish.kalra@amd.com>
8 *
9 */
10
11#include <linux/cc_platform.h>
12#include <linux/printk.h>
13#include <linux/mm_types.h>
14#include <linux/set_memory.h>
15#include <linux/memblock.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/cpumask.h>
19#include <linux/iommu.h>
20#include <linux/amd-iommu.h>
21
22#include <asm/sev.h>
23#include <asm/processor.h>
24#include <asm/setup.h>
25#include <asm/svm.h>
26#include <asm/smp.h>
27#include <asm/cpu.h>
28#include <asm/apic.h>
29#include <asm/cpuid.h>
30#include <asm/cmdline.h>
31#include <asm/iommu.h>
32
33/*
34 * The RMP entry format is not architectural. The format is defined in PPR
35 * Family 19h Model 01h, Rev B1 processor.
36 */
37struct rmpentry {
38	union {
39		struct {
40			u64 assigned	: 1,
41			    pagesize	: 1,
42			    immutable	: 1,
43			    rsvd1	: 9,
44			    gpa		: 39,
45			    asid	: 10,
46			    vmsa	: 1,
47			    validated	: 1,
48			    rsvd2	: 1;
49		};
50		u64 lo;
51	};
52	u64 hi;
53} __packed;
54
/*
 * The first 16KB starting at RMP_BASE is used by the processor for
 * bookkeeping, so this offset must be skipped to reach the first RMP entry
 * when doing an RMP entry lookup.
 */
#define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000

/* Mask to apply to a PFN to get the first PFN of a 2MB page */
#define PFN_PMD_MASK	GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)

/* Physical base and size of the RMP, recorded by snp_probe_rmptable_info(). */
static u64 probed_rmp_base, probed_rmp_size;
/* Mapped RMP entry array (past the bookkeeping area), set by snp_rmptable_init(). */
static struct rmpentry *rmptable __ro_after_init;
/* Highest PFN that has an entry in rmptable. */
static u64 rmptable_max_pfn __ro_after_init;

/* Pages passed to snp_leak_pages(), chained through page->buddy_list. */
static LIST_HEAD(snp_leaked_pages_list);
/* Taken by snp_leak_pages() while it updates the list and the counter below. */
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);

/* Number of pages that have gone through snp_leak_pages(). */
static unsigned long snp_nr_leaked_pages;

/* Prefix the pr_*() messages that follow in this file with "SEV-SNP: ". */
#undef pr_fmt
#define pr_fmt(fmt)	"SEV-SNP: " fmt
75
76static int __mfd_enable(unsigned int cpu)
77{
78	u64 val;
79
80	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
81		return 0;
82
83	rdmsrl(MSR_AMD64_SYSCFG, val);
84
85	val |= MSR_AMD64_SYSCFG_MFDM;
86
87	wrmsrl(MSR_AMD64_SYSCFG, val);
88
89	return 0;
90}
91
92static __init void mfd_enable(void *arg)
93{
94	__mfd_enable(smp_processor_id());
95}
96
97static int __snp_enable(unsigned int cpu)
98{
99	u64 val;
100
101	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
102		return 0;
103
104	rdmsrl(MSR_AMD64_SYSCFG, val);
105
106	val |= MSR_AMD64_SYSCFG_SNP_EN;
107	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
108
109	wrmsrl(MSR_AMD64_SYSCFG, val);
110
111	return 0;
112}
113
114static __init void snp_enable(void *arg)
115{
116	__snp_enable(smp_processor_id());
117}
118
119#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
120
121bool snp_probe_rmptable_info(void)
122{
123	u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
124
125	rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
126	rdmsrl(MSR_AMD64_RMP_END, rmp_end);
127
128	if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
129		pr_err("Memory for the RMP table has not been reserved by BIOS\n");
130		return false;
131	}
132
133	if (rmp_base > rmp_end) {
134		pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
135		return false;
136	}
137
138	rmp_sz = rmp_end - rmp_base + 1;
139
140	/*
141	 * Calculate the amount the memory that must be reserved by the BIOS to
142	 * address the whole RAM, including the bookkeeping area. The RMP itself
143	 * must also be covered.
144	 */
145	max_rmp_pfn = max_pfn;
146	if (PHYS_PFN(rmp_end) > max_pfn)
147		max_rmp_pfn = PHYS_PFN(rmp_end);
148
149	calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
150
151	if (calc_rmp_sz > rmp_sz) {
152		pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
153		       calc_rmp_sz, rmp_sz);
154		return false;
155	}
156
157	probed_rmp_base = rmp_base;
158	probed_rmp_size = rmp_sz;
159
160	pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
161		probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
162
163	return true;
164}
165
166/*
167 * Do the necessary preparations which are verified by the firmware as
168 * described in the SNP_INIT_EX firmware command description in the SNP
169 * firmware ABI spec.
170 */
171static int __init snp_rmptable_init(void)
172{
173	void *rmptable_start;
174	u64 rmptable_size;
175	u64 val;
176
177	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
178		return 0;
179
180	if (!amd_iommu_snp_en)
181		goto nosnp;
182
183	if (!probed_rmp_size)
184		goto nosnp;
185
186	rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
187	if (!rmptable_start) {
188		pr_err("Failed to map RMP table\n");
189		return 1;
190	}
191
192	/*
193	 * Check if SEV-SNP is already enabled, this can happen in case of
194	 * kexec boot.
195	 */
196	rdmsrl(MSR_AMD64_SYSCFG, val);
197	if (val & MSR_AMD64_SYSCFG_SNP_EN)
198		goto skip_enable;
199
200	memset(rmptable_start, 0, probed_rmp_size);
201
202	/* Flush the caches to ensure that data is written before SNP is enabled. */
203	wbinvd_on_all_cpus();
204
205	/* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
206	on_each_cpu(mfd_enable, NULL, 1);
207
208	on_each_cpu(snp_enable, NULL, 1);
209
210skip_enable:
211	rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
212	rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
213
214	rmptable = (struct rmpentry *)rmptable_start;
215	rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
216
217	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
218
219	/*
220	 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
221	 * notifier is invoked to do SNP IOMMU shutdown before kdump.
222	 */
223	crash_kexec_post_notifiers = true;
224
225	return 0;
226
227nosnp:
228	cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
229	return -ENOSYS;
230}
231
232/*
233 * This must be called after the IOMMU has been initialized.
234 */
235device_initcall(snp_rmptable_init);
236
237static struct rmpentry *get_rmpentry(u64 pfn)
238{
239	if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
240		return ERR_PTR(-EFAULT);
241
242	return &rmptable[pfn];
243}
244
245static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
246{
247	struct rmpentry *large_entry, *entry;
248
249	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
250		return ERR_PTR(-ENODEV);
251
252	entry = get_rmpentry(pfn);
253	if (IS_ERR(entry))
254		return entry;
255
256	/*
257	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
258	 * RMP entry or a special large RMP entry that is authoritative for a
259	 * whole 2M area.
260	 */
261	large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
262	if (IS_ERR(large_entry))
263		return large_entry;
264
265	*level = RMP_TO_PG_LEVEL(large_entry->pagesize);
266
267	return entry;
268}
269
270int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
271{
272	struct rmpentry *e;
273
274	e = __snp_lookup_rmpentry(pfn, level);
275	if (IS_ERR(e))
276		return PTR_ERR(e);
277
278	*assigned = !!e->assigned;
279	return 0;
280}
281EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
282
283/*
284 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
285 * PPR for a particular CPU model and provide useful information about how a
286 * particular PFN is being utilized by the kernel/firmware at the time certain
287 * unexpected events occur, such as RMP faults.
288 */
289static void dump_rmpentry(u64 pfn)
290{
291	u64 pfn_i, pfn_end;
292	struct rmpentry *e;
293	int level;
294
295	e = __snp_lookup_rmpentry(pfn, &level);
296	if (IS_ERR(e)) {
297		pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
298		       pfn, PTR_ERR(e));
299		return;
300	}
301
302	if (e->assigned) {
303		pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
304			pfn, e->lo, e->hi);
305		return;
306	}
307
308	/*
309	 * If the RMP entry for a particular PFN is not in an assigned state,
310	 * then it is sometimes useful to get an idea of whether or not any RMP
311	 * entries for other PFNs within the same 2MB region are assigned, since
312	 * those too can affect the ability to access a particular PFN in
313	 * certain situations, such as when the PFN is being accessed via a 2MB
314	 * mapping in the host page table.
315	 */
316	pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
317	pfn_end = pfn_i + PTRS_PER_PMD;
318
319	pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
320		pfn, pfn_i, pfn_end);
321
322	while (pfn_i < pfn_end) {
323		e = __snp_lookup_rmpentry(pfn_i, &level);
324		if (IS_ERR(e)) {
325			pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
326			       PTR_ERR(e), pfn_i);
327			pfn_i++;
328			continue;
329		}
330
331		if (e->lo || e->hi)
332			pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
333		pfn_i++;
334	}
335}
336
337void snp_dump_hva_rmpentry(unsigned long hva)
338{
339	unsigned long paddr;
340	unsigned int level;
341	pgd_t *pgd;
342	pte_t *pte;
343
344	pgd = __va(read_cr3_pa());
345	pgd += pgd_index(hva);
346	pte = lookup_address_in_pgd(pgd, hva, &level);
347
348	if (!pte) {
349		pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
350		return;
351	}
352
353	paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
354	dump_rmpentry(PHYS_PFN(paddr));
355}
356
357/*
358 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
359 * Validated bit.
360 */
361int psmash(u64 pfn)
362{
363	unsigned long paddr = pfn << PAGE_SHIFT;
364	int ret;
365
366	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
367		return -ENODEV;
368
369	if (!pfn_valid(pfn))
370		return -EINVAL;
371
372	/* Binutils version 2.36 supports the PSMASH mnemonic. */
373	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
374		      : "=a" (ret)
375		      : "a" (paddr)
376		      : "memory", "cc");
377
378	return ret;
379}
380EXPORT_SYMBOL_GPL(psmash);
381
382/*
383 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
384 * and that mapping contains any 4KB pages that are set to private in the RMP
385 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
386 * owns the PFNs being transitioned will never attempt such a write, but other
387 * kernel tasks writing to other PFNs in the range may trigger these checks
388 * inadvertently due a large directmap mapping that happens to overlap such a
389 * PFN.
390 *
391 * Prevent this by splitting any 2MB+ mappings that might end up containing a
392 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
393 * PFN/rmp_level passed in.
394 *
395 * Note that there is no attempt here to scan all the RMP entries for the 2MB
396 * physical range, since it would only be worthwhile in determining if a
397 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
398 * the same shared/private state, thus avoiding the need to split the mapping.
399 * But that would mean the entries are currently in a mixed state, and so the
400 * mapping would have already been split as a result of prior transitions.
401 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
402 * currently a mechanism in place to restore 2MB+ mappings, such a check would
403 * not provide any usable benefit.
404 *
405 * More specifics on how these checks are carried out can be found in APM
406 * Volume 2, "RMP and VMPL Access Checks".
407 */
408static int adjust_direct_map(u64 pfn, int rmp_level)
409{
410	unsigned long vaddr;
411	unsigned int level;
412	int npages, ret;
413	pte_t *pte;
414
415	/*
416	 * pfn_to_kaddr() will return a vaddr only within the direct
417	 * map range.
418	 */
419	vaddr = (unsigned long)pfn_to_kaddr(pfn);
420
421	/* Only 4KB/2MB RMP entries are supported by current hardware. */
422	if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
423		return -EINVAL;
424
425	if (!pfn_valid(pfn))
426		return -EINVAL;
427
428	if (rmp_level == PG_LEVEL_2M &&
429	    (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
430		return -EINVAL;
431
432	/*
433	 * If an entire 2MB physical range is being transitioned, then there is
434	 * no risk of RMP #PFs due to write accesses from overlapping mappings,
435	 * since even accesses from 1GB mappings will be treated as 2MB accesses
436	 * as far as RMP table checks are concerned.
437	 */
438	if (rmp_level == PG_LEVEL_2M)
439		return 0;
440
441	pte = lookup_address(vaddr, &level);
442	if (!pte || pte_none(*pte))
443		return 0;
444
445	if (level == PG_LEVEL_4K)
446		return 0;
447
448	npages = page_level_size(rmp_level) / PAGE_SIZE;
449	ret = set_memory_4k(vaddr, npages);
450	if (ret)
451		pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
452			pfn, ret);
453
454	return ret;
455}
456
457/*
458 * It is expected that those operations are seldom enough so that no mutual
459 * exclusion of updaters is needed and thus the overlap error condition below
460 * should happen very rarely and would get resolved relatively quickly by
461 * the firmware.
462 *
463 * If not, one could consider introducing a mutex or so here to sync concurrent
464 * RMP updates and thus diminish the amount of cases where firmware needs to
465 * lock 2M ranges to protect against concurrent updates.
466 *
467 * The optimal solution would be range locking to avoid locking disjoint
468 * regions unnecessarily but there's no support for that yet.
469 */
470static int rmpupdate(u64 pfn, struct rmp_state *state)
471{
472	unsigned long paddr = pfn << PAGE_SHIFT;
473	int ret, level;
474
475	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
476		return -ENODEV;
477
478	level = RMP_TO_PG_LEVEL(state->pagesize);
479
480	if (adjust_direct_map(pfn, level))
481		return -EFAULT;
482
483	do {
484		/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
485		asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
486			     : "=a" (ret)
487			     : "a" (paddr), "c" ((unsigned long)state)
488			     : "memory", "cc");
489	} while (ret == RMPUPDATE_FAIL_OVERLAP);
490
491	if (ret) {
492		pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
493		       pfn, level, ret);
494		dump_rmpentry(pfn);
495		dump_stack();
496		return -EFAULT;
497	}
498
499	return 0;
500}
501
502/* Transition a page to guest-owned/private state in the RMP table. */
503int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
504{
505	struct rmp_state state;
506
507	memset(&state, 0, sizeof(state));
508	state.assigned = 1;
509	state.asid = asid;
510	state.immutable = immutable;
511	state.gpa = gpa;
512	state.pagesize = PG_LEVEL_TO_RMP(level);
513
514	return rmpupdate(pfn, &state);
515}
516EXPORT_SYMBOL_GPL(rmp_make_private);
517
518/* Transition a page to hypervisor-owned/shared state in the RMP table. */
519int rmp_make_shared(u64 pfn, enum pg_level level)
520{
521	struct rmp_state state;
522
523	memset(&state, 0, sizeof(state));
524	state.pagesize = PG_LEVEL_TO_RMP(level);
525
526	return rmpupdate(pfn, &state);
527}
528EXPORT_SYMBOL_GPL(rmp_make_shared);
529
530void snp_leak_pages(u64 pfn, unsigned int npages)
531{
532	struct page *page = pfn_to_page(pfn);
533
534	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
535
536	spin_lock(&snp_leaked_pages_list_lock);
537	while (npages--) {
538
539		/*
540		 * Reuse the page's buddy list for chaining into the leaked
541		 * pages list. This page should not be on a free list currently
542		 * and is also unsafe to be added to a free list.
543		 */
544		if (likely(!PageCompound(page)) ||
545
546			/*
547			 * Skip inserting tail pages of compound page as
548			 * page->buddy_list of tail pages is not usable.
549			 */
550		    (PageHead(page) && compound_nr(page) <= npages))
551			list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
552
553		dump_rmpentry(pfn);
554		snp_nr_leaked_pages++;
555		pfn++;
556		page++;
557	}
558	spin_unlock(&snp_leaked_pages_list_lock);
559}
560EXPORT_SYMBOL_GPL(snp_leak_pages);
561
562void kdump_sev_callback(void)
563{
564	/*
565	 * Do wbinvd() on remote CPUs when SNP is enabled in order to
566	 * safely do SNP_SHUTDOWN on the local CPU.
567	 */
568	if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
569		wbinvd();
570}
571