// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
			args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
			args->r9, args->r10, args->r11);
}

static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
				 u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)						\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.  Dump a
	 * message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu be ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Do it first (if it hasn't been done) before
	 * the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
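
/*
 * Illustrative sketch (not part of this file): a caller such as KVM is
 * expected to run tdx_cpu_enable() on each online CPU with interrupts
 * disabled, e.g. roughly:
 *
 *	static void do_tdx_cpu_enable(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			atomic_inc((atomic_t *)failed);
 *	}
 *
 *	on_each_cpu(do_tdx_cpu_enable, &failed, 1);
 *
 * The helper and the 'failed' counter above are hypothetical; only
 * tdx_cpu_enable() itself is provided here.
 */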

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip it manually to exclude it as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory region as TDX memory.  The regions in
		 * memblock are already guaranteed to be in address
		 * ascending order and to not overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

static int read_sys_metadata_field16(u64 field_id,
				     int offset,
				     struct tdx_tdmr_sysinfo *ts)
{
	u16 *ts_member = ((void *)ts) + offset;
	u64 tmp;
	int ret;

	if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
			MD_FIELD_ID_ELE_SIZE_16BIT))
		return -EINVAL;

	ret = read_sys_metadata_field(field_id, &tmp);
	if (ret)
		return ret;

	*ts_member = tmp;

	return 0;
}

struct field_mapping {
	u64 field_id;
	int offset;
};

#define TD_SYSINFO_MAP(_field_id, _offset) \
	{ .field_id = MD_FIELD_ID_##_field_id,	   \
	  .offset   = offsetof(struct tdx_tdmr_sysinfo, _offset) }

/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
static const struct field_mapping fields[] = {
	TD_SYSINFO_MAP(MAX_TDMRS,	      max_tdmrs),
	TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
	TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,    pamt_entry_size[TDX_PS_4K]),
	TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,    pamt_entry_size[TDX_PS_2M]),
	TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,    pamt_entry_size[TDX_PS_1G]),
};

static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;
	int i;

	/* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
	for (i = 0; i < ARRAY_SIZE(fields); i++) {
		ret = read_sys_metadata_field16(fields[i].field_id,
						fields[i].offset,
						tdmr_sysinfo);
		if (ret)
			return ret;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}
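
/*
 * Worked example of tdmr_size_single() (numbers are illustrative only,
 * not taken from this file): if sizeof(struct tdmr_info) were 64 bytes,
 * sizeof(struct tdmr_reserved_area) 16 bytes and max_reserved_per_tdmr
 * 16, then tdmr_sz = 64 + 16 * 16 = 320 bytes, which ALIGN() rounds up
 * to the next TDMR_INFO_ALIGNMENT boundary (512 bytes under these
 * assumptions).
 */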

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically consumes fewer than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}
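
/*
 * Example of the mapping done by fill_out_tdmrs() (addresses are made up
 * for illustration): two TDX memory blocks [1M, 3G) and [5G, 9G) are
 * expanded to the 1G-aligned ranges [0, 3G) and [5G, 9G) and consume two
 * TDMRs.  Had the second block instead started at 2.5G, its aligned range
 * [2G, 9G) would overlap the first TDMR [0, 3G); the already-covered part
 * is skipped and the second TDMR becomes [3G, 9G).
 */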

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}
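
/*
 * Worked example (the entry size is illustrative; real values come from
 * the TDX module's global metadata): for a 1G TDMR and a 16-byte 4K PAMT
 * entry, nr_pamt_entries = 1G / 4K = 262144 and pamt_sz = 262144 * 16 =
 * 4M, i.e. roughly 1/256th of the covered memory.  This matches the
 * "~1/256th of system RAM" estimate used when flushing caches in
 * config_global_keyid().
 */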

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}

/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapped TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}

static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush cache of
 * these TDX private pages.  The caller should make sure of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses WC protocol.  Use memory barrier to
	 * make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

static void tdmr_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
}

static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}
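
/*
 * Example (addresses are made up for illustration): if a TDMR covers
 * [0, 4G) and @tmb_list contains the blocks [1M, 2G) and [3G, 4G), then
 * tdmr_populate_rsvd_holes() adds two reserved areas: the leading hole
 * [0, 1M) and the hole [2G, 3G) between the two blocks.  No trailing
 * hole is added because the last block ends exactly at tdmr_end().
 */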

/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas must not overlap; the caller must guarantee that. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @tdmr_sysinfo.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
			tdmr_sysinfo->pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			tdmr_sysinfo->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDX module initialization (TDMR initialization) will fail if any
 * package in the system has no online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

static int init_tdx_module(void)
{
	struct tdx_tdmr_sysinfo tdmr_sysinfo;
	int ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Configure the key of the global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written above and read at memory hotplug time.
	 * Memory hotplug code was locked out while building it; release
	 * the lock now.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	/*
	 * According to the TDX hardware spec, if the platform
	 * doesn't have the "partial write machine check"
	 * erratum, any kernel read/write will never cause #MC
	 * in kernel space, thus it's OK to not convert PAMTs
	 * back to normal.  But do the conversion anyway here
	 * as suggested by the TDX spec.
	 */
	tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);
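
/*
 * Illustrative sketch of the calling convention documented above (the
 * caller is hypothetical; KVM is the expected user).  VMXON and
 * tdx_cpu_enable() must already have been done on all online CPUs:
 *
 *	cpus_read_lock();
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 */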

static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from a normal hardware one.
 * Just print an additional message to show that such an #MC may be a
 * result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:	Number of MKTME KeyIDs.
	 *   Bit [63:32]:	Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * at the granularity of 'struct memory_block', and the
	 * hotpluggable memory region (one memblock) must be a multiple
	 * of the memory_block size.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_model) {
	case INTEL_FAM6_SAPPHIRERAPIDS_X:
	case INTEL_FAM6_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}

void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, thus there's no point in enabling
	 * TDX at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}