1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Architecture neutral utility routines for interacting with
5 * Hyper-V. This file is specifically for code that must be
6 * built-in to the kernel image when CONFIG_HYPERV is set
7 * (vs. being in a module) because it is called from architecture
8 * specific code under arch/.
9 *
10 * Copyright (C) 2021, Microsoft, Inc.
11 *
12 * Author : Michael Kelley <mikelley@microsoft.com>
13 */
14
15#include <linux/types.h>
16#include <linux/acpi.h>
17#include <linux/export.h>
18#include <linux/bitfield.h>
19#include <linux/cpumask.h>
20#include <linux/sched/task_stack.h>
21#include <linux/panic_notifier.h>
22#include <linux/ptrace.h>
23#include <linux/random.h>
24#include <linux/efi.h>
25#include <linux/kdebug.h>
26#include <linux/kmsg_dump.h>
27#include <linux/sizes.h>
28#include <linux/slab.h>
29#include <linux/dma-map-ops.h>
30#include <linux/set_memory.h>
31#include <asm/hyperv-tlfs.h>
32#include <asm/mshyperv.h>
33
34/*
35 * hv_root_partition, ms_hyperv and hv_nested are defined here with other
36 * Hyper-V specific globals so they are shared across all architectures and are
37 * built only when CONFIG_HYPERV is defined.  But on x86,
38 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
39 * defined, and it uses these three variables.  So mark them as __weak
40 * here, allowing for an overriding definition in the module containing
41 * ms_hyperv_init_platform().
42 */
bool __weak hv_root_partition;
EXPORT_SYMBOL_GPL(hv_root_partition);

/* Not referenced elsewhere in this file; presumably set by arch code. */
bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);

/*
 * Hyper-V feature/privilege information.  This file reads the
 * misc_features, priv_high and paravisor_present members.
 */
struct ms_hyperv_info __weak ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);

/*
 * Array indexed by Linux CPU number.  Allocated and set to VP_INVAL in
 * hv_common_init(); each entry is filled from HV_MSR_VP_INDEX by
 * hv_common_cpu_init(); freed in hv_common_free().
 */
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);

/* Largest VP index recorded so far by hv_common_cpu_init(). */
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);

/*
 * Per-CPU slot holding a pointer to the memory that
 * hv_common_cpu_init() allocates for this CPU.
 */
void * __percpu *hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);

/*
 * Per-CPU slot allocated only when hv_root_partition is set.  When
 * populated it points HV_HYP_PAGE_SIZE bytes past the input argument
 * memory of the same CPU.
 */
void * __percpu *hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);

/* Defined further down; needed by hv_common_free(). */
static void hv_kmsg_dump_unregister(void);

/*
 * Handle returned by register_sysctl() in hv_common_init() and
 * released in hv_common_free().
 */
static struct ctl_table_header *hv_ctl_table_hdr;
67
68/*
69 * Hyper-V specific initialization and shutdown code that is
70 * common across all architectures.  Called from architecture
71 * specific initialization functions.
72 */
73
74void __init hv_common_free(void)
75{
76	unregister_sysctl_table(hv_ctl_table_hdr);
77	hv_ctl_table_hdr = NULL;
78
79	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
80		hv_kmsg_dump_unregister();
81
82	kfree(hv_vp_index);
83	hv_vp_index = NULL;
84
85	free_percpu(hyperv_pcpu_output_arg);
86	hyperv_pcpu_output_arg = NULL;
87
88	free_percpu(hyperv_pcpu_input_arg);
89	hyperv_pcpu_input_arg = NULL;
90}
91
92/*
93 * Functions for allocating and freeing memory with size and
94 * alignment HV_HYP_PAGE_SIZE. These functions are needed because
95 * the guest page size may not be the same as the Hyper-V page
96 * size. We depend upon kmalloc() aligning power-of-two size
97 * allocations to the allocation size boundary, so that the
98 * allocated memory appears to Hyper-V as a page of the size
99 * it expects.
100 */
101
102void *hv_alloc_hyperv_page(void)
103{
104	BUILD_BUG_ON(PAGE_SIZE <  HV_HYP_PAGE_SIZE);
105
106	if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
107		return (void *)__get_free_page(GFP_KERNEL);
108	else
109		return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
110}
111EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
112
113void *hv_alloc_hyperv_zeroed_page(void)
114{
115	if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
116		return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
117	else
118		return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
119}
120EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
121
122void hv_free_hyperv_page(void *addr)
123{
124	if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
125		free_page((unsigned long)addr);
126	else
127		kfree(addr);
128}
129EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
130
/*
 * Buffer that hv_kmsg_dump() fills with kernel log data and whose
 * physical address is handed to Hyper-V.  Allocated by
 * hv_kmsg_dump_register(); stays NULL when that registration did not
 * happen or failed.
 */
static void *hv_panic_page;

/*
 * Boolean to control whether to report panic messages over Hyper-V.
 *
 * It can be set via /proc/sys/kernel/hyperv_record_panic_msg
 *
 * hv_common_init() forces it to 0 when hv_is_isolation_supported()
 * returns true.
 */
static int sysctl_record_panic_msg = 1;

/*
 * sysctl option to allow the user to control whether kmsg data should be
 * reported to Hyper-V on panic.  Accepted values are limited to 0 and 1
 * through proc_dointvec_minmax with SYSCTL_ZERO/SYSCTL_ONE bounds.
 */
static struct ctl_table hv_ctl_table[] = {
	{
		.procname	= "hyperv_record_panic_msg",
		.data		= &sysctl_record_panic_msg,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE
	},
};
155
/* Forward declaration: the notifier blocks below reference the callback. */
static int hv_die_panic_notify_crash(struct notifier_block *self,
				     unsigned long val, void *args);

/*
 * Both notifier blocks share one callback.  The callback compares the
 * block it is invoked with against hyperv_panic_report_block to tell a
 * panic notification apart from a die notification.
 */
static struct notifier_block hyperv_die_report_block = {
	.notifier_call = hv_die_panic_notify_crash,
};

static struct notifier_block hyperv_panic_report_block = {
	.notifier_call = hv_die_panic_notify_crash,
};
166
167/*
168 * The following callback works both as die and panic notifier; its
169 * goal is to provide panic information to the hypervisor unless the
170 * kmsg dumper is used [see hv_kmsg_dump()], which provides more
171 * information but isn't always available.
172 *
173 * Notice that both the panic/die report notifiers are registered only
174 * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
175 */
176static int hv_die_panic_notify_crash(struct notifier_block *self,
177				     unsigned long val, void *args)
178{
179	struct pt_regs *regs;
180	bool is_die;
181
182	/* Don't notify Hyper-V unless we have a die oops event or panic. */
183	if (self == &hyperv_panic_report_block) {
184		is_die = false;
185		regs = current_pt_regs();
186	} else { /* die event */
187		if (val != DIE_OOPS)
188			return NOTIFY_DONE;
189
190		is_die = true;
191		regs = ((struct die_args *)args)->regs;
192	}
193
194	/*
195	 * Hyper-V should be notified only once about a panic/die. If we will
196	 * be calling hv_kmsg_dump() later with kmsg data, don't do the
197	 * notification here.
198	 */
199	if (!sysctl_record_panic_msg || !hv_panic_page)
200		hyperv_report_panic(regs, val, is_die);
201
202	return NOTIFY_DONE;
203}
204
205/*
206 * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
207 * buffer and call into Hyper-V to transfer the data.
208 */
209static void hv_kmsg_dump(struct kmsg_dumper *dumper,
210			 enum kmsg_dump_reason reason)
211{
212	struct kmsg_dump_iter iter;
213	size_t bytes_written;
214
215	/* We are only interested in panics. */
216	if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
217		return;
218
219	/*
220	 * Write dump contents to the page. No need to synchronize; panic should
221	 * be single-threaded.
222	 */
223	kmsg_dump_rewind(&iter);
224	kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
225			     &bytes_written);
226	if (!bytes_written)
227		return;
228	/*
229	 * P3 to contain the physical address of the panic page & P4 to
230	 * contain the size of the panic data in that page. Rest of the
231	 * registers are no-op when the NOTIFY_MSG flag is set.
232	 */
233	hv_set_msr(HV_MSR_CRASH_P0, 0);
234	hv_set_msr(HV_MSR_CRASH_P1, 0);
235	hv_set_msr(HV_MSR_CRASH_P2, 0);
236	hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page));
237	hv_set_msr(HV_MSR_CRASH_P4, bytes_written);
238
239	/*
240	 * Let Hyper-V know there is crash data available along with
241	 * the panic message.
242	 */
243	hv_set_msr(HV_MSR_CRASH_CTL,
244		   (HV_CRASH_CTL_CRASH_NOTIFY |
245		    HV_CRASH_CTL_CRASH_NOTIFY_MSG));
246}
247
/*
 * Dumper descriptor registered by hv_kmsg_dump_register() and removed
 * by hv_kmsg_dump_unregister().
 */
static struct kmsg_dumper hv_kmsg_dumper = {
	.dump = hv_kmsg_dump,
};
251
/*
 * Tear down all panic reporting hooks: the kmsg dumper plus the die
 * and panic notifiers, then release the panic page.  The page is freed
 * only after the dumper has been unregistered, since hv_kmsg_dump()
 * writes into it.
 */
static void hv_kmsg_dump_unregister(void)
{
	kmsg_dump_unregister(&hv_kmsg_dumper);
	unregister_die_notifier(&hyperv_die_report_block);
	atomic_notifier_chain_unregister(&panic_notifier_list,
					 &hyperv_panic_report_block);

	/* hv_panic_page is NULL here if the dumper was never set up. */
	hv_free_hyperv_page(hv_panic_page);
	hv_panic_page = NULL;
}
262
263static void hv_kmsg_dump_register(void)
264{
265	int ret;
266
267	hv_panic_page = hv_alloc_hyperv_zeroed_page();
268	if (!hv_panic_page) {
269		pr_err("Hyper-V: panic message page memory allocation failed\n");
270		return;
271	}
272
273	ret = kmsg_dump_register(&hv_kmsg_dumper);
274	if (ret) {
275		pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret);
276		hv_free_hyperv_page(hv_panic_page);
277		hv_panic_page = NULL;
278	}
279}
280
281int __init hv_common_init(void)
282{
283	int i;
284	union hv_hypervisor_version_info version;
285
286	/* Get information about the Hyper-V host version */
287	if (!hv_get_hypervisor_version(&version))
288		pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
289			version.major_version, version.minor_version,
290			version.build_number, version.service_number,
291			version.service_pack, version.service_branch);
292
293	if (hv_is_isolation_supported())
294		sysctl_record_panic_msg = 0;
295
296	/*
297	 * Hyper-V expects to get crash register data or kmsg when
298	 * crash enlightment is available and system crashes. Set
299	 * crash_kexec_post_notifiers to be true to make sure that
300	 * calling crash enlightment interface before running kdump
301	 * kernel.
302	 */
303	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
304		u64 hyperv_crash_ctl;
305
306		crash_kexec_post_notifiers = true;
307		pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n");
308
309		/*
310		 * Panic message recording (sysctl_record_panic_msg)
311		 * is enabled by default in non-isolated guests and
312		 * disabled by default in isolated guests; the panic
313		 * message recording won't be available in isolated
314		 * guests should the following registration fail.
315		 */
316		hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table);
317		if (!hv_ctl_table_hdr)
318			pr_err("Hyper-V: sysctl table register error");
319
320		/*
321		 * Register for panic kmsg callback only if the right
322		 * capability is supported by the hypervisor.
323		 */
324		hyperv_crash_ctl = hv_get_msr(HV_MSR_CRASH_CTL);
325		if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
326			hv_kmsg_dump_register();
327
328		register_die_notifier(&hyperv_die_report_block);
329		atomic_notifier_chain_register(&panic_notifier_list,
330					       &hyperv_panic_report_block);
331	}
332
333	/*
334	 * Allocate the per-CPU state for the hypercall input arg.
335	 * If this allocation fails, we will not be able to setup
336	 * (per-CPU) hypercall input page and thus this failure is
337	 * fatal on Hyper-V.
338	 */
339	hyperv_pcpu_input_arg = alloc_percpu(void  *);
340	BUG_ON(!hyperv_pcpu_input_arg);
341
342	/* Allocate the per-CPU state for output arg for root */
343	if (hv_root_partition) {
344		hyperv_pcpu_output_arg = alloc_percpu(void *);
345		BUG_ON(!hyperv_pcpu_output_arg);
346	}
347
348	hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
349				    GFP_KERNEL);
350	if (!hv_vp_index) {
351		hv_common_free();
352		return -ENOMEM;
353	}
354
355	for (i = 0; i < num_possible_cpus(); i++)
356		hv_vp_index[i] = VP_INVAL;
357
358	return 0;
359}
360
361void __init ms_hyperv_late_init(void)
362{
363	struct acpi_table_header *header;
364	acpi_status status;
365	u8 *randomdata;
366	u32 length, i;
367
368	/*
369	 * Seed the Linux random number generator with entropy provided by
370	 * the Hyper-V host in ACPI table OEM0.
371	 */
372	if (!IS_ENABLED(CONFIG_ACPI))
373		return;
374
375	status = acpi_get_table("OEM0", 0, &header);
376	if (ACPI_FAILURE(status) || !header)
377		return;
378
379	/*
380	 * Since the "OEM0" table name is for OEM specific usage, verify
381	 * that what we're seeing purports to be from Microsoft.
382	 */
383	if (strncmp(header->oem_table_id, "MICROSFT", 8))
384		goto error;
385
386	/*
387	 * Ensure the length is reasonable. Requiring at least 8 bytes and
388	 * no more than 4K bytes is somewhat arbitrary and just protects
389	 * against a malformed table. Hyper-V currently provides 64 bytes,
390	 * but allow for a change in a later version.
391	 */
392	if (header->length < sizeof(*header) + 8 ||
393	    header->length > sizeof(*header) + SZ_4K)
394		goto error;
395
396	length = header->length - sizeof(*header);
397	randomdata = (u8 *)(header + 1);
398
399	pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n",
400			length);
401
402	add_bootloader_randomness(randomdata, length);
403
404	/*
405	 * To prevent the seed data from being visible in /sys/firmware/acpi,
406	 * zero out the random data in the ACPI table and fixup the checksum.
407	 * The zero'ing is done out of an abundance of caution in avoiding
408	 * potential security risks to the rng. Similarly, reset the table
409	 * length to just the header size so that a subsequent kexec doesn't
410	 * try to use the zero'ed out random data.
411	 */
412	for (i = 0; i < length; i++) {
413		header->checksum += randomdata[i];
414		randomdata[i] = 0;
415	}
416
417	for (i = 0; i < sizeof(header->length); i++)
418		header->checksum += ((u8 *)&header->length)[i];
419	header->length = sizeof(*header);
420	for (i = 0; i < sizeof(header->length); i++)
421		header->checksum -= ((u8 *)&header->length)[i];
422
423error:
424	acpi_put_table(header);
425}
426
427/*
428 * Hyper-V specific initialization and die code for
429 * individual CPUs that is common across all architectures.
430 * Called by the CPU hotplug mechanism.
431 */
432
433int hv_common_cpu_init(unsigned int cpu)
434{
435	void **inputarg, **outputarg;
436	u64 msr_vp_index;
437	gfp_t flags;
438	int pgcount = hv_root_partition ? 2 : 1;
439	void *mem;
440	int ret;
441
442	/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
443	flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
444
445	inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
446
447	/*
448	 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
449	 * allocated if this CPU was previously online and then taken offline
450	 */
451	if (!*inputarg) {
452		mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
453		if (!mem)
454			return -ENOMEM;
455
456		if (hv_root_partition) {
457			outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
458			*outputarg = (char *)mem + HV_HYP_PAGE_SIZE;
459		}
460
461		if (!ms_hyperv.paravisor_present &&
462		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
463			ret = set_memory_decrypted((unsigned long)mem, pgcount);
464			if (ret) {
465				/* It may be unsafe to free 'mem' */
466				return ret;
467			}
468
469			memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE);
470		}
471
472		/*
473		 * In a fully enlightened TDX/SNP VM with more than 64 VPs, if
474		 * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() ->
475		 * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to
476		 * use hyperv_pcpu_input_arg as the hypercall input page, which
477		 * must be a decrypted page in such a VM, but the page is still
478		 * encrypted before set_memory_decrypted() returns. Fix this by
479		 * setting *inputarg after the above set_memory_decrypted(): if
480		 * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns
481		 * HV_STATUS_INVALID_PARAMETER immediately, and the function
482		 * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(),
483		 * which may be slightly slower than the hypercall, but still
484		 * works correctly in such a VM.
485		 */
486		*inputarg = mem;
487	}
488
489	msr_vp_index = hv_get_msr(HV_MSR_VP_INDEX);
490
491	hv_vp_index[cpu] = msr_vp_index;
492
493	if (msr_vp_index > hv_max_vp_index)
494		hv_max_vp_index = msr_vp_index;
495
496	return 0;
497}
498
/*
 * Counterpart of hv_common_cpu_init() for a CPU going offline.
 * Intentionally does nothing and always returns 0; @cpu is unused.
 */
int hv_common_cpu_die(unsigned int cpu)
{
	/*
	 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
	 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
	 * may be used by the Hyper-V vPCI driver in reassigning interrupts
	 * as part of the offlining process.  The interrupt reassignment
	 * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and
	 * called this function.
	 *
	 * If a previously offlined CPU is brought back online again, the
	 * originally allocated memory is reused in hv_common_cpu_init().
	 */

	return 0;
}
515
516/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
517bool hv_query_ext_cap(u64 cap_query)
518{
519	/*
520	 * The address of the 'hv_extended_cap' variable will be used as an
521	 * output parameter to the hypercall below and so it should be
522	 * compatible with 'virt_to_phys'. Which means, it's address should be
523	 * directly mapped. Use 'static' to keep it compatible; stack variables
524	 * can be virtually mapped, making them incompatible with
525	 * 'virt_to_phys'.
526	 * Hypercall input/output addresses should also be 8-byte aligned.
527	 */
528	static u64 hv_extended_cap __aligned(8);
529	static bool hv_extended_cap_queried;
530	u64 status;
531
532	/*
533	 * Querying extended capabilities is an extended hypercall. Check if the
534	 * partition supports extended hypercall, first.
535	 */
536	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
537		return false;
538
539	/* Extended capabilities do not change at runtime. */
540	if (hv_extended_cap_queried)
541		return hv_extended_cap & cap_query;
542
543	status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
544				 &hv_extended_cap);
545
546	/*
547	 * The query extended capabilities hypercall should not fail under
548	 * any normal circumstances. Avoid repeatedly making the hypercall, on
549	 * error.
550	 */
551	hv_extended_cap_queried = true;
552	if (!hv_result_success(status)) {
553		pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
554		       status);
555		return false;
556	}
557
558	return hv_extended_cap & cap_query;
559}
560EXPORT_SYMBOL_GPL(hv_query_ext_cap);
561
/*
 * Configure DMA operations for @dev by forwarding to
 * arch_setup_dma_ops() with @coherent and zeroed remaining arguments.
 */
void hv_setup_dma_ops(struct device *dev, bool coherent)
{
	/*
	 * Hyper-V does not offer a vIOMMU in the guest
	 * VM, so pass 0/NULL for the IOMMU settings
	 */
	arch_setup_dma_ops(dev, 0, 0, coherent);
}
EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
571
572bool hv_is_hibernation_supported(void)
573{
574	return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
575}
576EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
577
578/*
579 * Default function to read the Hyper-V reference counter, independent
580 * of whether Hyper-V enlightened clocks/timers are being used. But on
581 * architectures where it is used, Hyper-V enlightenment code in
582 * hyperv_timer.c may override this function.
583 */
/* Read the reference counter directly from HV_MSR_TIME_REF_COUNT. */
static u64 __hv_read_ref_counter(void)
{
	return hv_get_msr(HV_MSR_TIME_REF_COUNT);
}

/*
 * Function pointer through which the reference counter is read.  It
 * starts out pointing at the MSR-based reader above and is a plain
 * writable pointer, so other code can install a different reader.
 */
u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter;
EXPORT_SYMBOL_GPL(hv_read_reference_counter);
591
592/* These __weak functions provide default "no-op" behavior and
593 * may be overridden by architecture specific versions. Architectures
594 * for which the default "no-op" behavior is sufficient can leave
595 * them unimplemented and not be cluttered with a bunch of stub
596 * functions in arch-specific code.
597 */
598
/* Weak default: reports that isolation is not supported. */
bool __weak hv_is_isolation_supported(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
604
/* Weak default: reports that the isolation type is not SNP. */
bool __weak hv_isolation_type_snp(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
610
/* Weak default: reports that the isolation type is not TDX. */
bool __weak hv_isolation_type_tdx(void)
{
	return false;
}
EXPORT_SYMBOL_GPL(hv_isolation_type_tdx);
616
/* Weak default: @handler is ignored and nothing is installed. */
void __weak hv_setup_vmbus_handler(void (*handler)(void))
{
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);
621
/* Weak default: nothing to remove. */
void __weak hv_remove_vmbus_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
626
/* Weak default: @handler is ignored and nothing is installed. */
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
631
/* Weak default: nothing to remove. */
void __weak hv_remove_kexec_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
636
/* Weak default: @handler is ignored and nothing is installed. */
void __weak hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
{
}
EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
641
/* Weak default: nothing to remove. */
void __weak hv_remove_crash_handler(void)
{
}
EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
646
/* Weak default: no cleanup is performed. */
void __weak hyperv_cleanup(void)
{
}
EXPORT_SYMBOL_GPL(hyperv_cleanup);
651
/*
 * Weak default: no hypercall is made; all arguments are ignored and
 * HV_STATUS_INVALID_PARAMETER is returned.
 */
u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
	return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
657
/*
 * Weak default: no hypercall is made; all arguments are ignored and
 * HV_STATUS_INVALID_PARAMETER is returned.
 */
u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
{
	return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
663