1/*	$NetBSD: xen_machdep.c,v 1.29 2023/10/17 10:24:11 riastradh Exp $	*/
2
3/*
4 * Copyright (c) 2006 Manuel Bouyer.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 */
27
28/*
29 *
30 * Copyright (c) 2004 Christian Limpach.
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
43 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
45 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
46 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
47 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
51 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 */
53
54
55#include <sys/cdefs.h>
56__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.29 2023/10/17 10:24:11 riastradh Exp $");
57
58#include "opt_xen.h"
59
60#include <sys/param.h>
61#include <sys/systm.h>
62#include <sys/boot_flag.h>
63#include <sys/conf.h>
64#include <sys/disk.h>
65#include <sys/device.h>
66#include <sys/mount.h>
67#include <sys/reboot.h>
68#include <sys/timetc.h>
69#include <sys/sysctl.h>
70#include <sys/pmf.h>
71#include <sys/xcall.h>
72
73#include <dev/cons.h>
74
75#include <xen/intr.h>
76#include <xen/hypervisor.h>
77#include <xen/shutdown_xenbus.h>
78#include <xen/include/public/version.h>
79
80#include <machine/pmap_private.h>
81
/*
 * DPRINTK: debug output routed through printk().  The argument is a
 * complete, parenthesized printf-style argument list, e.g.
 * DPRINTK(("msg %d\n", v)).  Change the condition to 0 to compile the
 * messages out.
 */
#if 1
#define DPRINTK(x) printk x
#else
#define DPRINTK(x)
#endif

/*
 * DPRINTF: tracing through printf(), compiled in only when DEBUG_GEOM is
 * defined.  Uses the same double-parenthesis calling convention.
 */
#ifndef DEBUG_GEOM
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
92
93
/*
 * Gate for the machdep.xen.suspend sysctl: the handler refuses with EAGAIN
 * while this is false, and the resume path clears it again.
 * NOTE(review): nothing in this file sets it to true -- presumably done by
 * the code handling dom0's suspend request; confirm against the caller.
 */
bool xen_suspend_allow = false;
95
96void
97xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp)
98{
99	char _cmd_line[256], *cmd_line, *opt, *s;
100	int b, i, ipidx = 0;
101	uint32_t xi_ip[5];
102	size_t len;
103
104	len = strlcpy(_cmd_line, xen_start_info.cmd_line, sizeof(_cmd_line));
105	if (len > sizeof(_cmd_line)) {
106		printf("command line exceeded limit of 255 chars. Truncated.\n");
107	}
108	cmd_line = _cmd_line;
109
110	switch (what) {
111	case XEN_PARSE_BOOTDEV:
112		xcp->xcp_bootdev[0] = 0;
113		break;
114	case XEN_PARSE_CONSOLE:
115		xcp->xcp_console[0] = 0;
116		break;
117	}
118
119	while (cmd_line && *cmd_line) {
120		opt = cmd_line;
121		cmd_line = strchr(opt, ' ');
122		if (cmd_line)
123			*cmd_line = 0;
124
125		switch (what) {
126		case XEN_PARSE_BOOTDEV:
127			if (strncasecmp(opt, "bootdev=", 8) == 0) {
128				strncpy(xcp->xcp_bootdev, opt + 8,
129				    sizeof(xcp->xcp_bootdev));
130				break;
131			}
132			if (strncasecmp(opt, "root=", 5) == 0) {
133				strncpy(xcp->xcp_bootdev, opt + 5,
134				    sizeof(xcp->xcp_bootdev));
135				break;
136			}
137			break;
138
139		case XEN_PARSE_NETINFO:
140			if (xcp->xcp_netinfo.xi_root &&
141			    strncasecmp(opt, "nfsroot=", 8) == 0)
142				strncpy(xcp->xcp_netinfo.xi_root, opt + 8,
143				    MNAMELEN);
144
145			if (strncasecmp(opt, "ip=", 3) == 0) {
146				memset(xi_ip, 0, sizeof(xi_ip));
147				opt += 3;
148				ipidx = 0;
149				while (opt && *opt) {
150					s = opt;
151					opt = strchr(opt, ':');
152					if (opt)
153						*opt = 0;
154
155					switch (ipidx) {
156					case 0:	/* ip */
157					case 1:	/* nfs server */
158					case 2:	/* gw */
159					case 3:	/* mask */
160					case 4:	/* host */
161						if (*s == 0)
162							break;
163						for (i = 0; i < 4; i++) {
164							b = strtoul(s, &s, 10);
165							xi_ip[ipidx] = b + 256
166								* xi_ip[ipidx];
167							if (*s != '.')
168								break;
169							s++;
170						}
171						if (i < 3)
172							xi_ip[ipidx] = 0;
173						break;
174					case 5:	/* interface */
175						if (!strncmp(s, "xennet", 6))
176							s += 6;
177						else if (!strncmp(s, "eth", 3))
178							s += 3;
179						else
180							break;
181						if (xcp->xcp_netinfo.xi_ifno
182						    == strtoul(s, NULL, 10))
183							memcpy(xcp->
184							    xcp_netinfo.xi_ip,
185							    xi_ip,
186							    sizeof(xi_ip));
187						break;
188					}
189					ipidx++;
190
191					if (opt)
192						*opt++ = ':';
193				}
194			}
195			break;
196
197		case XEN_PARSE_CONSOLE:
198			if (strncasecmp(opt, "console=", 8) == 0)
199				strncpy(xcp->xcp_console, opt + 8,
200				    sizeof(xcp->xcp_console));
201			break;
202
203		case XEN_PARSE_BOOTFLAGS:
204			if (*opt == '-') {
205				opt++;
206				while(*opt != '\0') {
207					BOOT_FLAG(*opt, boothowto);
208					opt++;
209				}
210			}
211			break;
212		case XEN_PARSE_PCIBACK:
213			if (strncasecmp(opt, "pciback.hide=", 13) == 0)
214				strncpy(xcp->xcp_pcidevs, opt + 13,
215				    sizeof(xcp->xcp_pcidevs));
216			break;
217		}
218
219		if (cmd_line)
220			*cmd_line++ = ' ';
221	}
222}
223
224#ifdef XENPV
225
226static int sysctl_xen_suspend(SYSCTLFN_ARGS);
227static void xen_suspend_domain(void);
228static void xen_prepare_suspend(void);
229static void xen_prepare_resume(void);
230
231/*
232 * this function sets up the machdep.xen.suspend sysctl(7) that
233 * controls domain suspend/save.
234 */
235void
236sysctl_xen_suspend_setup(void)
237{
238	const struct sysctlnode *node = NULL;
239
240	/*
241	 * dom0 implements sleep support through ACPI. It should not call
242	 * this function to register a suspend interface.
243	 */
244	KASSERT(!(xendomain_is_dom0()));
245
246	sysctl_createv(NULL, 0, NULL, &node,
247	    CTLFLAG_PERMANENT,
248	    CTLTYPE_NODE, "machdep", NULL,
249	    NULL, 0, NULL, 0,
250	    CTL_MACHDEP, CTL_EOL);
251
252	sysctl_createv(NULL, 0, &node, &node,
253	    CTLFLAG_PERMANENT,
254	    CTLTYPE_NODE, "xen",
255	    SYSCTL_DESCR("Xen top level node"),
256	    NULL, 0, NULL, 0,
257	    CTL_CREATE, CTL_EOL);
258
259	sysctl_createv(NULL, 0, &node, &node,
260	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE | CTLFLAG_IMMEDIATE,
261	    CTLTYPE_INT, "suspend",
262	    SYSCTL_DESCR("Suspend/save current Xen domain"),
263	    sysctl_xen_suspend, 0, NULL, 0,
264	    CTL_CREATE, CTL_EOL);
265}
266
267static int
268sysctl_xen_suspend(SYSCTLFN_ARGS)
269{
270	int error;
271	struct sysctlnode node;
272
273	node = *rnode;
274	error = sysctl_lookup(SYSCTLFN_CALL(&node));
275
276	if (error || newp == NULL)
277		return error;
278
279	/* only allow domain to suspend when dom0 instructed to do so */
280	if (xen_suspend_allow == false)
281		return EAGAIN;
282
283	xen_suspend_domain();
284
285	return 0;
286
287}
288
289static void xen_suspendclocks_xc(void *, void*);
290static void xen_resumeclocks_xc(void *, void*);
291
292/*
293 * Last operations before suspending domain
294 */
295static void
296xen_prepare_suspend(void)
297{
298
299	kpreempt_disable();
300
301	pmap_xen_suspend();
302	xc_wait(xc_broadcast(0, &xen_suspendclocks_xc, NULL, NULL));
303
304	/*
305	 * save/restore code does not translate these MFNs to their
306	 * associated PFNs, so we must do it
307	 */
308	xen_start_info.store_mfn =
309	    atop(xpmap_mtop(ptoa(xen_start_info.store_mfn)));
310	xen_start_info.console_mfn =
311	    atop(xpmap_mtop(ptoa(xen_start_info.console_mfn)));
312
313	DPRINTK(("suspending domain\n"));
314	aprint_verbose("suspending domain\n");
315
316	/* invalidate the shared_info page */
317	if (HYPERVISOR_update_va_mapping((vaddr_t)HYPERVISOR_shared_info,
318	    0, UVMF_INVLPG)) {
319		DPRINTK(("HYPERVISOR_shared_info page invalidation failed"));
320		HYPERVISOR_crash();
321	}
322
323}
324
/*
 * Cross-call handler: suspend the clocks of the CPU it runs on, with
 * preemption disabled around the call.  Both arguments are unused.
 */
static void
xen_suspendclocks_xc(void *a, void *b)
{
	(void)a;
	(void)b;

	kpreempt_disable();
	xen_suspendclocks(curcpu());
	kpreempt_enable();
}
333
334/*
335 * First operations before restoring domain context
336 */
337static void
338xen_prepare_resume(void)
339{
340	/* map the new shared_info page */
341	if (HYPERVISOR_update_va_mapping((vaddr_t)HYPERVISOR_shared_info,
342	    xen_start_info.shared_info | PTE_W | PTE_P,
343	    UVMF_INVLPG)) {
344		DPRINTK(("could not map new shared info page"));
345		HYPERVISOR_crash();
346	}
347
348	pmap_xen_resume();
349
350	if (xen_start_info.nr_pages != physmem) {
351		/*
352		 * XXX JYM for now, we crash - fix it with memory
353		 * hotplug when supported
354		 */
355		DPRINTK(("xen_start_info.nr_pages != physmem"));
356		HYPERVISOR_crash();
357	}
358
359	DPRINTK(("preparing domain resume\n"));
360	aprint_verbose("preparing domain resume\n");
361
362	xen_suspend_allow = false;
363
364	xc_wait(xc_broadcast(0, xen_resumeclocks_xc, NULL, NULL));
365
366	kpreempt_enable();
367
368}
369
/*
 * Cross-call handler: resume the clocks of the CPU it runs on, with
 * preemption disabled around the call.  Both arguments are unused.
 */
static void
xen_resumeclocks_xc(void *a, void *b)
{
	(void)a;
	(void)b;

	kpreempt_disable();
	xen_resumeclocks(curcpu());
	kpreempt_enable();
}
378
379static void
380xen_suspend_domain(void)
381{
382	paddr_t mfn;
383	int s = splvm(); /* XXXSMP */
384
385	/*
386	 * console becomes unavailable when suspended, so
387	 * direct communications to domain are hampered from there on.
388	 * We can only rely on low level primitives like printk(), until
389	 * console is fully restored
390	 */
391	if (!pmf_system_suspend(PMF_Q_NONE)) {
392		DPRINTK(("devices suspend failed"));
393		HYPERVISOR_crash();
394	}
395
396	/*
397	 * obtain the MFN of the start_info page now, as we will not be
398	 * able to do it once pmap is locked
399	 */
400	pmap_extract_ma(pmap_kernel(), (vaddr_t)&xen_start_info, &mfn);
401	mfn >>= PAGE_SHIFT;
402
403	xen_prepare_suspend();
404
405	DPRINTK(("calling HYPERVISOR_suspend()\n"));
406	if (HYPERVISOR_suspend(mfn) != 0) {
407	/* XXX JYM: implement checkpoint/snapshot (ret == 1) */
408		DPRINTK(("HYPERVISOR_suspend() failed"));
409		HYPERVISOR_crash();
410	}
411
412	DPRINTK(("left HYPERVISOR_suspend()\n"));
413
414	xen_prepare_resume();
415
416	DPRINTK(("resuming devices\n"));
417	if (!pmf_system_resume(PMF_Q_NONE)) {
418		DPRINTK(("devices resume failed\n"));
419		HYPERVISOR_crash();
420	}
421
422	splx(s);
423
424	/* xencons is back online, we can print to console */
425	aprint_verbose("domain resumed\n");
426
427}
428#endif /* XENPV */
429
430#define PRINTK_BUFSIZE 1024
431void
432printk(const char *fmt, ...)
433{
434	va_list ap;
435	int ret;
436	static char buf[PRINTK_BUFSIZE];
437
438	va_start(ap, fmt);
439	ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
440	va_end(ap);
441	buf[ret] = 0;
442	(void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
443}
444
445static int early_xenconscn_getc(dev_t);
446static void early_xenconscn_putc(dev_t, int);
447static void early_xenconscn_pollc(dev_t, int);
448
449static struct consdev early_xencons = {
450	NULL, NULL,
451	early_xenconscn_getc, early_xenconscn_putc, early_xenconscn_pollc,
452	NULL, NULL, NULL, NODEV, CN_NORMAL
453};
454
455void
456xen_early_console(void)
457{
458	cn_tab = &early_xencons; /* fallback console */
459}
460
/*
 * Input hook of the early console: there is no input path, so this spins
 * forever and never returns.  The trailing return is unreachable and only
 * keeps the function well-formed.
 */
static int
early_xenconscn_getc(dev_t dev)
{
	for (;;)
		continue;
	return -1;
}
468
/*
 * Output hook of the early console: emit the single character c through
 * printk().  dev is not used.
 */
static void
early_xenconscn_putc(dev_t dev, int c)
{

	printk("%c", c);
}
474
/*
 * Polling-mode hook of the early console: nothing to do, both arguments
 * are ignored.
 */
static void
early_xenconscn_pollc(dev_t dev, int on)
{
	(void)dev;
	(void)on;
}
480bool xen_feature_tables[XENFEAT_NR_SUBMAPS * 32];
481
482void
483xen_init_features(void)
484{
485	xen_feature_info_t features;
486
487	for (int sm = 0; sm < XENFEAT_NR_SUBMAPS; sm++) {
488		features.submap_idx = sm;
489		if (HYPERVISOR_xen_version(XENVER_get_features, &features) < 0)
490			break;
491		for (int f = 0; f < 32; f++) {
492			xen_feature_tables[sm * 32 + f] =
493			    (features.submap & (1 << f)) ? 1 : 0;
494		}
495	}
496}
497
498/*
499 * Attempt to find the device from which we were booted.
500 */
501
502static int
503is_valid_disk(device_t dv)
504{
505	if (device_class(dv) != DV_DISK)
506		return (0);
507
508	return (device_is_a(dv, "dk") ||
509		device_is_a(dv, "sd") ||
510		device_is_a(dv, "wd") ||
511		device_is_a(dv, "ld") ||
512		device_is_a(dv, "ed") ||
513		device_is_a(dv, "xbd"));
514}
515
516void
517xen_bootconf(void)
518{
519	device_t dv;
520	deviter_t di;
521	union xen_cmdline_parseinfo xcp;
522	static char bootspecbuf[sizeof(xcp.xcp_bootdev)];
523
524	if (booted_device) {
525		DPRINTF(("%s: preset booted_device: %s\n", __func__, device_xname(booted_device)));
526		return;
527	}
528
529	xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
530
531	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST);
532	     dv != NULL;
533	     dv = deviter_next(&di)) {
534		bool is_ifnet, is_disk;
535		const char *devname;
536
537		is_ifnet = (device_class(dv) == DV_IFNET);
538		is_disk = is_valid_disk(dv);
539		devname = device_xname(dv);
540
541		if (!is_ifnet && !is_disk)
542			continue;
543
544		if (is_disk && xcp.xcp_bootdev[0] == 0) {
545			booted_device = dv;
546			break;
547		}
548
549		if (strncmp(xcp.xcp_bootdev, devname, strlen(devname)))
550			continue;
551
552		if (is_disk && strlen(xcp.xcp_bootdev) > strlen(devname)) {
553			/* XXX check device_cfdata as in x86_autoconf.c? */
554			booted_partition = toupper(
555				xcp.xcp_bootdev[strlen(devname)]) - 'A';
556			DPRINTF(("%s: booted_partition: %d\n", __func__, booted_partition));
557		}
558
559		booted_device = dv;
560		booted_method = "bootinfo/bootdev";
561		break;
562	}
563	deviter_release(&di);
564
565	if (booted_device) {
566		DPRINTF(("%s: booted_device: %s\n", __func__, device_xname(booted_device)));
567		return;
568	}
569
570	/*
571	 * not a boot device name, pass through to MI code
572	 */
573	if (xcp.xcp_bootdev[0] != '\0') {
574		strlcpy(bootspecbuf, xcp.xcp_bootdev, sizeof(bootspecbuf));
575		bootspec = bootspecbuf;
576		booted_method = "bootinfo/bootspec";
577		DPRINTF(("%s: bootspec: %s\n", __func__, bootspec));
578		return;
579	}
580}
581