1/* $NetBSD: xen_machdep.c,v 1.29 2023/10/17 10:24:11 riastradh Exp $ */ 2 3/* 4 * Copyright (c) 2006 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28/* 29 * 30 * Copyright (c) 2004 Christian Limpach. 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 43 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 44 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 45 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 46 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 47 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 51 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 52 */ 53 54 55#include <sys/cdefs.h> 56__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.29 2023/10/17 10:24:11 riastradh Exp $"); 57 58#include "opt_xen.h" 59 60#include <sys/param.h> 61#include <sys/systm.h> 62#include <sys/boot_flag.h> 63#include <sys/conf.h> 64#include <sys/disk.h> 65#include <sys/device.h> 66#include <sys/mount.h> 67#include <sys/reboot.h> 68#include <sys/timetc.h> 69#include <sys/sysctl.h> 70#include <sys/pmf.h> 71#include <sys/xcall.h> 72 73#include <dev/cons.h> 74 75#include <xen/intr.h> 76#include <xen/hypervisor.h> 77#include <xen/shutdown_xenbus.h> 78#include <xen/include/public/version.h> 79 80#include <machine/pmap_private.h> 81 82#define DPRINTK(x) printk x 83#if 0 84#define DPRINTK(x) 85#endif 86 87#ifdef DEBUG_GEOM 88#define DPRINTF(a) printf a 89#else 90#define DPRINTF(a) 91#endif 92 93 94bool xen_suspend_allow; 95 96void 97xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp) 98{ 99 char _cmd_line[256], *cmd_line, *opt, *s; 100 int b, i, ipidx = 0; 101 uint32_t xi_ip[5]; 102 size_t len; 103 104 len = strlcpy(_cmd_line, xen_start_info.cmd_line, sizeof(_cmd_line)); 105 if (len > sizeof(_cmd_line)) { 106 printf("command line exceeded limit of 255 chars. Truncated.\n"); 107 } 108 cmd_line = _cmd_line; 109 110 switch (what) { 111 case XEN_PARSE_BOOTDEV: 112 xcp->xcp_bootdev[0] = 0; 113 break; 114 case XEN_PARSE_CONSOLE: 115 xcp->xcp_console[0] = 0; 116 break; 117 } 118 119 while (cmd_line && *cmd_line) { 120 opt = cmd_line; 121 cmd_line = strchr(opt, ' '); 122 if (cmd_line) 123 *cmd_line = 0; 124 125 switch (what) { 126 case XEN_PARSE_BOOTDEV: 127 if (strncasecmp(opt, "bootdev=", 8) == 0) { 128 strncpy(xcp->xcp_bootdev, opt + 8, 129 sizeof(xcp->xcp_bootdev)); 130 break; 131 } 132 if (strncasecmp(opt, "root=", 5) == 0) { 133 strncpy(xcp->xcp_bootdev, opt + 5, 134 sizeof(xcp->xcp_bootdev)); 135 break; 136 } 137 break; 138 139 case XEN_PARSE_NETINFO: 140 if (xcp->xcp_netinfo.xi_root && 141 strncasecmp(opt, "nfsroot=", 8) == 0) 142 strncpy(xcp->xcp_netinfo.xi_root, opt + 8, 143 MNAMELEN); 144 145 if (strncasecmp(opt, "ip=", 3) == 0) { 146 memset(xi_ip, 0, sizeof(xi_ip)); 147 opt += 3; 148 ipidx = 0; 149 while (opt && *opt) { 150 s = opt; 151 opt = strchr(opt, ':'); 152 if (opt) 153 *opt = 0; 154 155 switch (ipidx) { 156 case 0: /* ip */ 157 case 1: /* nfs server */ 158 case 2: /* gw */ 159 case 3: /* mask */ 160 case 4: /* host */ 161 if (*s == 0) 162 break; 163 for (i = 0; i < 4; i++) { 164 b = strtoul(s, &s, 10); 165 xi_ip[ipidx] = b + 256 166 * xi_ip[ipidx]; 167 if (*s != '.') 168 break; 169 s++; 170 } 171 if (i < 3) 172 xi_ip[ipidx] = 0; 173 break; 174 case 5: /* interface */ 175 if (!strncmp(s, "xennet", 6)) 176 s += 6; 177 else if (!strncmp(s, "eth", 3)) 178 s += 3; 179 else 180 break; 181 if (xcp->xcp_netinfo.xi_ifno 182 == strtoul(s, NULL, 10)) 183 memcpy(xcp-> 184 xcp_netinfo.xi_ip, 185 xi_ip, 186 sizeof(xi_ip)); 187 break; 188 } 189 ipidx++; 190 191 if (opt) 192 *opt++ = ':'; 193 } 194 } 195 break; 196 197 case XEN_PARSE_CONSOLE: 198 if (strncasecmp(opt, "console=", 8) == 0) 199 strncpy(xcp->xcp_console, opt + 8, 200 sizeof(xcp->xcp_console)); 201 break; 202 203 case XEN_PARSE_BOOTFLAGS: 204 if (*opt == '-') { 205 opt++; 206 while(*opt != '\0') { 207 BOOT_FLAG(*opt, boothowto); 208 opt++; 209 } 210 } 211 break; 212 case XEN_PARSE_PCIBACK: 213 if (strncasecmp(opt, "pciback.hide=", 13) == 0) 214 strncpy(xcp->xcp_pcidevs, opt + 13, 215 sizeof(xcp->xcp_pcidevs)); 216 break; 217 } 218 219 if (cmd_line) 220 *cmd_line++ = ' '; 221 } 222} 223 224#ifdef XENPV 225 226static int sysctl_xen_suspend(SYSCTLFN_ARGS); 227static void xen_suspend_domain(void); 228static void xen_prepare_suspend(void); 229static void xen_prepare_resume(void); 230 231/* 232 * this function sets up the machdep.xen.suspend sysctl(7) that 233 * controls domain suspend/save. 234 */ 235void 236sysctl_xen_suspend_setup(void) 237{ 238 const struct sysctlnode *node = NULL; 239 240 /* 241 * dom0 implements sleep support through ACPI. It should not call 242 * this function to register a suspend interface. 243 */ 244 KASSERT(!(xendomain_is_dom0())); 245 246 sysctl_createv(NULL, 0, NULL, &node, 247 CTLFLAG_PERMANENT, 248 CTLTYPE_NODE, "machdep", NULL, 249 NULL, 0, NULL, 0, 250 CTL_MACHDEP, CTL_EOL); 251 252 sysctl_createv(NULL, 0, &node, &node, 253 CTLFLAG_PERMANENT, 254 CTLTYPE_NODE, "xen", 255 SYSCTL_DESCR("Xen top level node"), 256 NULL, 0, NULL, 0, 257 CTL_CREATE, CTL_EOL); 258 259 sysctl_createv(NULL, 0, &node, &node, 260 CTLFLAG_PERMANENT | CTLFLAG_READWRITE | CTLFLAG_IMMEDIATE, 261 CTLTYPE_INT, "suspend", 262 SYSCTL_DESCR("Suspend/save current Xen domain"), 263 sysctl_xen_suspend, 0, NULL, 0, 264 CTL_CREATE, CTL_EOL); 265} 266 267static int 268sysctl_xen_suspend(SYSCTLFN_ARGS) 269{ 270 int error; 271 struct sysctlnode node; 272 273 node = *rnode; 274 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 275 276 if (error || newp == NULL) 277 return error; 278 279 /* only allow domain to suspend when dom0 instructed to do so */ 280 if (xen_suspend_allow == false) 281 return EAGAIN; 282 283 xen_suspend_domain(); 284 285 return 0; 286 287} 288 289static void xen_suspendclocks_xc(void *, void*); 290static void xen_resumeclocks_xc(void *, void*); 291 292/* 293 * Last operations before suspending domain 294 */ 295static void 296xen_prepare_suspend(void) 297{ 298 299 kpreempt_disable(); 300 301 pmap_xen_suspend(); 302 xc_wait(xc_broadcast(0, &xen_suspendclocks_xc, NULL, NULL)); 303 304 /* 305 * save/restore code does not translate these MFNs to their 306 * associated PFNs, so we must do it 307 */ 308 xen_start_info.store_mfn = 309 atop(xpmap_mtop(ptoa(xen_start_info.store_mfn))); 310 xen_start_info.console_mfn = 311 atop(xpmap_mtop(ptoa(xen_start_info.console_mfn))); 312 313 DPRINTK(("suspending domain\n")); 314 aprint_verbose("suspending domain\n"); 315 316 /* invalidate the shared_info page */ 317 if (HYPERVISOR_update_va_mapping((vaddr_t)HYPERVISOR_shared_info, 318 0, UVMF_INVLPG)) { 319 DPRINTK(("HYPERVISOR_shared_info page invalidation failed")); 320 HYPERVISOR_crash(); 321 } 322 323} 324 325static void 326xen_suspendclocks_xc(void *a, void *b) 327{ 328 329 kpreempt_disable(); 330 xen_suspendclocks(curcpu()); 331 kpreempt_enable(); 332} 333 334/* 335 * First operations before restoring domain context 336 */ 337static void 338xen_prepare_resume(void) 339{ 340 /* map the new shared_info page */ 341 if (HYPERVISOR_update_va_mapping((vaddr_t)HYPERVISOR_shared_info, 342 xen_start_info.shared_info | PTE_W | PTE_P, 343 UVMF_INVLPG)) { 344 DPRINTK(("could not map new shared info page")); 345 HYPERVISOR_crash(); 346 } 347 348 pmap_xen_resume(); 349 350 if (xen_start_info.nr_pages != physmem) { 351 /* 352 * XXX JYM for now, we crash - fix it with memory 353 * hotplug when supported 354 */ 355 DPRINTK(("xen_start_info.nr_pages != physmem")); 356 HYPERVISOR_crash(); 357 } 358 359 DPRINTK(("preparing domain resume\n")); 360 aprint_verbose("preparing domain resume\n"); 361 362 xen_suspend_allow = false; 363 364 xc_wait(xc_broadcast(0, xen_resumeclocks_xc, NULL, NULL)); 365 366 kpreempt_enable(); 367 368} 369 370static void 371xen_resumeclocks_xc(void *a, void *b) 372{ 373 374 kpreempt_disable(); 375 xen_resumeclocks(curcpu()); 376 kpreempt_enable(); 377} 378 379static void 380xen_suspend_domain(void) 381{ 382 paddr_t mfn; 383 int s = splvm(); /* XXXSMP */ 384 385 /* 386 * console becomes unavailable when suspended, so 387 * direct communications to domain are hampered from there on. 388 * We can only rely on low level primitives like printk(), until 389 * console is fully restored 390 */ 391 if (!pmf_system_suspend(PMF_Q_NONE)) { 392 DPRINTK(("devices suspend failed")); 393 HYPERVISOR_crash(); 394 } 395 396 /* 397 * obtain the MFN of the start_info page now, as we will not be 398 * able to do it once pmap is locked 399 */ 400 pmap_extract_ma(pmap_kernel(), (vaddr_t)&xen_start_info, &mfn); 401 mfn >>= PAGE_SHIFT; 402 403 xen_prepare_suspend(); 404 405 DPRINTK(("calling HYPERVISOR_suspend()\n")); 406 if (HYPERVISOR_suspend(mfn) != 0) { 407 /* XXX JYM: implement checkpoint/snapshot (ret == 1) */ 408 DPRINTK(("HYPERVISOR_suspend() failed")); 409 HYPERVISOR_crash(); 410 } 411 412 DPRINTK(("left HYPERVISOR_suspend()\n")); 413 414 xen_prepare_resume(); 415 416 DPRINTK(("resuming devices\n")); 417 if (!pmf_system_resume(PMF_Q_NONE)) { 418 DPRINTK(("devices resume failed\n")); 419 HYPERVISOR_crash(); 420 } 421 422 splx(s); 423 424 /* xencons is back online, we can print to console */ 425 aprint_verbose("domain resumed\n"); 426 427} 428#endif /* XENPV */ 429 430#define PRINTK_BUFSIZE 1024 431void 432printk(const char *fmt, ...) 433{ 434 va_list ap; 435 int ret; 436 static char buf[PRINTK_BUFSIZE]; 437 438 va_start(ap, fmt); 439 ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap); 440 va_end(ap); 441 buf[ret] = 0; 442 (void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf); 443} 444 445static int early_xenconscn_getc(dev_t); 446static void early_xenconscn_putc(dev_t, int); 447static void early_xenconscn_pollc(dev_t, int); 448 449static struct consdev early_xencons = { 450 NULL, NULL, 451 early_xenconscn_getc, early_xenconscn_putc, early_xenconscn_pollc, 452 NULL, NULL, NULL, NODEV, CN_NORMAL 453}; 454 455void 456xen_early_console(void) 457{ 458 cn_tab = &early_xencons; /* fallback console */ 459} 460 461static int 462early_xenconscn_getc(dev_t dev) 463{ 464 while(1) 465 ; 466 return -1; 467} 468 469static void 470early_xenconscn_putc(dev_t dev, int c) 471{ 472 printk("%c", c); 473} 474 475static void 476early_xenconscn_pollc(dev_t dev, int on) 477{ 478 return; 479} 480bool xen_feature_tables[XENFEAT_NR_SUBMAPS * 32]; 481 482void 483xen_init_features(void) 484{ 485 xen_feature_info_t features; 486 487 for (int sm = 0; sm < XENFEAT_NR_SUBMAPS; sm++) { 488 features.submap_idx = sm; 489 if (HYPERVISOR_xen_version(XENVER_get_features, &features) < 0) 490 break; 491 for (int f = 0; f < 32; f++) { 492 xen_feature_tables[sm * 32 + f] = 493 (features.submap & (1 << f)) ? 1 : 0; 494 } 495 } 496} 497 498/* 499 * Attempt to find the device from which we were booted. 500 */ 501 502static int 503is_valid_disk(device_t dv) 504{ 505 if (device_class(dv) != DV_DISK) 506 return (0); 507 508 return (device_is_a(dv, "dk") || 509 device_is_a(dv, "sd") || 510 device_is_a(dv, "wd") || 511 device_is_a(dv, "ld") || 512 device_is_a(dv, "ed") || 513 device_is_a(dv, "xbd")); 514} 515 516void 517xen_bootconf(void) 518{ 519 device_t dv; 520 deviter_t di; 521 union xen_cmdline_parseinfo xcp; 522 static char bootspecbuf[sizeof(xcp.xcp_bootdev)]; 523 524 if (booted_device) { 525 DPRINTF(("%s: preset booted_device: %s\n", __func__, device_xname(booted_device))); 526 return; 527 } 528 529 xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); 530 531 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); 532 dv != NULL; 533 dv = deviter_next(&di)) { 534 bool is_ifnet, is_disk; 535 const char *devname; 536 537 is_ifnet = (device_class(dv) == DV_IFNET); 538 is_disk = is_valid_disk(dv); 539 devname = device_xname(dv); 540 541 if (!is_ifnet && !is_disk) 542 continue; 543 544 if (is_disk && xcp.xcp_bootdev[0] == 0) { 545 booted_device = dv; 546 break; 547 } 548 549 if (strncmp(xcp.xcp_bootdev, devname, strlen(devname))) 550 continue; 551 552 if (is_disk && strlen(xcp.xcp_bootdev) > strlen(devname)) { 553 /* XXX check device_cfdata as in x86_autoconf.c? */ 554 booted_partition = toupper( 555 xcp.xcp_bootdev[strlen(devname)]) - 'A'; 556 DPRINTF(("%s: booted_partition: %d\n", __func__, booted_partition)); 557 } 558 559 booted_device = dv; 560 booted_method = "bootinfo/bootdev"; 561 break; 562 } 563 deviter_release(&di); 564 565 if (booted_device) { 566 DPRINTF(("%s: booted_device: %s\n", __func__, device_xname(booted_device))); 567 return; 568 } 569 570 /* 571 * not a boot device name, pass through to MI code 572 */ 573 if (xcp.xcp_bootdev[0] != '\0') { 574 strlcpy(bootspecbuf, xcp.xcp_bootdev, sizeof(bootspecbuf)); 575 bootspec = bootspecbuf; 576 booted_method = "bootinfo/bootspec"; 577 DPRINTF(("%s: bootspec: %s\n", __func__, bootspec)); 578 return; 579 } 580} 581