nvme_ctrlr.c revision 367145
/*-
 * Copyright (C) 2012-2016 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/nvme/nvme_ctrlr.c 367145 2020-10-29 22:00:15Z brooks $");

#include "opt_cam.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/uio.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

#define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */

static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
    struct nvme_async_event_request *aer);
static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr);

static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

	ctrlr->resource_id = PCIR_BAR(0);

	ctrlr->resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY,
	    &ctrlr->resource_id, RF_ACTIVE);

	if (ctrlr->resource == NULL) {
		nvme_printf(ctrlr, "unable to allocate pci resource\n");
		return (ENOMEM);
	}

	ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
	ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
	ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

	/*
	 * The NVMe spec allows for the MSI-X table to be placed behind
	 * BAR 4/5, separate from the control/doorbell registers. Always
	 * try to map this bar, because it must be mapped prior to calling
	 * pci_alloc_msix(). If the table isn't behind BAR 4/5,
	 * bus_alloc_resource() will just return NULL, which is OK.
	 */
	ctrlr->bar4_resource_id = PCIR_BAR(4);
	ctrlr->bar4_resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY,
	    &ctrlr->bar4_resource_id, RF_ACTIVE);

	return (0);
}

static int
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	uint32_t		num_entries;
	int			error;

	qpair = &ctrlr->adminq;

	num_entries = NVME_ADMIN_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
	/*
	 * If admin_entries was overridden to an invalid value, revert it
	 * back to our default value.
	 */
	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
		nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
		    "specified\n", num_entries);
		num_entries = NVME_ADMIN_ENTRIES;
	}

	/*
	 * The admin queue's max xfer size is treated differently than the
	 * max I/O xfer size. 16KB is sufficient here - maybe even less?
	 */
	error = nvme_qpair_construct(qpair,
	    0, /* qpair ID */
	    0, /* vector */
	    num_entries,
	    NVME_ADMIN_TRACKERS,
	    ctrlr);
	return (error);
}

static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	union cap_lo_register	cap_lo;
	int			i, error, num_entries, num_trackers;

	num_entries = NVME_IO_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

	/*
	 * NVMe spec sets a hard limit of 64K max entries, but
	 * devices may specify a smaller limit, so we need to check
	 * the MQES field in the capabilities register.
	 */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	num_entries = min(num_entries, cap_lo.bits.mqes + 1);

	num_trackers = NVME_IO_TRACKERS;
	TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);

	num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
	num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
	/*
	 * No need to have more trackers than entries in the submit queue.
	 * Note also that for a queue size of N, we can only have (N-1)
	 * commands outstanding, hence the "-1" here.
	 */
	num_trackers = min(num_trackers, (num_entries - 1));

	/*
	 * Our best estimate for the maximum number of I/Os that we should
	 * normally have in flight at one time. This should be viewed as a
	 * hint, not a hard limit, and will need to be revisited when the
	 * upper layers of the storage system grow multi-queue support.
	 */
	ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;

	/*
	 * This was calculated previously when setting up interrupts, but
	 * a controller could theoretically support fewer I/O queues than
	 * MSI-X vectors. So calculate again here just to be safe.
	 */
	ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues);

	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
	    M_NVME, M_ZERO | M_WAITOK);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		/*
		 * Admin queue has ID=0. IO queues start at ID=1 -
		 * hence the 'i+1' here.
		 *
		 * For I/O queues, use the controller-wide max_xfer_size
		 * calculated in nvme_attach().
		 */
		error = nvme_qpair_construct(qpair,
		    i+1, /* qpair ID */
		    ctrlr->msix_enabled ? i+1 : 0, /* vector */
		    num_entries,
		    num_trackers,
		    ctrlr);
		if (error)
			return (error);

		/*
		 * Do not bother binding interrupts if we only have one I/O
		 * interrupt thread for this controller.
		 */
		if (ctrlr->num_io_queues > 1)
			bus_bind_intr(ctrlr->dev, qpair->res,
			    i * ctrlr->num_cpus_per_ioq);
	}

	return (0);
}

static void
nvme_ctrlr_fail(struct nvme_controller *ctrlr)
{
	int i;

	ctrlr->is_failed = TRUE;
	nvme_admin_qpair_disable(&ctrlr->adminq);
	nvme_qpair_fail(&ctrlr->adminq);
	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++) {
			nvme_io_qpair_disable(&ctrlr->ioq[i]);
			nvme_qpair_fail(&ctrlr->ioq[i]);
		}
	}
	nvme_notify_fail_consumers(ctrlr);
}

void
nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

	mtx_lock(&ctrlr->lock);
	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
	mtx_unlock(&ctrlr->lock);
	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
}

static void
nvme_ctrlr_fail_req_task(void *arg, int pending)
{
	struct nvme_controller	*ctrlr = arg;
	struct nvme_request	*req;

	mtx_lock(&ctrlr->lock);
	while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
		mtx_unlock(&ctrlr->lock);
		nvme_qpair_manual_complete_request(req->qpair, req,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
		mtx_lock(&ctrlr->lock);
	}
	mtx_unlock(&ctrlr->lock);
}

static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
{
	int ms_waited;
	union csts_register csts;

	ms_waited = 0;
	while (1) {
		csts.raw = nvme_mmio_read_4(ctrlr, csts);
		if (csts.raw == 0xffffffff)	/* Hot unplug. */
			return (ENXIO);
		if (csts.bits.rdy == desired_val)
			break;
		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
			nvme_printf(ctrlr, "controller ready did not become %d "
			    "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
			return (ENXIO);
		}
		DELAY(1000);
	}

	return (0);
}

static int
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	int			err;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	/*
	 * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1
	 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when
	 * CSTS.RDY is 0 "has undefined results". So make sure that CSTS.RDY
	 * isn't the desired value. Short circuit if we're already disabled.
	 */
	if (cc.bits.en == 1) {
		if (csts.bits.rdy == 0) {
			/* EN == 1, wait for RDY == 1 or fail */
			err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
			if (err != 0)
				return (err);
		}
	} else {
		/* EN == 0 already; wait for RDY == 0 */
		if (csts.bits.rdy == 0)
			return (0);
		else
			return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
	}

	cc.bits.en = 0;
	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	/*
	 * Some drives have issues with accessing the mmio after we
	 * disable, so delay for a bit after we write the bit to
	 * cope with these issues.
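	 * For devices flagged with QUIRK_DELAY_B4_CHK_RDY, the pause below
	 * lasts B4_CHK_RDY_DELAY_MS (2.3 seconds) before CSTS.RDY is polled.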
	 */
	if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
		pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000);
	return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
}

static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	union aqa_register	aqa;
	int			err;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	/*
	 * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
	 */
	if (cc.bits.en == 1) {
		if (csts.bits.rdy == 1)
			return (0);
		else
			return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
	} else {
		/* EN == 0 already; wait for RDY == 0 or fail */
		err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
		if (err != 0)
			return (err);
	}

	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
	DELAY(5000);
	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
	DELAY(5000);

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = ctrlr->adminq.num_entries - 1;
	aqa.bits.asqs = ctrlr->adminq.num_entries - 1;
	nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
	DELAY(5000);

	cc.bits.en = 1;
	cc.bits.css = 0;
	cc.bits.ams = 0;
	cc.bits.shn = 0;
	cc.bits.iosqes = 6;	/* SQ entry size == 64 == 2^6 */
	cc.bits.iocqes = 4;	/* CQ entry size == 16 == 2^4 */

	/* This evaluates to 0, which per the spec selects a 4KB memory page size. */
	cc.bits.mps = (PAGE_SIZE >> 13);

	nvme_mmio_write_4(ctrlr, cc, cc.raw);

	return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
}

int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
	int i, err;

	nvme_admin_qpair_disable(&ctrlr->adminq);
	/*
	 * I/O queues are not allocated before the initial HW
	 * reset, so do not try to disable them. Use is_initialized
	 * to determine if this is the initial HW reset.
	 */
	if (ctrlr->is_initialized) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			nvme_io_qpair_disable(&ctrlr->ioq[i]);
	}

	DELAY(100*1000);

	err = nvme_ctrlr_disable(ctrlr);
	if (err != 0)
		return err;
	return (nvme_ctrlr_enable(ctrlr));
}

void
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{
	int cmpset;

	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);

	if (cmpset == 0 || ctrlr->is_failed)
		/*
		 * Controller is already resetting or has failed. Return
		 * immediately since there is no need to kick off another
		 * reset in these cases.
		 */
		return;

	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
}

static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
	struct nvme_completion_poll_status	status;

	status.done = 0;
	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
	    nvme_completion_poll_cb, &status);
	while (!atomic_load_acq_int(&status.done))
		pause("nvme", 1);
	if (nvme_completion_is_error(&status.cpl)) {
		nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
		return (ENXIO);
	}

	/*
	 * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
	 * controller supports.
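	 * MDTS is a power-of-two multiple of the minimum page size, so e.g.
	 * mdts == 5 with a 4KB minimum page size caps transfers at
	 * 4KB * 2^5 = 128KB.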
	 */
	if (ctrlr->cdata.mdts > 0)
		ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
		    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));

	return (0);
}

static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_completion_poll_status	status;
	int					cq_allocated, sq_allocated;

	status.done = 0;
	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
	    nvme_completion_poll_cb, &status);
	while (!atomic_load_acq_int(&status.done))
		pause("nvme", 1);
	if (nvme_completion_is_error(&status.cpl)) {
		nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
		return (ENXIO);
	}

	/*
	 * Data in cdw0 is 0-based.
	 * Lower 16-bits indicate number of submission queues allocated.
	 * Upper 16-bits indicate number of completion queues allocated.
	 */
	sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
	cq_allocated = (status.cpl.cdw0 >> 16) + 1;

	/*
	 * Controller may allocate more queues than we requested,
	 * so use the minimum of the number requested and what was
	 * actually allocated.
	 */
	ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
	ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);

	return (0);
}

static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_completion_poll_status	status;
	struct nvme_qpair			*qpair;
	int					i;

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		status.done = 0;
		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
		    nvme_completion_poll_cb, &status);
		while (!atomic_load_acq_int(&status.done))
			pause("nvme", 1);
		if (nvme_completion_is_error(&status.cpl)) {
			nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
			return (ENXIO);
		}

		status.done = 0;
		nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
		    nvme_completion_poll_cb, &status);
		while (!atomic_load_acq_int(&status.done))
			pause("nvme", 1);
		if (nvme_completion_is_error(&status.cpl)) {
			nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
			return (ENXIO);
		}
	}

	return (0);
}

static int
nvme_ctrlr_destroy_qpair(struct nvme_controller *ctrlr, struct nvme_qpair *qpair)
{
	struct nvme_completion_poll_status	status;

	status.done = 0;
	nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
	    nvme_completion_poll_cb, &status);
	while (!atomic_load_acq_int(&status.done))
		pause("nvme", 1);
	if (nvme_completion_is_error(&status.cpl)) {
		nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
		return (ENXIO);
	}

	status.done = 0;
	nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
	    nvme_completion_poll_cb, &status);
	while (!atomic_load_acq_int(&status.done))
		pause("nvme", 1);
	if (nvme_completion_is_error(&status.cpl)) {
		nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
		return (ENXIO);
	}

	return (0);
}

static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
	struct nvme_namespace	*ns;
	uint32_t		i;

	for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
		ns = &ctrlr->ns[i];
		nvme_ns_construct(ns, i+1, ctrlr);
	}

	return (0);
}

static boolean_t
is_log_page_id_valid(uint8_t page_id)
{

	switch (page_id) {
	case NVME_LOG_ERROR:
	case NVME_LOG_HEALTH_INFORMATION:
	case NVME_LOG_FIRMWARE_SLOT:
		return (TRUE);
	}

	return (FALSE);
}

static uint32_t
nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
{
	uint32_t	log_page_size;

	switch (page_id) {
	case NVME_LOG_ERROR:
		log_page_size = min(
		    sizeof(struct nvme_error_information_entry) *
		    ctrlr->cdata.elpe,
		    NVME_MAX_AER_LOG_SIZE);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		log_page_size = sizeof(struct nvme_health_information_page);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		log_page_size = sizeof(struct nvme_firmware_page);
		break;
	default:
		log_page_size = 0;
		break;
	}

	return (log_page_size);
}

static void
nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
    union nvme_critical_warning_state state)
{

	if (state.bits.available_spare == 1)
		nvme_printf(ctrlr, "available spare space below threshold\n");

	if (state.bits.temperature == 1)
		nvme_printf(ctrlr, "temperature above threshold\n");

	if (state.bits.device_reliability == 1)
		nvme_printf(ctrlr, "device reliability degraded\n");

	if (state.bits.read_only == 1)
		nvme_printf(ctrlr, "media placed in read only mode\n");

	if (state.bits.volatile_memory_backup == 1)
		nvme_printf(ctrlr, "volatile memory backup device failed\n");

	if (state.bits.reserved != 0)
		nvme_printf(ctrlr,
		    "unknown critical warning(s): state = 0x%02x\n", state.raw);
}

static void
nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
{
	struct nvme_async_event_request		*aer = arg;
	struct nvme_health_information_page	*health_info;

	/*
	 * If the log page fetch for some reason completed with an error,
	 * don't pass log page data to the consumers. In practice, this case
	 * should never happen.
	 */
	if (nvme_completion_is_error(cpl))
		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
		    aer->log_page_id, NULL, 0);
	else {
		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
			health_info = (struct nvme_health_information_page *)
			    aer->log_page_buffer;
			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
			    health_info->critical_warning);
			/*
			 * Critical warnings reported through the
			 * SMART/health log page are persistent, so
			 * clear the associated bits in the async event
			 * config so that we do not receive repeated
			 * notifications for the same event.
			 */
			aer->ctrlr->async_event_config.raw &=
			    ~health_info->critical_warning.raw;
			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
			    aer->ctrlr->async_event_config, NULL, NULL);
		}

		/*
		 * Pass the cpl data from the original async event completion,
		 * not the log page fetch.
		 */
		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
	}

	/*
	 * Repost another asynchronous event request to replace the one
	 * that just completed.
	 */
	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
}

static void
nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
{
	struct nvme_async_event_request	*aer = arg;

	if (nvme_completion_is_error(cpl)) {
		/*
		 * Do not retry failed async event requests. This avoids
		 * infinite loops where a new async event request is submitted
		 * to replace the one just failed, only to fail again and
		 * perpetuate the loop.
		 */
		return;
	}

	/* Associated log page is in bits 23:16 of completion entry dw0. */
	aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;

	nvme_printf(aer->ctrlr, "async event occurred (log page id=0x%x)\n",
	    aer->log_page_id);

	if (is_log_page_id_valid(aer->log_page_id)) {
		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
		    aer->log_page_id);
		memcpy(&aer->cpl, cpl, sizeof(*cpl));
		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
		    aer);
		/* Wait to notify consumers until after log page is fetched. */
	} else {
		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
		    NULL, 0);

		/*
		 * Repost another asynchronous event request to replace the one
		 * that just completed.
		 */
		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
	}
}

static void
nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
    struct nvme_async_event_request *aer)
{
	struct nvme_request *req;

	aer->ctrlr = ctrlr;
	req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
	aer->req = req;

	/*
	 * Disable timeout here, since asynchronous event requests should by
	 * nature never be timed out.
	 */
	req->timeout = FALSE;
	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
	nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
	struct nvme_completion_poll_status	status;
	struct nvme_async_event_request		*aer;
	uint32_t				i;

	ctrlr->async_event_config.raw = 0xFF;
	ctrlr->async_event_config.bits.reserved = 0;

	status.done = 0;
	nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
	    0, NULL, 0, nvme_completion_poll_cb, &status);
	while (!atomic_load_acq_int(&status.done))
		pause("nvme", 1);
	if (nvme_completion_is_error(&status.cpl) ||
	    (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
	    (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
		nvme_printf(ctrlr, "temperature threshold not supported\n");
		ctrlr->async_event_config.bits.temperature = 0;
	}

	nvme_ctrlr_cmd_set_async_event_config(ctrlr,
	    ctrlr->async_event_config, NULL, NULL);

	/* aerl is a zero-based value, so we need to add 1 here. */
	ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

	for (i = 0; i < ctrlr->num_aers; i++) {
		aer = &ctrlr->aer[i];
		nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
	}
}

static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

	ctrlr->int_coal_time = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
	    &ctrlr->int_coal_time);

	ctrlr->int_coal_threshold = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
	    &ctrlr->int_coal_threshold);

	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
	    ctrlr->int_coal_threshold, NULL, NULL);
}

static void
nvme_ctrlr_start(void *ctrlr_arg)
{
	struct nvme_controller *ctrlr = ctrlr_arg;
	uint32_t old_num_io_queues;
	int i;

	/*
	 * Only reset adminq here when we are restarting the
	 * controller after a reset. During initialization,
	 * we have already submitted admin commands to get
	 * the number of I/O queues supported, so cannot reset
	 * the adminq again here.
	 */
	if (ctrlr->is_resetting) {
		nvme_qpair_reset(&ctrlr->adminq);
	}

	for (i = 0; i < ctrlr->num_io_queues; i++)
		nvme_qpair_reset(&ctrlr->ioq[i]);

	nvme_admin_qpair_enable(&ctrlr->adminq);

	if (nvme_ctrlr_identify(ctrlr) != 0) {
		nvme_ctrlr_fail(ctrlr);
		return;
	}

	/*
	 * The number of qpairs is determined during controller initialization,
	 * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
	 * HW limit. We call SET_FEATURES again here so that it gets called
	 * after any reset for controllers that depend on the driver to
	 * explicitly specify how many queues it will use. This value should
	 * never change between resets, so panic if somehow that does happen.
	 */
	if (ctrlr->is_resetting) {
		old_num_io_queues = ctrlr->num_io_queues;
		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
			nvme_ctrlr_fail(ctrlr);
			return;
		}

		if (old_num_io_queues != ctrlr->num_io_queues) {
			panic("num_io_queues changed from %u to %u",
			    old_num_io_queues, ctrlr->num_io_queues);
		}
	}

	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
		nvme_ctrlr_fail(ctrlr);
		return;
	}

	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
		nvme_ctrlr_fail(ctrlr);
		return;
	}

	nvme_ctrlr_configure_aer(ctrlr);
	nvme_ctrlr_configure_int_coalescing(ctrlr);

	for (i = 0; i < ctrlr->num_io_queues; i++)
		nvme_io_qpair_enable(&ctrlr->ioq[i]);
}

void
nvme_ctrlr_start_config_hook(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	nvme_qpair_reset(&ctrlr->adminq);
	nvme_admin_qpair_enable(&ctrlr->adminq);

	if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
		nvme_ctrlr_start(ctrlr);
	else
		nvme_ctrlr_fail(ctrlr);

	nvme_sysctl_initialize_ctrlr(ctrlr);
	config_intrhook_disestablish(&ctrlr->config_hook);

	ctrlr->is_initialized = 1;
	nvme_notify_new_controller(ctrlr);
}

static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
	struct nvme_controller	*ctrlr = arg;
	int			status;

	nvme_printf(ctrlr, "resetting controller\n");
	status = nvme_ctrlr_hw_reset(ctrlr);
	/*
	 * Use pause instead of DELAY, so that we yield to any nvme interrupt
	 * handlers on this CPU that were blocked on a qpair lock. We want
	 * all nvme interrupts completed before proceeding with restarting the
	 * controller.
	 *
	 * XXX - any way to guarantee the interrupt handlers have quiesced?
	 */
	pause("nvmereset", hz / 10);
	if (status == 0)
		nvme_ctrlr_start(ctrlr);
	else
		nvme_ctrlr_fail(ctrlr);

	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}

/*
 * Poll all the queues enabled on the device for completion.
 */
void
nvme_ctrlr_poll(struct nvme_controller *ctrlr)
{
	int i;

	nvme_qpair_process_completions(&ctrlr->adminq);

	for (i = 0; i < ctrlr->num_io_queues; i++)
		if (ctrlr->ioq && ctrlr->ioq[i].cpl)
			nvme_qpair_process_completions(&ctrlr->ioq[i]);
}

/*
 * Poll the single-vector interrupt case: num_io_queues will be 1 and
 * there's only a single vector. While we're polling, we mask further
 * interrupts in the controller.
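 *
 * The mask is set by writing INTMS on entry and cleared by writing INTMC on
 * exit. These mask registers are only used here, in the INTx path set up by
 * nvme_ctrlr_configure_intx(); they are not touched when MSI-X is enabled.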
 */
void
nvme_ctrlr_intx_handler(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	nvme_mmio_write_4(ctrlr, intms, 1);
	nvme_ctrlr_poll(ctrlr);
	nvme_mmio_write_4(ctrlr, intmc, 1);
}

static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

	ctrlr->msix_enabled = 0;
	ctrlr->num_io_queues = 1;
	ctrlr->num_cpus_per_ioq = mp_ncpus;
	ctrlr->rid = 0;
	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

	if (ctrlr->res == NULL) {
		nvme_printf(ctrlr, "unable to allocate shared IRQ\n");
		return (ENOMEM);
	}

	bus_setup_intr(ctrlr->dev, ctrlr->res,
	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
	    ctrlr, &ctrlr->tag);

	if (ctrlr->tag == NULL) {
		nvme_printf(ctrlr, "unable to setup intx handler\n");
		return (ENOMEM);
	}

	return (0);
}

static void
nvme_pt_done(void *arg, const struct nvme_completion *cpl)
{
	struct nvme_pt_command *pt = arg;
	struct mtx *mtx = pt->driver_lock;

	bzero(&pt->cpl, sizeof(pt->cpl));
	pt->cpl.cdw0 = cpl->cdw0;
	pt->cpl.status = cpl->status;
	pt->cpl.status.p = 0;

	mtx_lock(mtx);
	pt->driver_lock = NULL;
	wakeup(pt);
	mtx_unlock(mtx);
}

int
nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
    struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
    int is_admin_cmd)
{
	struct nvme_request	*req;
	struct mtx		*mtx;
	struct buf		*buf = NULL;
	int			ret = 0;
	vm_offset_t		addr, end;

	if (pt->len > 0) {
		/*
		 * vmapbuf calls vm_fault_quick_hold_pages which only maps full
		 * pages. Ensure this request has fewer than MAXPHYS bytes when
		 * extended to full pages.
		 */
		addr = (vm_offset_t)pt->buf;
		end = round_page(addr + pt->len);
		addr = trunc_page(addr);
		if (end - addr > MAXPHYS)
			return EIO;

		if (pt->len > ctrlr->max_xfer_size) {
			nvme_printf(ctrlr, "pt->len (%d) "
			    "exceeds max_xfer_size (%d)\n", pt->len,
			    ctrlr->max_xfer_size);
			return EIO;
		}
		if (is_user_buffer) {
			/*
			 * Ensure the user buffer is wired for the duration of
			 * this passthrough command.
			 */
			PHOLD(curproc);
			buf = getpbuf(NULL);
			buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
			if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
				ret = EFAULT;
				goto err;
			}
			req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
			    nvme_pt_done, pt);
		} else
			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
			    nvme_pt_done, pt);
	} else
		req = nvme_allocate_request_null(nvme_pt_done, pt);

	req->cmd.opc = pt->cmd.opc;
	req->cmd.rsvd2 = pt->cmd.rsvd2;
	req->cmd.rsvd3 = pt->cmd.rsvd3;
	req->cmd.cdw10 = pt->cmd.cdw10;
	req->cmd.cdw11 = pt->cmd.cdw11;
	req->cmd.cdw12 = pt->cmd.cdw12;
	req->cmd.cdw13 = pt->cmd.cdw13;
	req->cmd.cdw14 = pt->cmd.cdw14;
	req->cmd.cdw15 = pt->cmd.cdw15;

	req->cmd.nsid = nsid;

	mtx = mtx_pool_find(mtxpool_sleep, pt);
	pt->driver_lock = mtx;

	if (is_admin_cmd)
		nvme_ctrlr_submit_admin_request(ctrlr, req);
	else
		nvme_ctrlr_submit_io_request(ctrlr, req);

	mtx_lock(mtx);
	while (pt->driver_lock != NULL)
		mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
	mtx_unlock(mtx);

err:
	if (buf != NULL) {
		relpbuf(buf, NULL);
		PRELE(curproc);
	}

	return (ret);
}

static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvme_controller	*ctrlr;
	struct nvme_pt_command	*pt;

	ctrlr = cdev->si_drv1;

	switch (cmd) {
	case NVME_RESET_CONTROLLER:
		nvme_ctrlr_reset(ctrlr);
		break;
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, pt->cmd.nsid,
		    1 /* is_user_buffer */, 1 /* is_admin_cmd */));
	case NVME_GET_NSID:
	{
		struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
		strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		break;
	}
	default:
		return (ENOTTY);
	}

	return (0);
}

static struct cdevsw nvme_ctrlr_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	0,
	.d_ioctl =	nvme_ctrlr_ioctl
};

static void
nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
{
	device_t	dev;
	int		per_cpu_io_queues;
	int		min_cpus_per_ioq;
	int		num_vectors_requested, num_vectors_allocated;
	int		num_vectors_available;

	dev = ctrlr->dev;
	min_cpus_per_ioq = 1;
	TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq);

	if (min_cpus_per_ioq < 1) {
		min_cpus_per_ioq = 1;
	} else if (min_cpus_per_ioq > mp_ncpus) {
		min_cpus_per_ioq = mp_ncpus;
	}

	per_cpu_io_queues = 1;
	TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);

	if (per_cpu_io_queues == 0) {
		min_cpus_per_ioq = mp_ncpus;
	}

	ctrlr->force_intx = 0;
	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

	/*
	 * FreeBSD currently cannot allocate more than about 190 vectors at
	 * boot, meaning that systems with high core count and many devices
	 * requesting per-CPU interrupt vectors will not get their full
	 * allotment. So first, try to allocate as many as we may need to
	 * understand what is available, then immediately release them.
	 * Then figure out how many of those we will actually use, based on
	 * assigning an equal number of cores to each I/O queue.
	 */

	/*
	 * One vector per core for the I/O queues, plus one vector for the
	 * admin queue.
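	 *
	 * If fewer vectors than that turn out to be available, the CPUs are
	 * spread evenly over the vectors we can get: e.g. with 32 CPUs but
	 * only 16 usable vectors, each I/O queue serves howmany(32, 15) = 3
	 * CPUs, giving 11 I/O queues and a request for 12 vectors in total.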
	 */
	num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1);
	if (pci_alloc_msix(dev, &num_vectors_available) != 0) {
		num_vectors_available = 0;
	}
	pci_release_msi(dev);

	if (ctrlr->force_intx || num_vectors_available < 2) {
		nvme_ctrlr_configure_intx(ctrlr);
		return;
	}

	/*
	 * Do not use all vectors for I/O queues - one must be saved for the
	 * admin queue.
	 */
	ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq,
	    howmany(mp_ncpus, num_vectors_available - 1));

	ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq);
	num_vectors_requested = ctrlr->num_io_queues + 1;
	num_vectors_allocated = num_vectors_requested;

	/*
	 * Now just allocate the number of vectors we need. This should
	 * succeed, since our earlier call to pci_alloc_msix() returned at
	 * least this many vectors. But just to be safe, if anything goes
	 * wrong revert to INTx.
	 */
	if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) {
		nvme_ctrlr_configure_intx(ctrlr);
		return;
	}

	if (num_vectors_allocated < num_vectors_requested) {
		pci_release_msi(dev);
		nvme_ctrlr_configure_intx(ctrlr);
		return;
	}

	ctrlr->msix_enabled = 1;
}

int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
	struct make_dev_args	md_args;
	union cap_lo_register	cap_lo;
	union cap_hi_register	cap_hi;
	int			status, timeout_period;

	ctrlr->dev = dev;

	mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);

	status = nvme_ctrlr_allocate_bar(ctrlr);

	if (status != 0)
		return (status);

	/*
	 * Software emulators may set the doorbell stride to something
	 * other than zero, but this driver is not set up to handle that.
	 */
	cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
	if (cap_hi.bits.dstrd != 0)
		return (ENXIO);

	ctrlr->min_page_size = 1 << (12 + cap_hi.bits.mpsmin);

	/*
	 * Get the ready timeout value from the controller; CAP.TO is in
	 * units of 500 ms.
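	 * (A CAP.TO value of 30, for example, yields a 15000 ms ready
	 * timeout.)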
	 */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

	timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
	TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
	timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
	timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
	ctrlr->timeout_period = timeout_period;

	nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
	TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);

	ctrlr->enable_aborts = 0;
	TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);

	nvme_ctrlr_setup_interrupts(ctrlr);

	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
	if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
		return (ENXIO);

	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");

	ctrlr->is_resetting = 0;
	ctrlr->is_initialized = 0;
	ctrlr->notification_sent = 0;
	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
	STAILQ_INIT(&ctrlr->fail_req);
	ctrlr->is_failed = FALSE;

	make_dev_args_init(&md_args);
	md_args.mda_devsw = &nvme_ctrlr_cdevsw;
	md_args.mda_uid = UID_ROOT;
	md_args.mda_gid = GID_WHEEL;
	md_args.mda_mode = 0600;
	md_args.mda_unit = device_get_unit(dev);
	md_args.mda_si_drv1 = (void *)ctrlr;
	status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
	    device_get_unit(dev));
	if (status != 0)
		return (ENXIO);

	return (0);
}

void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
	int	gone, i;

	if (ctrlr->resource == NULL)
		goto nores;

	/*
	 * Check whether it is a hot unplug or a clean driver detach.
	 * If the device is not there any more, skip any shutdown commands.
	 */
	gone = (nvme_mmio_read_4(ctrlr, csts) == 0xffffffff);
	if (gone)
		nvme_ctrlr_fail(ctrlr);
	else
		nvme_notify_fail_consumers(ctrlr);

	for (i = 0; i < NVME_MAX_NAMESPACES; i++)
		nvme_ns_destruct(&ctrlr->ns[i]);

	if (ctrlr->cdev)
		destroy_dev(ctrlr->cdev);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		if (!gone)
			nvme_ctrlr_destroy_qpair(ctrlr, &ctrlr->ioq[i]);
		nvme_io_qpair_destroy(&ctrlr->ioq[i]);
	}
	free(ctrlr->ioq, M_NVME);
	nvme_admin_qpair_destroy(&ctrlr->adminq);

	/*
	 * Notify the controller of a shutdown, even though this is due to
	 * a driver unload, not a system shutdown (this path is not invoked
	 * during shutdown). This ensures the controller receives a
	 * shutdown notification in case the system is shut down before
	 * reloading the driver.
	 */
	if (!gone)
		nvme_ctrlr_shutdown(ctrlr);

	if (!gone)
		nvme_ctrlr_disable(ctrlr);

	if (ctrlr->taskqueue)
		taskqueue_free(ctrlr->taskqueue);

	if (ctrlr->tag)
		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);

	if (ctrlr->res)
		bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(ctrlr->res), ctrlr->res);

	if (ctrlr->msix_enabled)
		pci_release_msi(dev);

	if (ctrlr->bar4_resource != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY,
		    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
	}

	bus_release_resource(dev, SYS_RES_MEMORY,
	    ctrlr->resource_id, ctrlr->resource);

nores:
	mtx_destroy(&ctrlr->lock);
}

void
nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	int			ticks = 0;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	cc.bits.shn = NVME_SHN_NORMAL;
	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	while (1) {
		csts.raw = nvme_mmio_read_4(ctrlr, csts);
		if (csts.raw == 0xffffffff)	/* Hot unplug. */
			break;
		if (csts.bits.shst == NVME_SHST_COMPLETE)
			break;
		if (ticks++ > 5*hz) {
			nvme_printf(ctrlr, "did not complete shutdown within"
			    " 5 seconds of notification\n");
			break;
		}
		pause("nvme shn", 1);
	}
}

void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

	nvme_qpair_submit_request(&ctrlr->adminq, req);
}

void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{
	struct nvme_qpair	*qpair;

	qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq];
	nvme_qpair_submit_request(qpair, req);
}

device_t
nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
{

	return (ctrlr->dev);
}

const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
{

	return (&ctrlr->cdata);
}