nvme_qpair.c revision 346244
1/*- 2 * Copyright (C) 2012-2014 Intel Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/11/sys/dev/nvme/nvme_qpair.c 346244 2019-04-15 16:28:25Z mav $"); 29 30#include <sys/param.h> 31#include <sys/bus.h> 32 33#include <dev/pci/pcivar.h> 34 35#include "nvme_private.h" 36 37static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, 38 struct nvme_request *req); 39static void nvme_qpair_destroy(struct nvme_qpair *qpair); 40 41struct nvme_opcode_string { 42 43 uint16_t opc; 44 const char * str; 45}; 46 47static struct nvme_opcode_string admin_opcode[] = { 48 { NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, 49 { NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, 50 { NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, 51 { NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, 52 { NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, 53 { NVME_OPC_IDENTIFY, "IDENTIFY" }, 54 { NVME_OPC_ABORT, "ABORT" }, 55 { NVME_OPC_SET_FEATURES, "SET FEATURES" }, 56 { NVME_OPC_GET_FEATURES, "GET FEATURES" }, 57 { NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, 58 { NVME_OPC_FIRMWARE_ACTIVATE, "FIRMWARE ACTIVATE" }, 59 { NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, 60 { NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, 61 { NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, 62 { NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, 63 { 0xFFFF, "ADMIN COMMAND" } 64}; 65 66static struct nvme_opcode_string io_opcode[] = { 67 { NVME_OPC_FLUSH, "FLUSH" }, 68 { NVME_OPC_WRITE, "WRITE" }, 69 { NVME_OPC_READ, "READ" }, 70 { NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, 71 { NVME_OPC_COMPARE, "COMPARE" }, 72 { NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, 73 { 0xFFFF, "IO COMMAND" } 74}; 75 76static const char * 77get_admin_opcode_string(uint16_t opc) 78{ 79 struct nvme_opcode_string *entry; 80 81 entry = admin_opcode; 82 83 while (entry->opc != 0xFFFF) { 84 if (entry->opc == opc) 85 return (entry->str); 86 entry++; 87 } 88 return (entry->str); 89} 90 91static const char * 92get_io_opcode_string(uint16_t opc) 93{ 94 struct nvme_opcode_string *entry; 95 96 entry = io_opcode; 97 98 while (entry->opc != 0xFFFF) { 99 if (entry->opc == opc) 100 return (entry->str); 101 entry++; 102 } 103 return (entry->str); 104} 105 106 107static void 108nvme_admin_qpair_print_command(struct nvme_qpair *qpair, 109 struct nvme_command *cmd) 110{ 111 112 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " 113 "cdw10:%08x cdw11:%08x\n", 114 get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, 115 cmd->nsid, cmd->cdw10, cmd->cdw11); 116} 117 118static void 119nvme_io_qpair_print_command(struct nvme_qpair *qpair, 120 struct nvme_command *cmd) 121{ 122 123 switch (cmd->opc) { 124 case NVME_OPC_WRITE: 125 case NVME_OPC_READ: 126 case NVME_OPC_WRITE_UNCORRECTABLE: 127 case NVME_OPC_COMPARE: 128 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " 129 "lba:%llu len:%d\n", 130 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, 131 cmd->nsid, 132 ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, 133 (cmd->cdw12 & 0xFFFF) + 1); 134 break; 135 case NVME_OPC_FLUSH: 136 case NVME_OPC_DATASET_MANAGEMENT: 137 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", 138 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, 139 cmd->nsid); 140 break; 141 default: 142 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", 143 get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, 144 cmd->cid, cmd->nsid); 145 break; 146 } 147} 148 149static void 150nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) 151{ 152 if (qpair->id == 0) 153 nvme_admin_qpair_print_command(qpair, cmd); 154 else 155 nvme_io_qpair_print_command(qpair, cmd); 156} 157 158struct nvme_status_string { 159 160 uint16_t sc; 161 const char * str; 162}; 163 164static struct nvme_status_string generic_status[] = { 165 { NVME_SC_SUCCESS, "SUCCESS" }, 166 { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, 167 { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, 168 { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, 169 { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, 170 { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, 171 { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, 172 { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, 173 { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, 174 { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, 175 { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, 176 { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, 177 { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, 178 { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, 179 { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, 180 { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, 181 { 0xFFFF, "GENERIC" } 182}; 183 184static struct nvme_status_string command_specific_status[] = { 185 { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, 186 { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, 187 { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, 188 { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, 189 { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, 190 { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, 191 { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, 192 { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, 193 { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, 194 { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, 195 { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, 196 { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, 197 { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, 198 { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, 199 { 0xFFFF, "COMMAND SPECIFIC" } 200}; 201 202static struct nvme_status_string media_error_status[] = { 203 { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, 204 { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, 205 { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, 206 { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, 207 { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, 208 { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, 209 { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, 210 { 0xFFFF, "MEDIA ERROR" } 211}; 212 213static const char * 214get_status_string(uint16_t sct, uint16_t sc) 215{ 216 struct nvme_status_string *entry; 217 218 switch (sct) { 219 case NVME_SCT_GENERIC: 220 entry = generic_status; 221 break; 222 case NVME_SCT_COMMAND_SPECIFIC: 223 entry = command_specific_status; 224 break; 225 case NVME_SCT_MEDIA_ERROR: 226 entry = media_error_status; 227 break; 228 case NVME_SCT_VENDOR_SPECIFIC: 229 return ("VENDOR SPECIFIC"); 230 default: 231 return ("RESERVED"); 232 } 233 234 while (entry->sc != 0xFFFF) { 235 if (entry->sc == sc) 236 return (entry->str); 237 entry++; 238 } 239 return (entry->str); 240} 241 242static void 243nvme_qpair_print_completion(struct nvme_qpair *qpair, 244 struct nvme_completion *cpl) 245{ 246 nvme_printf(qpair->ctrlr, "%s (%02x/%02x) sqid:%d cid:%d cdw0:%x\n", 247 get_status_string(cpl->status.sct, cpl->status.sc), 248 cpl->status.sct, cpl->status.sc, cpl->sqid, cpl->cid, cpl->cdw0); 249} 250 251static boolean_t 252nvme_completion_is_retry(const struct nvme_completion *cpl) 253{ 254 /* 255 * TODO: spec is not clear how commands that are aborted due 256 * to TLER will be marked. So for now, it seems 257 * NAMESPACE_NOT_READY is the only case where we should 258 * look at the DNR bit. 259 */ 260 switch (cpl->status.sct) { 261 case NVME_SCT_GENERIC: 262 switch (cpl->status.sc) { 263 case NVME_SC_ABORTED_BY_REQUEST: 264 case NVME_SC_NAMESPACE_NOT_READY: 265 if (cpl->status.dnr) 266 return (0); 267 else 268 return (1); 269 case NVME_SC_INVALID_OPCODE: 270 case NVME_SC_INVALID_FIELD: 271 case NVME_SC_COMMAND_ID_CONFLICT: 272 case NVME_SC_DATA_TRANSFER_ERROR: 273 case NVME_SC_ABORTED_POWER_LOSS: 274 case NVME_SC_INTERNAL_DEVICE_ERROR: 275 case NVME_SC_ABORTED_SQ_DELETION: 276 case NVME_SC_ABORTED_FAILED_FUSED: 277 case NVME_SC_ABORTED_MISSING_FUSED: 278 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: 279 case NVME_SC_COMMAND_SEQUENCE_ERROR: 280 case NVME_SC_LBA_OUT_OF_RANGE: 281 case NVME_SC_CAPACITY_EXCEEDED: 282 default: 283 return (0); 284 } 285 case NVME_SCT_COMMAND_SPECIFIC: 286 case NVME_SCT_MEDIA_ERROR: 287 case NVME_SCT_VENDOR_SPECIFIC: 288 default: 289 return (0); 290 } 291} 292 293static void 294nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, 295 struct nvme_completion *cpl, boolean_t print_on_error) 296{ 297 struct nvme_request *req; 298 boolean_t retry, error; 299 300 req = tr->req; 301 error = nvme_completion_is_error(cpl); 302 retry = error && nvme_completion_is_retry(cpl) && 303 req->retries < nvme_retry_count; 304 305 if (error && print_on_error) { 306 nvme_qpair_print_command(qpair, &req->cmd); 307 nvme_qpair_print_completion(qpair, cpl); 308 } 309 310 qpair->act_tr[cpl->cid] = NULL; 311 312 KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); 313 314 if (req->cb_fn && !retry) 315 req->cb_fn(req->cb_arg, cpl); 316 317 mtx_lock(&qpair->lock); 318 callout_stop(&tr->timer); 319 320 if (retry) { 321 req->retries++; 322 nvme_qpair_submit_tracker(qpair, tr); 323 } else { 324 if (req->type != NVME_REQUEST_NULL) { 325 bus_dmamap_sync(qpair->dma_tag_payload, 326 tr->payload_dma_map, 327 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); 328 bus_dmamap_unload(qpair->dma_tag_payload, 329 tr->payload_dma_map); 330 } 331 332 nvme_free_request(req); 333 tr->req = NULL; 334 335 TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq); 336 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); 337 338 /* 339 * If the controller is in the middle of resetting, don't 340 * try to submit queued requests here - let the reset logic 341 * handle that instead. 342 */ 343 if (!STAILQ_EMPTY(&qpair->queued_req) && 344 !qpair->ctrlr->is_resetting) { 345 req = STAILQ_FIRST(&qpair->queued_req); 346 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 347 _nvme_qpair_submit_request(qpair, req); 348 } 349 } 350 351 mtx_unlock(&qpair->lock); 352} 353 354static void 355nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair, 356 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, 357 boolean_t print_on_error) 358{ 359 struct nvme_completion cpl; 360 361 memset(&cpl, 0, sizeof(cpl)); 362 cpl.sqid = qpair->id; 363 cpl.cid = tr->cid; 364 cpl.status.sct = sct; 365 cpl.status.sc = sc; 366 cpl.status.dnr = dnr; 367 nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); 368} 369 370void 371nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, 372 struct nvme_request *req, uint32_t sct, uint32_t sc, 373 boolean_t print_on_error) 374{ 375 struct nvme_completion cpl; 376 boolean_t error; 377 378 memset(&cpl, 0, sizeof(cpl)); 379 cpl.sqid = qpair->id; 380 cpl.status.sct = sct; 381 cpl.status.sc = sc; 382 383 error = nvme_completion_is_error(&cpl); 384 385 if (error && print_on_error) { 386 nvme_qpair_print_command(qpair, &req->cmd); 387 nvme_qpair_print_completion(qpair, &cpl); 388 } 389 390 if (req->cb_fn) 391 req->cb_fn(req->cb_arg, &cpl); 392 393 nvme_free_request(req); 394} 395 396bool 397nvme_qpair_process_completions(struct nvme_qpair *qpair) 398{ 399 struct nvme_tracker *tr; 400 struct nvme_completion *cpl; 401 int done = 0; 402 403 qpair->num_intr_handler_calls++; 404 405 if (!qpair->is_enabled) 406 /* 407 * qpair is not enabled, likely because a controller reset is 408 * is in progress. Ignore the interrupt - any I/O that was 409 * associated with this interrupt will get retried when the 410 * reset is complete. 411 */ 412 return (false); 413 414 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, 415 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); 416 while (1) { 417 cpl = &qpair->cpl[qpair->cq_head]; 418 419 if (cpl->status.p != qpair->phase) 420 break; 421 422 tr = qpair->act_tr[cpl->cid]; 423 424 if (tr != NULL) { 425 nvme_qpair_complete_tracker(qpair, tr, cpl, TRUE); 426 qpair->sq_head = cpl->sqhd; 427 done++; 428 } else { 429 nvme_printf(qpair->ctrlr, 430 "cpl does not map to outstanding cmd\n"); 431 nvme_dump_completion(cpl); 432 KASSERT(0, ("received completion for unknown cmd\n")); 433 } 434 435 if (++qpair->cq_head == qpair->num_entries) { 436 qpair->cq_head = 0; 437 qpair->phase = !qpair->phase; 438 } 439 440 nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, 441 qpair->cq_head); 442 } 443 return (done != 0); 444} 445 446static void 447nvme_qpair_msix_handler(void *arg) 448{ 449 struct nvme_qpair *qpair = arg; 450 451 nvme_qpair_process_completions(qpair); 452} 453 454int 455nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, 456 uint16_t vector, uint32_t num_entries, uint32_t num_trackers, 457 struct nvme_controller *ctrlr) 458{ 459 struct nvme_tracker *tr; 460 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz; 461 uint64_t queuemem_phys, prpmem_phys, list_phys; 462 uint8_t *queuemem, *prpmem, *prp_list; 463 int i, err; 464 465 qpair->id = id; 466 qpair->vector = vector; 467 qpair->num_entries = num_entries; 468 qpair->num_trackers = num_trackers; 469 qpair->ctrlr = ctrlr; 470 471 if (ctrlr->msix_enabled) { 472 473 /* 474 * MSI-X vector resource IDs start at 1, so we add one to 475 * the queue's vector to get the corresponding rid to use. 476 */ 477 qpair->rid = vector + 1; 478 479 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, 480 &qpair->rid, RF_ACTIVE); 481 bus_setup_intr(ctrlr->dev, qpair->res, 482 INTR_TYPE_MISC | INTR_MPSAFE, NULL, 483 nvme_qpair_msix_handler, qpair, &qpair->tag); 484 if (id == 0) { 485 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, 486 "admin"); 487 } else { 488 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, 489 "io%d", id - 1); 490 } 491 } 492 493 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); 494 495 /* Note: NVMe PRP format is restricted to 4-byte alignment. */ 496 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 497 4, PAGE_SIZE, BUS_SPACE_MAXADDR, 498 BUS_SPACE_MAXADDR, NULL, NULL, NVME_MAX_XFER_SIZE, 499 (NVME_MAX_XFER_SIZE/PAGE_SIZE)+1, PAGE_SIZE, 0, 500 NULL, NULL, &qpair->dma_tag_payload); 501 if (err != 0) { 502 nvme_printf(ctrlr, "payload tag create failed %d\n", err); 503 goto out; 504 } 505 506 /* 507 * Each component must be page aligned, and individual PRP lists 508 * cannot cross a page boundary. 509 */ 510 cmdsz = qpair->num_entries * sizeof(struct nvme_command); 511 cmdsz = roundup2(cmdsz, PAGE_SIZE); 512 cplsz = qpair->num_entries * sizeof(struct nvme_completion); 513 cplsz = roundup2(cplsz, PAGE_SIZE); 514 prpsz = sizeof(uint64_t) * NVME_MAX_PRP_LIST_ENTRIES;; 515 prpmemsz = qpair->num_trackers * prpsz; 516 allocsz = cmdsz + cplsz + prpmemsz; 517 518 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 519 PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, 520 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag); 521 if (err != 0) { 522 nvme_printf(ctrlr, "tag create failed %d\n", err); 523 goto out; 524 } 525 526 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, 527 BUS_DMA_NOWAIT, &qpair->queuemem_map)) { 528 nvme_printf(ctrlr, "failed to alloc qpair memory\n"); 529 goto out; 530 } 531 532 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map, 533 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) { 534 nvme_printf(ctrlr, "failed to load qpair memory\n"); 535 goto out; 536 } 537 538 qpair->num_cmds = 0; 539 qpair->num_intr_handler_calls = 0; 540 qpair->cmd = (struct nvme_command *)queuemem; 541 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); 542 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); 543 qpair->cmd_bus_addr = queuemem_phys; 544 qpair->cpl_bus_addr = queuemem_phys + cmdsz; 545 prpmem_phys = queuemem_phys + cmdsz + cplsz; 546 547 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl); 548 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl); 549 550 TAILQ_INIT(&qpair->free_tr); 551 TAILQ_INIT(&qpair->outstanding_tr); 552 STAILQ_INIT(&qpair->queued_req); 553 554 list_phys = prpmem_phys; 555 prp_list = prpmem; 556 for (i = 0; i < qpair->num_trackers; i++) { 557 558 if (list_phys + prpsz > prpmem_phys + prpmemsz) { 559 qpair->num_trackers = i; 560 break; 561 } 562 563 /* 564 * Make sure that the PRP list for this tracker doesn't 565 * overflow to another page. 566 */ 567 if (trunc_page(list_phys) != 568 trunc_page(list_phys + prpsz - 1)) { 569 list_phys = roundup2(list_phys, PAGE_SIZE); 570 prp_list = 571 (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); 572 } 573 574 tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); 575 bus_dmamap_create(qpair->dma_tag_payload, 0, 576 &tr->payload_dma_map); 577 callout_init(&tr->timer, 1); 578 tr->cid = i; 579 tr->qpair = qpair; 580 tr->prp = (uint64_t *)prp_list; 581 tr->prp_bus_addr = list_phys; 582 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); 583 list_phys += prpsz; 584 prp_list += prpsz; 585 } 586 587 if (qpair->num_trackers == 0) { 588 nvme_printf(ctrlr, "failed to allocate enough trackers\n"); 589 goto out; 590 } 591 592 qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * 593 qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); 594 return (0); 595 596out: 597 nvme_qpair_destroy(qpair); 598 return (ENOMEM); 599} 600 601static void 602nvme_qpair_destroy(struct nvme_qpair *qpair) 603{ 604 struct nvme_tracker *tr; 605 606 if (qpair->tag) 607 bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); 608 609 if (mtx_initialized(&qpair->lock)) 610 mtx_destroy(&qpair->lock); 611 612 if (qpair->res) 613 bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, 614 rman_get_rid(qpair->res), qpair->res); 615 616 if (qpair->cmd != NULL) { 617 bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map); 618 bus_dmamem_free(qpair->dma_tag, qpair->cmd, 619 qpair->queuemem_map); 620 } 621 622 if (qpair->act_tr) 623 free(qpair->act_tr, M_NVME); 624 625 while (!TAILQ_EMPTY(&qpair->free_tr)) { 626 tr = TAILQ_FIRST(&qpair->free_tr); 627 TAILQ_REMOVE(&qpair->free_tr, tr, tailq); 628 bus_dmamap_destroy(qpair->dma_tag_payload, 629 tr->payload_dma_map); 630 free(tr, M_NVME); 631 } 632 633 if (qpair->dma_tag) 634 bus_dma_tag_destroy(qpair->dma_tag); 635 636 if (qpair->dma_tag_payload) 637 bus_dma_tag_destroy(qpair->dma_tag_payload); 638} 639 640static void 641nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) 642{ 643 struct nvme_tracker *tr; 644 645 tr = TAILQ_FIRST(&qpair->outstanding_tr); 646 while (tr != NULL) { 647 if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { 648 nvme_qpair_manual_complete_tracker(qpair, tr, 649 NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, 650 FALSE); 651 tr = TAILQ_FIRST(&qpair->outstanding_tr); 652 } else { 653 tr = TAILQ_NEXT(tr, tailq); 654 } 655 } 656} 657 658void 659nvme_admin_qpair_destroy(struct nvme_qpair *qpair) 660{ 661 662 nvme_admin_qpair_abort_aers(qpair); 663 nvme_qpair_destroy(qpair); 664} 665 666void 667nvme_io_qpair_destroy(struct nvme_qpair *qpair) 668{ 669 670 nvme_qpair_destroy(qpair); 671} 672 673static void 674nvme_abort_complete(void *arg, const struct nvme_completion *status) 675{ 676 struct nvme_tracker *tr = arg; 677 678 /* 679 * If cdw0 == 1, the controller was not able to abort the command 680 * we requested. We still need to check the active tracker array, 681 * to cover race where I/O timed out at same time controller was 682 * completing the I/O. 683 */ 684 if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { 685 /* 686 * An I/O has timed out, and the controller was unable to 687 * abort it for some reason. Construct a fake completion 688 * status, and then complete the I/O's tracker manually. 689 */ 690 nvme_printf(tr->qpair->ctrlr, 691 "abort command failed, aborting command manually\n"); 692 nvme_qpair_manual_complete_tracker(tr->qpair, tr, 693 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); 694 } 695} 696 697static void 698nvme_timeout(void *arg) 699{ 700 struct nvme_tracker *tr = arg; 701 struct nvme_qpair *qpair = tr->qpair; 702 struct nvme_controller *ctrlr = qpair->ctrlr; 703 union csts_register csts; 704 705 /* 706 * Read csts to get value of cfs - controller fatal status. 707 * If no fatal status, try to call the completion routine, and 708 * if completes transactions, report a missed interrupt and 709 * return (this may need to be rate limited). Otherwise, if 710 * aborts are enabled and the controller is not reporting 711 * fatal status, abort the command. Otherwise, just reset the 712 * controller and hope for the best. 713 */ 714 csts.raw = nvme_mmio_read_4(ctrlr, csts); 715 if (csts.bits.cfs == 0 && nvme_qpair_process_completions(qpair)) { 716 nvme_printf(ctrlr, "Missing interrupt\n"); 717 return; 718 } 719 if (ctrlr->enable_aborts && csts.bits.cfs == 0) { 720 nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); 721 nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, 722 nvme_abort_complete, tr); 723 } else { 724 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", 725 csts.bits.cfs ? " and fatal error status" : ""); 726 nvme_ctrlr_reset(ctrlr); 727 } 728} 729 730void 731nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) 732{ 733 struct nvme_request *req; 734 struct nvme_controller *ctrlr; 735 736 mtx_assert(&qpair->lock, MA_OWNED); 737 738 req = tr->req; 739 req->cmd.cid = tr->cid; 740 qpair->act_tr[tr->cid] = tr; 741 ctrlr = qpair->ctrlr; 742 743 if (req->timeout) 744 callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, 745 nvme_timeout, tr); 746 747 /* Copy the command from the tracker to the submission queue. */ 748 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); 749 750 if (++qpair->sq_tail == qpair->num_entries) 751 qpair->sq_tail = 0; 752 753 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, 754 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 755#ifndef __powerpc__ 756 /* 757 * powerpc's bus_dmamap_sync() already includes a heavyweight sync, but 758 * no other archs do. 759 */ 760 wmb(); 761#endif 762 763 nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl, 764 qpair->sq_tail); 765 766 qpair->num_cmds++; 767} 768 769static void 770nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) 771{ 772 struct nvme_tracker *tr = arg; 773 uint32_t cur_nseg; 774 775 /* 776 * If the mapping operation failed, return immediately. The caller 777 * is responsible for detecting the error status and failing the 778 * tracker manually. 779 */ 780 if (error != 0) { 781 nvme_printf(tr->qpair->ctrlr, 782 "nvme_payload_map err %d\n", error); 783 return; 784 } 785 786 /* 787 * Note that we specified PAGE_SIZE for alignment and max 788 * segment size when creating the bus dma tags. So here 789 * we can safely just transfer each segment to its 790 * associated PRP entry. 791 */ 792 tr->req->cmd.prp1 = seg[0].ds_addr; 793 794 if (nseg == 2) { 795 tr->req->cmd.prp2 = seg[1].ds_addr; 796 } else if (nseg > 2) { 797 cur_nseg = 1; 798 tr->req->cmd.prp2 = (uint64_t)tr->prp_bus_addr; 799 while (cur_nseg < nseg) { 800 tr->prp[cur_nseg-1] = 801 (uint64_t)seg[cur_nseg].ds_addr; 802 cur_nseg++; 803 } 804 } else { 805 /* 806 * prp2 should not be used by the controller 807 * since there is only one segment, but set 808 * to 0 just to be safe. 809 */ 810 tr->req->cmd.prp2 = 0; 811 } 812 813 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map, 814 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 815 nvme_qpair_submit_tracker(tr->qpair, tr); 816} 817 818static void 819_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) 820{ 821 struct nvme_tracker *tr; 822 int err = 0; 823 824 mtx_assert(&qpair->lock, MA_OWNED); 825 826 tr = TAILQ_FIRST(&qpair->free_tr); 827 req->qpair = qpair; 828 829 if (tr == NULL || !qpair->is_enabled) { 830 /* 831 * No tracker is available, or the qpair is disabled due to 832 * an in-progress controller-level reset or controller 833 * failure. 834 */ 835 836 if (qpair->ctrlr->is_failed) { 837 /* 838 * The controller has failed. Post the request to a 839 * task where it will be aborted, so that we do not 840 * invoke the request's callback in the context 841 * of the submission. 842 */ 843 nvme_ctrlr_post_failed_request(qpair->ctrlr, req); 844 } else { 845 /* 846 * Put the request on the qpair's request queue to be 847 * processed when a tracker frees up via a command 848 * completion or when the controller reset is 849 * completed. 850 */ 851 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); 852 } 853 return; 854 } 855 856 TAILQ_REMOVE(&qpair->free_tr, tr, tailq); 857 TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); 858 tr->req = req; 859 860 switch (req->type) { 861 case NVME_REQUEST_VADDR: 862 KASSERT(req->payload_size <= qpair->ctrlr->max_xfer_size, 863 ("payload_size (%d) exceeds max_xfer_size (%d)\n", 864 req->payload_size, qpair->ctrlr->max_xfer_size)); 865 err = bus_dmamap_load(tr->qpair->dma_tag_payload, 866 tr->payload_dma_map, req->u.payload, req->payload_size, 867 nvme_payload_map, tr, 0); 868 if (err != 0) 869 nvme_printf(qpair->ctrlr, 870 "bus_dmamap_load returned 0x%x!\n", err); 871 break; 872 case NVME_REQUEST_NULL: 873 nvme_qpair_submit_tracker(tr->qpair, tr); 874 break; 875 case NVME_REQUEST_BIO: 876 KASSERT(req->u.bio->bio_bcount <= qpair->ctrlr->max_xfer_size, 877 ("bio->bio_bcount (%jd) exceeds max_xfer_size (%d)\n", 878 (intmax_t)req->u.bio->bio_bcount, 879 qpair->ctrlr->max_xfer_size)); 880 err = bus_dmamap_load_bio(tr->qpair->dma_tag_payload, 881 tr->payload_dma_map, req->u.bio, nvme_payload_map, tr, 0); 882 if (err != 0) 883 nvme_printf(qpair->ctrlr, 884 "bus_dmamap_load_bio returned 0x%x!\n", err); 885 break; 886 case NVME_REQUEST_CCB: 887 err = bus_dmamap_load_ccb(tr->qpair->dma_tag_payload, 888 tr->payload_dma_map, req->u.payload, 889 nvme_payload_map, tr, 0); 890 if (err != 0) 891 nvme_printf(qpair->ctrlr, 892 "bus_dmamap_load_ccb returned 0x%x!\n", err); 893 break; 894 default: 895 panic("unknown nvme request type 0x%x\n", req->type); 896 break; 897 } 898 899 if (err != 0) { 900 /* 901 * The dmamap operation failed, so we manually fail the 902 * tracker here with DATA_TRANSFER_ERROR status. 903 * 904 * nvme_qpair_manual_complete_tracker must not be called 905 * with the qpair lock held. 906 */ 907 mtx_unlock(&qpair->lock); 908 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 909 NVME_SC_DATA_TRANSFER_ERROR, 1 /* do not retry */, TRUE); 910 mtx_lock(&qpair->lock); 911 } 912} 913 914void 915nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) 916{ 917 918 mtx_lock(&qpair->lock); 919 _nvme_qpair_submit_request(qpair, req); 920 mtx_unlock(&qpair->lock); 921} 922 923static void 924nvme_qpair_enable(struct nvme_qpair *qpair) 925{ 926 927 qpair->is_enabled = TRUE; 928} 929 930void 931nvme_qpair_reset(struct nvme_qpair *qpair) 932{ 933 934 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0; 935 936 /* 937 * First time through the completion queue, HW will set phase 938 * bit on completions to 1. So set this to 1 here, indicating 939 * we're looking for a 1 to know which entries have completed. 940 * we'll toggle the bit each time when the completion queue 941 * rolls over. 942 */ 943 qpair->phase = 1; 944 945 memset(qpair->cmd, 0, 946 qpair->num_entries * sizeof(struct nvme_command)); 947 memset(qpair->cpl, 0, 948 qpair->num_entries * sizeof(struct nvme_completion)); 949} 950 951void 952nvme_admin_qpair_enable(struct nvme_qpair *qpair) 953{ 954 struct nvme_tracker *tr; 955 struct nvme_tracker *tr_temp; 956 957 /* 958 * Manually abort each outstanding admin command. Do not retry 959 * admin commands found here, since they will be left over from 960 * a controller reset and its likely the context in which the 961 * command was issued no longer applies. 962 */ 963 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { 964 nvme_printf(qpair->ctrlr, 965 "aborting outstanding admin command\n"); 966 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 967 NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); 968 } 969 970 nvme_qpair_enable(qpair); 971} 972 973void 974nvme_io_qpair_enable(struct nvme_qpair *qpair) 975{ 976 STAILQ_HEAD(, nvme_request) temp; 977 struct nvme_tracker *tr; 978 struct nvme_tracker *tr_temp; 979 struct nvme_request *req; 980 981 /* 982 * Manually abort each outstanding I/O. This normally results in a 983 * retry, unless the retry count on the associated request has 984 * reached its limit. 985 */ 986 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { 987 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); 988 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 989 NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); 990 } 991 992 mtx_lock(&qpair->lock); 993 994 nvme_qpair_enable(qpair); 995 996 STAILQ_INIT(&temp); 997 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request); 998 999 while (!STAILQ_EMPTY(&temp)) { 1000 req = STAILQ_FIRST(&temp); 1001 STAILQ_REMOVE_HEAD(&temp, stailq); 1002 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n"); 1003 nvme_qpair_print_command(qpair, &req->cmd); 1004 _nvme_qpair_submit_request(qpair, req); 1005 } 1006 1007 mtx_unlock(&qpair->lock); 1008} 1009 1010static void 1011nvme_qpair_disable(struct nvme_qpair *qpair) 1012{ 1013 struct nvme_tracker *tr; 1014 1015 qpair->is_enabled = FALSE; 1016 mtx_lock(&qpair->lock); 1017 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) 1018 callout_stop(&tr->timer); 1019 mtx_unlock(&qpair->lock); 1020} 1021 1022void 1023nvme_admin_qpair_disable(struct nvme_qpair *qpair) 1024{ 1025 1026 nvme_qpair_disable(qpair); 1027 nvme_admin_qpair_abort_aers(qpair); 1028} 1029 1030void 1031nvme_io_qpair_disable(struct nvme_qpair *qpair) 1032{ 1033 1034 nvme_qpair_disable(qpair); 1035} 1036 1037void 1038nvme_qpair_fail(struct nvme_qpair *qpair) 1039{ 1040 struct nvme_tracker *tr; 1041 struct nvme_request *req; 1042 1043 if (!mtx_initialized(&qpair->lock)) 1044 return; 1045 1046 mtx_lock(&qpair->lock); 1047 1048 while (!STAILQ_EMPTY(&qpair->queued_req)) { 1049 req = STAILQ_FIRST(&qpair->queued_req); 1050 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 1051 nvme_printf(qpair->ctrlr, "failing queued i/o\n"); 1052 mtx_unlock(&qpair->lock); 1053 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, 1054 NVME_SC_ABORTED_BY_REQUEST, TRUE); 1055 mtx_lock(&qpair->lock); 1056 } 1057 1058 /* Manually abort each outstanding I/O. */ 1059 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { 1060 tr = TAILQ_FIRST(&qpair->outstanding_tr); 1061 /* 1062 * Do not remove the tracker. The abort_tracker path will 1063 * do that for us. 1064 */ 1065 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); 1066 mtx_unlock(&qpair->lock); 1067 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 1068 NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); 1069 mtx_lock(&qpair->lock); 1070 } 1071 1072 mtx_unlock(&qpair->lock); 1073} 1074 1075