/* nvme_qpair.c revision 346238 */
1/*- 2 * Copyright (C) 2012-2014 Intel Corporation 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/11/sys/dev/nvme/nvme_qpair.c 346238 2019-04-15 15:35:42Z mav $"); 29 30#include <sys/param.h> 31#include <sys/bus.h> 32 33#include <dev/pci/pcivar.h> 34 35#include "nvme_private.h" 36 37static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, 38 struct nvme_request *req); 39static void nvme_qpair_destroy(struct nvme_qpair *qpair); 40 41struct nvme_opcode_string { 42 43 uint16_t opc; 44 const char * str; 45}; 46 47static struct nvme_opcode_string admin_opcode[] = { 48 { NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, 49 { NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, 50 { NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, 51 { NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, 52 { NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, 53 { NVME_OPC_IDENTIFY, "IDENTIFY" }, 54 { NVME_OPC_ABORT, "ABORT" }, 55 { NVME_OPC_SET_FEATURES, "SET FEATURES" }, 56 { NVME_OPC_GET_FEATURES, "GET FEATURES" }, 57 { NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, 58 { NVME_OPC_FIRMWARE_ACTIVATE, "FIRMWARE ACTIVATE" }, 59 { NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, 60 { NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, 61 { NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, 62 { NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, 63 { 0xFFFF, "ADMIN COMMAND" } 64}; 65 66static struct nvme_opcode_string io_opcode[] = { 67 { NVME_OPC_FLUSH, "FLUSH" }, 68 { NVME_OPC_WRITE, "WRITE" }, 69 { NVME_OPC_READ, "READ" }, 70 { NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, 71 { NVME_OPC_COMPARE, "COMPARE" }, 72 { NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, 73 { 0xFFFF, "IO COMMAND" } 74}; 75 76static const char * 77get_admin_opcode_string(uint16_t opc) 78{ 79 struct nvme_opcode_string *entry; 80 81 entry = admin_opcode; 82 83 while (entry->opc != 0xFFFF) { 84 if (entry->opc == opc) 85 return (entry->str); 86 entry++; 87 } 88 return (entry->str); 89} 90 91static const char * 92get_io_opcode_string(uint16_t opc) 93{ 94 struct nvme_opcode_string *entry; 
95 96 entry = io_opcode; 97 98 while (entry->opc != 0xFFFF) { 99 if (entry->opc == opc) 100 return (entry->str); 101 entry++; 102 } 103 return (entry->str); 104} 105 106 107static void 108nvme_admin_qpair_print_command(struct nvme_qpair *qpair, 109 struct nvme_command *cmd) 110{ 111 112 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " 113 "cdw10:%08x cdw11:%08x\n", 114 get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, 115 cmd->nsid, cmd->cdw10, cmd->cdw11); 116} 117 118static void 119nvme_io_qpair_print_command(struct nvme_qpair *qpair, 120 struct nvme_command *cmd) 121{ 122 123 switch (cmd->opc) { 124 case NVME_OPC_WRITE: 125 case NVME_OPC_READ: 126 case NVME_OPC_WRITE_UNCORRECTABLE: 127 case NVME_OPC_COMPARE: 128 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " 129 "lba:%llu len:%d\n", 130 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, 131 cmd->nsid, 132 ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, 133 (cmd->cdw12 & 0xFFFF) + 1); 134 break; 135 case NVME_OPC_FLUSH: 136 case NVME_OPC_DATASET_MANAGEMENT: 137 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", 138 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, 139 cmd->nsid); 140 break; 141 default: 142 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", 143 get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, 144 cmd->cid, cmd->nsid); 145 break; 146 } 147} 148 149static void 150nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) 151{ 152 if (qpair->id == 0) 153 nvme_admin_qpair_print_command(qpair, cmd); 154 else 155 nvme_io_qpair_print_command(qpair, cmd); 156} 157 158struct nvme_status_string { 159 160 uint16_t sc; 161 const char * str; 162}; 163 164static struct nvme_status_string generic_status[] = { 165 { NVME_SC_SUCCESS, "SUCCESS" }, 166 { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, 167 { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, 168 { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, 169 { 
NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, 170 { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, 171 { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, 172 { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, 173 { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, 174 { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, 175 { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, 176 { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, 177 { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, 178 { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, 179 { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, 180 { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, 181 { 0xFFFF, "GENERIC" } 182}; 183 184static struct nvme_status_string command_specific_status[] = { 185 { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, 186 { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, 187 { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, 188 { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, 189 { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, 190 { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, 191 { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, 192 { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, 193 { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, 194 { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, 195 { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, 196 { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, 197 { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, 198 { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, 199 { 0xFFFF, "COMMAND SPECIFIC" } 200}; 201 202static struct nvme_status_string media_error_status[] = { 203 { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, 204 { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, 205 { 
NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, 206 { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, 207 { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, 208 { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, 209 { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, 210 { 0xFFFF, "MEDIA ERROR" } 211}; 212 213static const char * 214get_status_string(uint16_t sct, uint16_t sc) 215{ 216 struct nvme_status_string *entry; 217 218 switch (sct) { 219 case NVME_SCT_GENERIC: 220 entry = generic_status; 221 break; 222 case NVME_SCT_COMMAND_SPECIFIC: 223 entry = command_specific_status; 224 break; 225 case NVME_SCT_MEDIA_ERROR: 226 entry = media_error_status; 227 break; 228 case NVME_SCT_VENDOR_SPECIFIC: 229 return ("VENDOR SPECIFIC"); 230 default: 231 return ("RESERVED"); 232 } 233 234 while (entry->sc != 0xFFFF) { 235 if (entry->sc == sc) 236 return (entry->str); 237 entry++; 238 } 239 return (entry->str); 240} 241 242static void 243nvme_qpair_print_completion(struct nvme_qpair *qpair, 244 struct nvme_completion *cpl) 245{ 246 nvme_printf(qpair->ctrlr, "%s (%02x/%02x) sqid:%d cid:%d cdw0:%x\n", 247 get_status_string(cpl->status.sct, cpl->status.sc), 248 cpl->status.sct, cpl->status.sc, cpl->sqid, cpl->cid, cpl->cdw0); 249} 250 251static boolean_t 252nvme_completion_is_retry(const struct nvme_completion *cpl) 253{ 254 /* 255 * TODO: spec is not clear how commands that are aborted due 256 * to TLER will be marked. So for now, it seems 257 * NAMESPACE_NOT_READY is the only case where we should 258 * look at the DNR bit. 
259 */ 260 switch (cpl->status.sct) { 261 case NVME_SCT_GENERIC: 262 switch (cpl->status.sc) { 263 case NVME_SC_ABORTED_BY_REQUEST: 264 case NVME_SC_NAMESPACE_NOT_READY: 265 if (cpl->status.dnr) 266 return (0); 267 else 268 return (1); 269 case NVME_SC_INVALID_OPCODE: 270 case NVME_SC_INVALID_FIELD: 271 case NVME_SC_COMMAND_ID_CONFLICT: 272 case NVME_SC_DATA_TRANSFER_ERROR: 273 case NVME_SC_ABORTED_POWER_LOSS: 274 case NVME_SC_INTERNAL_DEVICE_ERROR: 275 case NVME_SC_ABORTED_SQ_DELETION: 276 case NVME_SC_ABORTED_FAILED_FUSED: 277 case NVME_SC_ABORTED_MISSING_FUSED: 278 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: 279 case NVME_SC_COMMAND_SEQUENCE_ERROR: 280 case NVME_SC_LBA_OUT_OF_RANGE: 281 case NVME_SC_CAPACITY_EXCEEDED: 282 default: 283 return (0); 284 } 285 case NVME_SCT_COMMAND_SPECIFIC: 286 case NVME_SCT_MEDIA_ERROR: 287 case NVME_SCT_VENDOR_SPECIFIC: 288 default: 289 return (0); 290 } 291} 292 293static void 294nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, 295 struct nvme_completion *cpl, boolean_t print_on_error) 296{ 297 struct nvme_request *req; 298 boolean_t retry, error; 299 300 req = tr->req; 301 error = nvme_completion_is_error(cpl); 302 retry = error && nvme_completion_is_retry(cpl) && 303 req->retries < nvme_retry_count; 304 305 if (error && print_on_error) { 306 nvme_qpair_print_command(qpair, &req->cmd); 307 nvme_qpair_print_completion(qpair, cpl); 308 } 309 310 qpair->act_tr[cpl->cid] = NULL; 311 312 KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); 313 314 if (req->cb_fn && !retry) 315 req->cb_fn(req->cb_arg, cpl); 316 317 mtx_lock(&qpair->lock); 318 callout_stop(&tr->timer); 319 320 if (retry) { 321 req->retries++; 322 nvme_qpair_submit_tracker(qpair, tr); 323 } else { 324 if (req->type != NVME_REQUEST_NULL) { 325 bus_dmamap_sync(qpair->dma_tag_payload, 326 tr->payload_dma_map, 327 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); 328 bus_dmamap_unload(qpair->dma_tag_payload, 329 
tr->payload_dma_map); 330 } 331 332 nvme_free_request(req); 333 tr->req = NULL; 334 335 TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq); 336 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); 337 338 /* 339 * If the controller is in the middle of resetting, don't 340 * try to submit queued requests here - let the reset logic 341 * handle that instead. 342 */ 343 if (!STAILQ_EMPTY(&qpair->queued_req) && 344 !qpair->ctrlr->is_resetting) { 345 req = STAILQ_FIRST(&qpair->queued_req); 346 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 347 _nvme_qpair_submit_request(qpair, req); 348 } 349 } 350 351 mtx_unlock(&qpair->lock); 352} 353 354static void 355nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair, 356 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, 357 boolean_t print_on_error) 358{ 359 struct nvme_completion cpl; 360 361 memset(&cpl, 0, sizeof(cpl)); 362 cpl.sqid = qpair->id; 363 cpl.cid = tr->cid; 364 cpl.status.sct = sct; 365 cpl.status.sc = sc; 366 cpl.status.dnr = dnr; 367 nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); 368} 369 370void 371nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, 372 struct nvme_request *req, uint32_t sct, uint32_t sc, 373 boolean_t print_on_error) 374{ 375 struct nvme_completion cpl; 376 boolean_t error; 377 378 memset(&cpl, 0, sizeof(cpl)); 379 cpl.sqid = qpair->id; 380 cpl.status.sct = sct; 381 cpl.status.sc = sc; 382 383 error = nvme_completion_is_error(&cpl); 384 385 if (error && print_on_error) { 386 nvme_qpair_print_command(qpair, &req->cmd); 387 nvme_qpair_print_completion(qpair, &cpl); 388 } 389 390 if (req->cb_fn) 391 req->cb_fn(req->cb_arg, &cpl); 392 393 nvme_free_request(req); 394} 395 396bool 397nvme_qpair_process_completions(struct nvme_qpair *qpair) 398{ 399 struct nvme_tracker *tr; 400 struct nvme_completion *cpl; 401 int done = 0; 402 403 qpair->num_intr_handler_calls++; 404 405 if (!qpair->is_enabled) 406 /* 407 * qpair is not enabled, likely because a controller reset 
is 408 * is in progress. Ignore the interrupt - any I/O that was 409 * associated with this interrupt will get retried when the 410 * reset is complete. 411 */ 412 return (false); 413 414 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, 415 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); 416 while (1) { 417 cpl = &qpair->cpl[qpair->cq_head]; 418 419 if (cpl->status.p != qpair->phase) 420 break; 421 422 tr = qpair->act_tr[cpl->cid]; 423 424 if (tr != NULL) { 425 nvme_qpair_complete_tracker(qpair, tr, cpl, TRUE); 426 qpair->sq_head = cpl->sqhd; 427 done++; 428 } else { 429 nvme_printf(qpair->ctrlr, 430 "cpl does not map to outstanding cmd\n"); 431 nvme_dump_completion(cpl); 432 KASSERT(0, ("received completion for unknown cmd\n")); 433 } 434 435 if (++qpair->cq_head == qpair->num_entries) { 436 qpair->cq_head = 0; 437 qpair->phase = !qpair->phase; 438 } 439 440 nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, 441 qpair->cq_head); 442 } 443 return (done != 0); 444} 445 446static void 447nvme_qpair_msix_handler(void *arg) 448{ 449 struct nvme_qpair *qpair = arg; 450 451 nvme_qpair_process_completions(qpair); 452} 453 454int 455nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, 456 uint16_t vector, uint32_t num_entries, uint32_t num_trackers, 457 struct nvme_controller *ctrlr) 458{ 459 struct nvme_tracker *tr; 460 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz; 461 uint64_t queuemem_phys, prpmem_phys, list_phys; 462 uint8_t *queuemem, *prpmem, *prp_list; 463 int i, err; 464 465 qpair->id = id; 466 qpair->vector = vector; 467 qpair->num_entries = num_entries; 468 qpair->num_trackers = num_trackers; 469 qpair->ctrlr = ctrlr; 470 471 if (ctrlr->msix_enabled) { 472 473 /* 474 * MSI-X vector resource IDs start at 1, so we add one to 475 * the queue's vector to get the corresponding rid to use. 
476 */ 477 qpair->rid = vector + 1; 478 479 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, 480 &qpair->rid, RF_ACTIVE); 481 bus_setup_intr(ctrlr->dev, qpair->res, 482 INTR_TYPE_MISC | INTR_MPSAFE, NULL, 483 nvme_qpair_msix_handler, qpair, &qpair->tag); 484 if (id == 0) { 485 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, 486 "admin"); 487 } else { 488 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, 489 "io%d", id - 1); 490 } 491 } 492 493 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); 494 495 /* Note: NVMe PRP format is restricted to 4-byte alignment. */ 496 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 497 4, PAGE_SIZE, BUS_SPACE_MAXADDR, 498 BUS_SPACE_MAXADDR, NULL, NULL, NVME_MAX_XFER_SIZE, 499 (NVME_MAX_XFER_SIZE/PAGE_SIZE)+1, PAGE_SIZE, 0, 500 NULL, NULL, &qpair->dma_tag_payload); 501 if (err != 0) { 502 nvme_printf(ctrlr, "payload tag create failed %d\n", err); 503 goto out; 504 } 505 506 /* 507 * Each component must be page aligned, and individual PRP lists 508 * cannot cross a page boundary. 
509 */ 510 cmdsz = qpair->num_entries * sizeof(struct nvme_command); 511 cmdsz = roundup2(cmdsz, PAGE_SIZE); 512 cplsz = qpair->num_entries * sizeof(struct nvme_completion); 513 cplsz = roundup2(cplsz, PAGE_SIZE); 514 prpsz = sizeof(uint64_t) * NVME_MAX_PRP_LIST_ENTRIES;; 515 prpmemsz = qpair->num_trackers * prpsz; 516 allocsz = cmdsz + cplsz + prpmemsz; 517 518 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 519 PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, 520 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag); 521 if (err != 0) { 522 nvme_printf(ctrlr, "tag create failed %d\n", err); 523 goto out; 524 } 525 526 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, 527 BUS_DMA_NOWAIT, &qpair->queuemem_map)) { 528 nvme_printf(ctrlr, "failed to alloc qpair memory\n"); 529 goto out; 530 } 531 532 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map, 533 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) { 534 nvme_printf(ctrlr, "failed to load qpair memory\n"); 535 goto out; 536 } 537 538 qpair->num_cmds = 0; 539 qpair->num_intr_handler_calls = 0; 540 qpair->cmd = (struct nvme_command *)queuemem; 541 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); 542 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); 543 qpair->cmd_bus_addr = queuemem_phys; 544 qpair->cpl_bus_addr = queuemem_phys + cmdsz; 545 prpmem_phys = queuemem_phys + cmdsz + cplsz; 546 547 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl); 548 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl); 549 550 TAILQ_INIT(&qpair->free_tr); 551 TAILQ_INIT(&qpair->outstanding_tr); 552 STAILQ_INIT(&qpair->queued_req); 553 554 list_phys = prpmem_phys; 555 prp_list = prpmem; 556 for (i = 0; i < qpair->num_trackers; i++) { 557 558 if (list_phys + prpsz > prpmem_phys + prpmemsz) { 559 qpair->num_trackers = i; 560 break; 561 } 562 563 /* 564 * Make sure that the PRP list for this tracker doesn't 565 * overflow to another page. 
566 */ 567 if (trunc_page(list_phys) != 568 trunc_page(list_phys + prpsz - 1)) { 569 list_phys = roundup2(list_phys, PAGE_SIZE); 570 prp_list = 571 (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); 572 } 573 574 tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); 575 bus_dmamap_create(qpair->dma_tag_payload, 0, 576 &tr->payload_dma_map); 577 callout_init(&tr->timer, 1); 578 tr->cid = i; 579 tr->qpair = qpair; 580 tr->prp = (uint64_t *)prp_list; 581 tr->prp_bus_addr = list_phys; 582 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); 583 list_phys += prpsz; 584 prp_list += prpsz; 585 } 586 587 if (qpair->num_trackers == 0) { 588 nvme_printf(ctrlr, "failed to allocate enough trackers\n"); 589 goto out; 590 } 591 592 qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * 593 qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); 594 return (0); 595 596out: 597 nvme_qpair_destroy(qpair); 598 return (ENOMEM); 599} 600 601static void 602nvme_qpair_destroy(struct nvme_qpair *qpair) 603{ 604 struct nvme_tracker *tr; 605 606 if (qpair->tag) 607 bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); 608 609 if (mtx_initialized(&qpair->lock)) 610 mtx_destroy(&qpair->lock); 611 612 if (qpair->res) 613 bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, 614 rman_get_rid(qpair->res), qpair->res); 615 616 if (qpair->cmd != NULL) { 617 bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map); 618 bus_dmamem_free(qpair->dma_tag, qpair->cmd, 619 qpair->queuemem_map); 620 } 621 622 if (qpair->act_tr) 623 free(qpair->act_tr, M_NVME); 624 625 while (!TAILQ_EMPTY(&qpair->free_tr)) { 626 tr = TAILQ_FIRST(&qpair->free_tr); 627 TAILQ_REMOVE(&qpair->free_tr, tr, tailq); 628 bus_dmamap_destroy(qpair->dma_tag_payload, 629 tr->payload_dma_map); 630 free(tr, M_NVME); 631 } 632 633 if (qpair->dma_tag) 634 bus_dma_tag_destroy(qpair->dma_tag); 635 636 if (qpair->dma_tag_payload) 637 bus_dma_tag_destroy(qpair->dma_tag_payload); 638} 639 640static void 641nvme_admin_qpair_abort_aers(struct nvme_qpair 
*qpair) 642{ 643 struct nvme_tracker *tr; 644 645 tr = TAILQ_FIRST(&qpair->outstanding_tr); 646 while (tr != NULL) { 647 if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { 648 nvme_qpair_manual_complete_tracker(qpair, tr, 649 NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, 650 FALSE); 651 tr = TAILQ_FIRST(&qpair->outstanding_tr); 652 } else { 653 tr = TAILQ_NEXT(tr, tailq); 654 } 655 } 656} 657 658void 659nvme_admin_qpair_destroy(struct nvme_qpair *qpair) 660{ 661 662 nvme_admin_qpair_abort_aers(qpair); 663 nvme_qpair_destroy(qpair); 664} 665 666void 667nvme_io_qpair_destroy(struct nvme_qpair *qpair) 668{ 669 670 nvme_qpair_destroy(qpair); 671} 672 673static void 674nvme_abort_complete(void *arg, const struct nvme_completion *status) 675{ 676 struct nvme_tracker *tr = arg; 677 678 /* 679 * If cdw0 == 1, the controller was not able to abort the command 680 * we requested. We still need to check the active tracker array, 681 * to cover race where I/O timed out at same time controller was 682 * completing the I/O. 683 */ 684 if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { 685 /* 686 * An I/O has timed out, and the controller was unable to 687 * abort it for some reason. Construct a fake completion 688 * status, and then complete the I/O's tracker manually. 689 */ 690 nvme_printf(tr->qpair->ctrlr, 691 "abort command failed, aborting command manually\n"); 692 nvme_qpair_manual_complete_tracker(tr->qpair, tr, 693 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); 694 } 695} 696 697static void 698nvme_timeout(void *arg) 699{ 700 struct nvme_tracker *tr = arg; 701 struct nvme_qpair *qpair = tr->qpair; 702 struct nvme_controller *ctrlr = qpair->ctrlr; 703 union csts_register csts; 704 705 /* 706 * Read csts to get value of cfs - controller fatal status. 707 * If no fatal status, try to call the completion routine, and 708 * if completes transactions, report a missed interrupt and 709 * return (this may need to be rate limited). 
Otherwise, if 710 * aborts are enabled and the controller is not reporting 711 * fatal status, abort the command. Otherwise, just reset the 712 * controller and hope for the best. 713 */ 714 csts.raw = nvme_mmio_read_4(ctrlr, csts); 715 if (csts.bits.cfs == 0 && nvme_qpair_process_completions(qpair)) { 716 nvme_printf(ctrlr, "Missing interrupt\n"); 717 return; 718 } 719 if (ctrlr->enable_aborts && csts.bits.cfs == 0) { 720 nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); 721 nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, 722 nvme_abort_complete, tr); 723 } else { 724 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", 725 csts.bits.cfs ? " and fatal error status" : ""); 726 nvme_ctrlr_reset(ctrlr); 727 } 728} 729 730void 731nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) 732{ 733 struct nvme_request *req; 734 struct nvme_controller *ctrlr; 735 736 mtx_assert(&qpair->lock, MA_OWNED); 737 738 req = tr->req; 739 req->cmd.cid = tr->cid; 740 qpair->act_tr[tr->cid] = tr; 741 ctrlr = qpair->ctrlr; 742 743 if (req->timeout) 744#if __FreeBSD_version >= 800030 745 callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, 746 nvme_timeout, tr); 747#else 748 callout_reset(&tr->timer, ctrlr->timeout_period * hz, 749 nvme_timeout, tr); 750#endif 751 752 /* Copy the command from the tracker to the submission queue. */ 753 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); 754 755 if (++qpair->sq_tail == qpair->num_entries) 756 qpair->sq_tail = 0; 757 758 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, 759 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 760#ifndef __powerpc__ 761 /* 762 * powerpc's bus_dmamap_sync() already includes a heavyweight sync, but 763 * no other archs do. 
764 */ 765 wmb(); 766#endif 767 768 nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl, 769 qpair->sq_tail); 770 771 qpair->num_cmds++; 772} 773 774static void 775nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) 776{ 777 struct nvme_tracker *tr = arg; 778 uint32_t cur_nseg; 779 780 /* 781 * If the mapping operation failed, return immediately. The caller 782 * is responsible for detecting the error status and failing the 783 * tracker manually. 784 */ 785 if (error != 0) { 786 nvme_printf(tr->qpair->ctrlr, 787 "nvme_payload_map err %d\n", error); 788 return; 789 } 790 791 /* 792 * Note that we specified PAGE_SIZE for alignment and max 793 * segment size when creating the bus dma tags. So here 794 * we can safely just transfer each segment to its 795 * associated PRP entry. 796 */ 797 tr->req->cmd.prp1 = seg[0].ds_addr; 798 799 if (nseg == 2) { 800 tr->req->cmd.prp2 = seg[1].ds_addr; 801 } else if (nseg > 2) { 802 cur_nseg = 1; 803 tr->req->cmd.prp2 = (uint64_t)tr->prp_bus_addr; 804 while (cur_nseg < nseg) { 805 tr->prp[cur_nseg-1] = 806 (uint64_t)seg[cur_nseg].ds_addr; 807 cur_nseg++; 808 } 809 } else { 810 /* 811 * prp2 should not be used by the controller 812 * since there is only one segment, but set 813 * to 0 just to be safe. 814 */ 815 tr->req->cmd.prp2 = 0; 816 } 817 818 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map, 819 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 820 nvme_qpair_submit_tracker(tr->qpair, tr); 821} 822 823static void 824_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) 825{ 826 struct nvme_tracker *tr; 827 int err = 0; 828 829 mtx_assert(&qpair->lock, MA_OWNED); 830 831 tr = TAILQ_FIRST(&qpair->free_tr); 832 req->qpair = qpair; 833 834 if (tr == NULL || !qpair->is_enabled) { 835 /* 836 * No tracker is available, or the qpair is disabled due to 837 * an in-progress controller-level reset or controller 838 * failure. 
839 */ 840 841 if (qpair->ctrlr->is_failed) { 842 /* 843 * The controller has failed. Post the request to a 844 * task where it will be aborted, so that we do not 845 * invoke the request's callback in the context 846 * of the submission. 847 */ 848 nvme_ctrlr_post_failed_request(qpair->ctrlr, req); 849 } else { 850 /* 851 * Put the request on the qpair's request queue to be 852 * processed when a tracker frees up via a command 853 * completion or when the controller reset is 854 * completed. 855 */ 856 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); 857 } 858 return; 859 } 860 861 TAILQ_REMOVE(&qpair->free_tr, tr, tailq); 862 TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); 863 tr->req = req; 864 865 switch (req->type) { 866 case NVME_REQUEST_VADDR: 867 KASSERT(req->payload_size <= qpair->ctrlr->max_xfer_size, 868 ("payload_size (%d) exceeds max_xfer_size (%d)\n", 869 req->payload_size, qpair->ctrlr->max_xfer_size)); 870 err = bus_dmamap_load(tr->qpair->dma_tag_payload, 871 tr->payload_dma_map, req->u.payload, req->payload_size, 872 nvme_payload_map, tr, 0); 873 if (err != 0) 874 nvme_printf(qpair->ctrlr, 875 "bus_dmamap_load returned 0x%x!\n", err); 876 break; 877 case NVME_REQUEST_NULL: 878 nvme_qpair_submit_tracker(tr->qpair, tr); 879 break; 880#ifdef NVME_UNMAPPED_BIO_SUPPORT 881 case NVME_REQUEST_BIO: 882 KASSERT(req->u.bio->bio_bcount <= qpair->ctrlr->max_xfer_size, 883 ("bio->bio_bcount (%jd) exceeds max_xfer_size (%d)\n", 884 (intmax_t)req->u.bio->bio_bcount, 885 qpair->ctrlr->max_xfer_size)); 886 err = bus_dmamap_load_bio(tr->qpair->dma_tag_payload, 887 tr->payload_dma_map, req->u.bio, nvme_payload_map, tr, 0); 888 if (err != 0) 889 nvme_printf(qpair->ctrlr, 890 "bus_dmamap_load_bio returned 0x%x!\n", err); 891 break; 892#endif 893 case NVME_REQUEST_CCB: 894 err = bus_dmamap_load_ccb(tr->qpair->dma_tag_payload, 895 tr->payload_dma_map, req->u.payload, 896 nvme_payload_map, tr, 0); 897 if (err != 0) 898 nvme_printf(qpair->ctrlr, 899 
"bus_dmamap_load_ccb returned 0x%x!\n", err); 900 break; 901 default: 902 panic("unknown nvme request type 0x%x\n", req->type); 903 break; 904 } 905 906 if (err != 0) { 907 /* 908 * The dmamap operation failed, so we manually fail the 909 * tracker here with DATA_TRANSFER_ERROR status. 910 * 911 * nvme_qpair_manual_complete_tracker must not be called 912 * with the qpair lock held. 913 */ 914 mtx_unlock(&qpair->lock); 915 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 916 NVME_SC_DATA_TRANSFER_ERROR, 1 /* do not retry */, TRUE); 917 mtx_lock(&qpair->lock); 918 } 919} 920 921void 922nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) 923{ 924 925 mtx_lock(&qpair->lock); 926 _nvme_qpair_submit_request(qpair, req); 927 mtx_unlock(&qpair->lock); 928} 929 930static void 931nvme_qpair_enable(struct nvme_qpair *qpair) 932{ 933 934 qpair->is_enabled = TRUE; 935} 936 937void 938nvme_qpair_reset(struct nvme_qpair *qpair) 939{ 940 941 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0; 942 943 /* 944 * First time through the completion queue, HW will set phase 945 * bit on completions to 1. So set this to 1 here, indicating 946 * we're looking for a 1 to know which entries have completed. 947 * we'll toggle the bit each time when the completion queue 948 * rolls over. 949 */ 950 qpair->phase = 1; 951 952 memset(qpair->cmd, 0, 953 qpair->num_entries * sizeof(struct nvme_command)); 954 memset(qpair->cpl, 0, 955 qpair->num_entries * sizeof(struct nvme_completion)); 956} 957 958void 959nvme_admin_qpair_enable(struct nvme_qpair *qpair) 960{ 961 struct nvme_tracker *tr; 962 struct nvme_tracker *tr_temp; 963 964 /* 965 * Manually abort each outstanding admin command. Do not retry 966 * admin commands found here, since they will be left over from 967 * a controller reset and its likely the context in which the 968 * command was issued no longer applies. 
969 */ 970 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { 971 nvme_printf(qpair->ctrlr, 972 "aborting outstanding admin command\n"); 973 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 974 NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); 975 } 976 977 nvme_qpair_enable(qpair); 978} 979 980void 981nvme_io_qpair_enable(struct nvme_qpair *qpair) 982{ 983 STAILQ_HEAD(, nvme_request) temp; 984 struct nvme_tracker *tr; 985 struct nvme_tracker *tr_temp; 986 struct nvme_request *req; 987 988 /* 989 * Manually abort each outstanding I/O. This normally results in a 990 * retry, unless the retry count on the associated request has 991 * reached its limit. 992 */ 993 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { 994 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); 995 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 996 NVME_SC_ABORTED_BY_REQUEST, 0, TRUE); 997 } 998 999 mtx_lock(&qpair->lock); 1000 1001 nvme_qpair_enable(qpair); 1002 1003 STAILQ_INIT(&temp); 1004 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request); 1005 1006 while (!STAILQ_EMPTY(&temp)) { 1007 req = STAILQ_FIRST(&temp); 1008 STAILQ_REMOVE_HEAD(&temp, stailq); 1009 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n"); 1010 nvme_qpair_print_command(qpair, &req->cmd); 1011 _nvme_qpair_submit_request(qpair, req); 1012 } 1013 1014 mtx_unlock(&qpair->lock); 1015} 1016 1017static void 1018nvme_qpair_disable(struct nvme_qpair *qpair) 1019{ 1020 struct nvme_tracker *tr; 1021 1022 qpair->is_enabled = FALSE; 1023 mtx_lock(&qpair->lock); 1024 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) 1025 callout_stop(&tr->timer); 1026 mtx_unlock(&qpair->lock); 1027} 1028 1029void 1030nvme_admin_qpair_disable(struct nvme_qpair *qpair) 1031{ 1032 1033 nvme_qpair_disable(qpair); 1034 nvme_admin_qpair_abort_aers(qpair); 1035} 1036 1037void 1038nvme_io_qpair_disable(struct nvme_qpair *qpair) 1039{ 1040 1041 nvme_qpair_disable(qpair); 1042} 1043 
1044void 1045nvme_qpair_fail(struct nvme_qpair *qpair) 1046{ 1047 struct nvme_tracker *tr; 1048 struct nvme_request *req; 1049 1050 if (!mtx_initialized(&qpair->lock)) 1051 return; 1052 1053 mtx_lock(&qpair->lock); 1054 1055 while (!STAILQ_EMPTY(&qpair->queued_req)) { 1056 req = STAILQ_FIRST(&qpair->queued_req); 1057 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); 1058 nvme_printf(qpair->ctrlr, "failing queued i/o\n"); 1059 mtx_unlock(&qpair->lock); 1060 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, 1061 NVME_SC_ABORTED_BY_REQUEST, TRUE); 1062 mtx_lock(&qpair->lock); 1063 } 1064 1065 /* Manually abort each outstanding I/O. */ 1066 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { 1067 tr = TAILQ_FIRST(&qpair->outstanding_tr); 1068 /* 1069 * Do not remove the tracker. The abort_tracker path will 1070 * do that for us. 1071 */ 1072 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); 1073 mtx_unlock(&qpair->lock); 1074 nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, 1075 NVME_SC_ABORTED_BY_REQUEST, 1 /* do not retry */, TRUE); 1076 mtx_lock(&qpair->lock); 1077 } 1078 1079 mtx_unlock(&qpair->lock); 1080} 1081 1082