/*-
 * Copyright (C) 2012-2016 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25240616Sjimharris */ 26240616Sjimharris 27240616Sjimharris#include <sys/cdefs.h> 28240616Sjimharris__FBSDID("$FreeBSD$"); 29240616Sjimharris 30240616Sjimharris#include <sys/param.h> 31249421Sjimharris#include <sys/systm.h> 32249421Sjimharris#include <sys/buf.h> 33240616Sjimharris#include <sys/bus.h> 34240616Sjimharris#include <sys/conf.h> 35240616Sjimharris#include <sys/ioccom.h> 36249421Sjimharris#include <sys/proc.h> 37240616Sjimharris#include <sys/smp.h> 38249421Sjimharris#include <sys/uio.h> 39240616Sjimharris 40240616Sjimharris#include <dev/pci/pcireg.h> 41240616Sjimharris#include <dev/pci/pcivar.h> 42240616Sjimharris 43240616Sjimharris#include "nvme_private.h" 44240616Sjimharris 45248737Sjimharrisstatic void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, 46248737Sjimharris struct nvme_async_event_request *aer); 47293670Sjimharrisstatic void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr); 48248737Sjimharris 49240616Sjimharrisstatic int 50240616Sjimharrisnvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr) 51240616Sjimharris{ 52240616Sjimharris 53282926Sjimharris ctrlr->resource_id = PCIR_BAR(0); 54240616Sjimharris 55240616Sjimharris ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY, 56240616Sjimharris &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE); 57240616Sjimharris 58240616Sjimharris if(ctrlr->resource == NULL) { 59248773Sjimharris nvme_printf(ctrlr, "unable to allocate pci resource\n"); 60240616Sjimharris return (ENOMEM); 61240616Sjimharris } 62240616Sjimharris 63240616Sjimharris ctrlr->bus_tag = rman_get_bustag(ctrlr->resource); 64240616Sjimharris ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource); 65240616Sjimharris ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle; 66240616Sjimharris 67244413Sjimharris /* 68244413Sjimharris * The NVMe spec allows for the MSI-X table to be placed behind 69244413Sjimharris * BAR 4/5, separate from the control/doorbell registers. 
Always 70244413Sjimharris * try to map this bar, because it must be mapped prior to calling 71244413Sjimharris * pci_alloc_msix(). If the table isn't behind BAR 4/5, 72244413Sjimharris * bus_alloc_resource() will just return NULL which is OK. 73244413Sjimharris */ 74244413Sjimharris ctrlr->bar4_resource_id = PCIR_BAR(4); 75244413Sjimharris ctrlr->bar4_resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY, 76244413Sjimharris &ctrlr->bar4_resource_id, 0, ~0, 1, RF_ACTIVE); 77244413Sjimharris 78240616Sjimharris return (0); 79240616Sjimharris} 80240616Sjimharris 81240616Sjimharrisstatic void 82240616Sjimharrisnvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) 83240616Sjimharris{ 84240616Sjimharris struct nvme_qpair *qpair; 85240616Sjimharris uint32_t num_entries; 86240616Sjimharris 87240616Sjimharris qpair = &ctrlr->adminq; 88240616Sjimharris 89240616Sjimharris num_entries = NVME_ADMIN_ENTRIES; 90240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); 91240616Sjimharris /* 92240616Sjimharris * If admin_entries was overridden to an invalid value, revert it 93240616Sjimharris * back to our default value. 94240616Sjimharris */ 95240616Sjimharris if (num_entries < NVME_MIN_ADMIN_ENTRIES || 96240616Sjimharris num_entries > NVME_MAX_ADMIN_ENTRIES) { 97248773Sjimharris nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d " 98248773Sjimharris "specified\n", num_entries); 99240616Sjimharris num_entries = NVME_ADMIN_ENTRIES; 100240616Sjimharris } 101240616Sjimharris 102240616Sjimharris /* 103240616Sjimharris * The admin queue's max xfer size is treated differently than the 104240616Sjimharris * max I/O xfer size. 16KB is sufficient here - maybe even less? 
105240616Sjimharris */ 106241664Sjimharris nvme_qpair_construct(qpair, 107241664Sjimharris 0, /* qpair ID */ 108241664Sjimharris 0, /* vector */ 109241664Sjimharris num_entries, 110241664Sjimharris NVME_ADMIN_TRACKERS, 111241664Sjimharris ctrlr); 112240616Sjimharris} 113240616Sjimharris 114240616Sjimharrisstatic int 115240616Sjimharrisnvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) 116240616Sjimharris{ 117240616Sjimharris struct nvme_qpair *qpair; 118240616Sjimharris union cap_lo_register cap_lo; 119241664Sjimharris int i, num_entries, num_trackers; 120240616Sjimharris 121240616Sjimharris num_entries = NVME_IO_ENTRIES; 122240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); 123240616Sjimharris 124240616Sjimharris /* 125240616Sjimharris * NVMe spec sets a hard limit of 64K max entries, but 126240616Sjimharris * devices may specify a smaller limit, so we need to check 127240616Sjimharris * the MQES field in the capabilities register. 128240616Sjimharris */ 129240616Sjimharris cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo); 130240616Sjimharris num_entries = min(num_entries, cap_lo.bits.mqes+1); 131240616Sjimharris 132241664Sjimharris num_trackers = NVME_IO_TRACKERS; 133241664Sjimharris TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); 134241664Sjimharris 135241664Sjimharris num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); 136241664Sjimharris num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); 137241664Sjimharris /* 138241664Sjimharris * No need to have more trackers than entries in the submit queue. 139241664Sjimharris * Note also that for a queue size of N, we can only have (N-1) 140241664Sjimharris * commands outstanding, hence the "-1" here. 
141241664Sjimharris */ 142241664Sjimharris num_trackers = min(num_trackers, (num_entries-1)); 143241664Sjimharris 144293671Sjimharris /* 145293671Sjimharris * This was calculated previously when setting up interrupts, but 146293671Sjimharris * a controller could theoretically support fewer I/O queues than 147293671Sjimharris * MSI-X vectors. So calculate again here just to be safe. 148293671Sjimharris */ 149293673Sjimharris ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues); 150293671Sjimharris 151240616Sjimharris ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), 152248770Sjimharris M_NVME, M_ZERO | M_WAITOK); 153240616Sjimharris 154240616Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) { 155240616Sjimharris qpair = &ctrlr->ioq[i]; 156240616Sjimharris 157240616Sjimharris /* 158240616Sjimharris * Admin queue has ID=0. IO queues start at ID=1 - 159240616Sjimharris * hence the 'i+1' here. 160240616Sjimharris * 161240616Sjimharris * For I/O queues, use the controller-wide max_xfer_size 162240616Sjimharris * calculated in nvme_attach(). 163240616Sjimharris */ 164240616Sjimharris nvme_qpair_construct(qpair, 165240616Sjimharris i+1, /* qpair ID */ 166240616Sjimharris ctrlr->msix_enabled ? i+1 : 0, /* vector */ 167240616Sjimharris num_entries, 168241664Sjimharris num_trackers, 169240616Sjimharris ctrlr); 170240616Sjimharris 171293671Sjimharris /* 172293671Sjimharris * Do not bother binding interrupts if we only have one I/O 173293671Sjimharris * interrupt thread for this controller. 
174293671Sjimharris */ 175293668Sjimharris if (ctrlr->num_io_queues > 1) 176293671Sjimharris bus_bind_intr(ctrlr->dev, qpair->res, 177293671Sjimharris i * ctrlr->num_cpus_per_ioq); 178240616Sjimharris } 179240616Sjimharris 180240616Sjimharris return (0); 181240616Sjimharris} 182240616Sjimharris 183248767Sjimharrisstatic void 184248767Sjimharrisnvme_ctrlr_fail(struct nvme_controller *ctrlr) 185248767Sjimharris{ 186248767Sjimharris int i; 187248767Sjimharris 188248767Sjimharris ctrlr->is_failed = TRUE; 189248767Sjimharris nvme_qpair_fail(&ctrlr->adminq); 190248767Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) 191248767Sjimharris nvme_qpair_fail(&ctrlr->ioq[i]); 192248767Sjimharris nvme_notify_fail_consumers(ctrlr); 193248767Sjimharris} 194248767Sjimharris 195248767Sjimharrisvoid 196248767Sjimharrisnvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, 197248767Sjimharris struct nvme_request *req) 198248767Sjimharris{ 199248767Sjimharris 200249417Sjimharris mtx_lock(&ctrlr->lock); 201248767Sjimharris STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq); 202249417Sjimharris mtx_unlock(&ctrlr->lock); 203248767Sjimharris taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); 204248767Sjimharris} 205248767Sjimharris 206248767Sjimharrisstatic void 207248767Sjimharrisnvme_ctrlr_fail_req_task(void *arg, int pending) 208248767Sjimharris{ 209248767Sjimharris struct nvme_controller *ctrlr = arg; 210248767Sjimharris struct nvme_request *req; 211248767Sjimharris 212249417Sjimharris mtx_lock(&ctrlr->lock); 213248767Sjimharris while (!STAILQ_EMPTY(&ctrlr->fail_req)) { 214248767Sjimharris req = STAILQ_FIRST(&ctrlr->fail_req); 215248767Sjimharris STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq); 216248767Sjimharris nvme_qpair_manual_complete_request(req->qpair, req, 217248767Sjimharris NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, TRUE); 218248767Sjimharris } 219249417Sjimharris mtx_unlock(&ctrlr->lock); 220248767Sjimharris} 221248767Sjimharris 222240616Sjimharrisstatic 
int 223285918Sjimharrisnvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val) 224240616Sjimharris{ 225240616Sjimharris int ms_waited; 226240616Sjimharris union cc_register cc; 227240616Sjimharris union csts_register csts; 228240616Sjimharris 229240616Sjimharris cc.raw = nvme_mmio_read_4(ctrlr, cc); 230240616Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 231240616Sjimharris 232285918Sjimharris if (cc.bits.en != desired_val) { 233285918Sjimharris nvme_printf(ctrlr, "%s called with desired_val = %d " 234285918Sjimharris "but cc.en = %d\n", __func__, desired_val, cc.bits.en); 235240616Sjimharris return (ENXIO); 236240616Sjimharris } 237240616Sjimharris 238240616Sjimharris ms_waited = 0; 239240616Sjimharris 240285918Sjimharris while (csts.bits.rdy != desired_val) { 241240616Sjimharris DELAY(1000); 242240616Sjimharris if (ms_waited++ > ctrlr->ready_timeout_in_ms) { 243285918Sjimharris nvme_printf(ctrlr, "controller ready did not become %d " 244285918Sjimharris "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms); 245240616Sjimharris return (ENXIO); 246240616Sjimharris } 247240616Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 248240616Sjimharris } 249240616Sjimharris 250240616Sjimharris return (0); 251240616Sjimharris} 252240616Sjimharris 253240616Sjimharrisstatic void 254240616Sjimharrisnvme_ctrlr_disable(struct nvme_controller *ctrlr) 255240616Sjimharris{ 256240616Sjimharris union cc_register cc; 257240616Sjimharris union csts_register csts; 258240616Sjimharris 259240616Sjimharris cc.raw = nvme_mmio_read_4(ctrlr, cc); 260240616Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 261240616Sjimharris 262240616Sjimharris if (cc.bits.en == 1 && csts.bits.rdy == 0) 263285918Sjimharris nvme_ctrlr_wait_for_ready(ctrlr, 1); 264240616Sjimharris 265240616Sjimharris cc.bits.en = 0; 266240616Sjimharris nvme_mmio_write_4(ctrlr, cc, cc.raw); 267240616Sjimharris DELAY(5000); 268285918Sjimharris nvme_ctrlr_wait_for_ready(ctrlr, 0); 
269240616Sjimharris} 270240616Sjimharris 271240616Sjimharrisstatic int 272240616Sjimharrisnvme_ctrlr_enable(struct nvme_controller *ctrlr) 273240616Sjimharris{ 274240616Sjimharris union cc_register cc; 275240616Sjimharris union csts_register csts; 276240616Sjimharris union aqa_register aqa; 277240616Sjimharris 278240616Sjimharris cc.raw = nvme_mmio_read_4(ctrlr, cc); 279240616Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 280240616Sjimharris 281240616Sjimharris if (cc.bits.en == 1) { 282240616Sjimharris if (csts.bits.rdy == 1) 283240616Sjimharris return (0); 284240616Sjimharris else 285285918Sjimharris return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); 286240616Sjimharris } 287240616Sjimharris 288240616Sjimharris nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr); 289240616Sjimharris DELAY(5000); 290240616Sjimharris nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr); 291240616Sjimharris DELAY(5000); 292240616Sjimharris 293240616Sjimharris aqa.raw = 0; 294240616Sjimharris /* acqs and asqs are 0-based. */ 295240616Sjimharris aqa.bits.acqs = ctrlr->adminq.num_entries-1; 296240616Sjimharris aqa.bits.asqs = ctrlr->adminq.num_entries-1; 297240616Sjimharris nvme_mmio_write_4(ctrlr, aqa, aqa.raw); 298240616Sjimharris DELAY(5000); 299240616Sjimharris 300240616Sjimharris cc.bits.en = 1; 301240616Sjimharris cc.bits.css = 0; 302240616Sjimharris cc.bits.ams = 0; 303240616Sjimharris cc.bits.shn = 0; 304240616Sjimharris cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ 305240616Sjimharris cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ 306240616Sjimharris 307240616Sjimharris /* This evaluates to 0, which is according to spec. 
*/ 308240616Sjimharris cc.bits.mps = (PAGE_SIZE >> 13); 309240616Sjimharris 310240616Sjimharris nvme_mmio_write_4(ctrlr, cc, cc.raw); 311240616Sjimharris DELAY(5000); 312240616Sjimharris 313285918Sjimharris return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); 314240616Sjimharris} 315240616Sjimharris 316240616Sjimharrisint 317248746Sjimharrisnvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) 318240616Sjimharris{ 319248746Sjimharris int i; 320240616Sjimharris 321248746Sjimharris nvme_admin_qpair_disable(&ctrlr->adminq); 322293671Sjimharris /* 323293671Sjimharris * I/O queues are not allocated before the initial HW 324293671Sjimharris * reset, so do not try to disable them. Use is_initialized 325293671Sjimharris * to determine if this is the initial HW reset. 326293671Sjimharris */ 327293671Sjimharris if (ctrlr->is_initialized) { 328293671Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) 329293671Sjimharris nvme_io_qpair_disable(&ctrlr->ioq[i]); 330293671Sjimharris } 331248746Sjimharris 332248746Sjimharris DELAY(100*1000); 333248746Sjimharris 334240616Sjimharris nvme_ctrlr_disable(ctrlr); 335240616Sjimharris return (nvme_ctrlr_enable(ctrlr)); 336240616Sjimharris} 337240616Sjimharris 338248746Sjimharrisvoid 339248746Sjimharrisnvme_ctrlr_reset(struct nvme_controller *ctrlr) 340248746Sjimharris{ 341248755Sjimharris int cmpset; 342248746Sjimharris 343248755Sjimharris cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); 344248755Sjimharris 345248767Sjimharris if (cmpset == 0 || ctrlr->is_failed) 346248767Sjimharris /* 347248767Sjimharris * Controller is already resetting or has failed. Return 348248767Sjimharris * immediately since there is no need to kick off another 349248767Sjimharris * reset in these cases. 
350248767Sjimharris */ 351248755Sjimharris return; 352248755Sjimharris 353248754Sjimharris taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); 354248746Sjimharris} 355248746Sjimharris 356240616Sjimharrisstatic int 357240616Sjimharrisnvme_ctrlr_identify(struct nvme_controller *ctrlr) 358240616Sjimharris{ 359248769Sjimharris struct nvme_completion_poll_status status; 360240616Sjimharris 361248769Sjimharris status.done = FALSE; 362240616Sjimharris nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata, 363248769Sjimharris nvme_completion_poll_cb, &status); 364248769Sjimharris while (status.done == FALSE) 365253438Sjimharris pause("nvme", 1); 366248769Sjimharris if (nvme_completion_is_error(&status.cpl)) { 367248773Sjimharris nvme_printf(ctrlr, "nvme_identify_controller failed!\n"); 368240616Sjimharris return (ENXIO); 369240616Sjimharris } 370240616Sjimharris 371248762Sjimharris /* 372248762Sjimharris * Use MDTS to ensure our default max_xfer_size doesn't exceed what the 373248762Sjimharris * controller supports. 
374248762Sjimharris */ 375248762Sjimharris if (ctrlr->cdata.mdts > 0) 376248762Sjimharris ctrlr->max_xfer_size = min(ctrlr->max_xfer_size, 377248762Sjimharris ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); 378248762Sjimharris 379240616Sjimharris return (0); 380240616Sjimharris} 381240616Sjimharris 382240616Sjimharrisstatic int 383240616Sjimharrisnvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) 384240616Sjimharris{ 385248769Sjimharris struct nvme_completion_poll_status status; 386293671Sjimharris int cq_allocated, sq_allocated; 387240616Sjimharris 388248769Sjimharris status.done = FALSE; 389240616Sjimharris nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, 390248769Sjimharris nvme_completion_poll_cb, &status); 391248769Sjimharris while (status.done == FALSE) 392253438Sjimharris pause("nvme", 1); 393248769Sjimharris if (nvme_completion_is_error(&status.cpl)) { 394248773Sjimharris nvme_printf(ctrlr, "nvme_set_num_queues failed!\n"); 395240616Sjimharris return (ENXIO); 396240616Sjimharris } 397240616Sjimharris 398240616Sjimharris /* 399240616Sjimharris * Data in cdw0 is 0-based. 400240616Sjimharris * Lower 16-bits indicate number of submission queues allocated. 401240616Sjimharris * Upper 16-bits indicate number of completion queues allocated. 402240616Sjimharris */ 403248769Sjimharris sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1; 404248769Sjimharris cq_allocated = (status.cpl.cdw0 >> 16) + 1; 405240616Sjimharris 406240616Sjimharris /* 407293671Sjimharris * Controller may allocate more queues than we requested, 408293671Sjimharris * so use the minimum of the number requested and what was 409293671Sjimharris * actually allocated. 
410240616Sjimharris */ 411293671Sjimharris ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); 412293671Sjimharris ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); 413248834Sjimharris 414240616Sjimharris return (0); 415240616Sjimharris} 416240616Sjimharris 417240616Sjimharrisstatic int 418240616Sjimharrisnvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) 419240616Sjimharris{ 420248769Sjimharris struct nvme_completion_poll_status status; 421248769Sjimharris struct nvme_qpair *qpair; 422248769Sjimharris int i; 423240616Sjimharris 424240616Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) { 425240616Sjimharris qpair = &ctrlr->ioq[i]; 426240616Sjimharris 427248769Sjimharris status.done = FALSE; 428240616Sjimharris nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector, 429248769Sjimharris nvme_completion_poll_cb, &status); 430248769Sjimharris while (status.done == FALSE) 431253438Sjimharris pause("nvme", 1); 432248769Sjimharris if (nvme_completion_is_error(&status.cpl)) { 433248773Sjimharris nvme_printf(ctrlr, "nvme_create_io_cq failed!\n"); 434240616Sjimharris return (ENXIO); 435240616Sjimharris } 436240616Sjimharris 437248769Sjimharris status.done = FALSE; 438240616Sjimharris nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, 439248769Sjimharris nvme_completion_poll_cb, &status); 440248769Sjimharris while (status.done == FALSE) 441253438Sjimharris pause("nvme", 1); 442248769Sjimharris if (nvme_completion_is_error(&status.cpl)) { 443248773Sjimharris nvme_printf(ctrlr, "nvme_create_io_sq failed!\n"); 444240616Sjimharris return (ENXIO); 445240616Sjimharris } 446240616Sjimharris } 447240616Sjimharris 448240616Sjimharris return (0); 449240616Sjimharris} 450240616Sjimharris 451240616Sjimharrisstatic int 452240616Sjimharrisnvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) 453240616Sjimharris{ 454240616Sjimharris struct nvme_namespace *ns; 455240616Sjimharris int i, status; 456240616Sjimharris 457240616Sjimharris for (i = 0; i 
< ctrlr->cdata.nn; i++) { 458240616Sjimharris ns = &ctrlr->ns[i]; 459240616Sjimharris status = nvme_ns_construct(ns, i+1, ctrlr); 460240616Sjimharris if (status != 0) 461240616Sjimharris return (status); 462240616Sjimharris } 463240616Sjimharris 464240616Sjimharris return (0); 465240616Sjimharris} 466240616Sjimharris 467248759Sjimharrisstatic boolean_t 468248759Sjimharrisis_log_page_id_valid(uint8_t page_id) 469248759Sjimharris{ 470248759Sjimharris 471248759Sjimharris switch (page_id) { 472248759Sjimharris case NVME_LOG_ERROR: 473248759Sjimharris case NVME_LOG_HEALTH_INFORMATION: 474248759Sjimharris case NVME_LOG_FIRMWARE_SLOT: 475248759Sjimharris return (TRUE); 476248759Sjimharris } 477248759Sjimharris 478248759Sjimharris return (FALSE); 479248759Sjimharris} 480248759Sjimharris 481248759Sjimharrisstatic uint32_t 482248759Sjimharrisnvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id) 483248759Sjimharris{ 484248759Sjimharris uint32_t log_page_size; 485248759Sjimharris 486248759Sjimharris switch (page_id) { 487248759Sjimharris case NVME_LOG_ERROR: 488248759Sjimharris log_page_size = min( 489248759Sjimharris sizeof(struct nvme_error_information_entry) * 490248759Sjimharris ctrlr->cdata.elpe, 491248759Sjimharris NVME_MAX_AER_LOG_SIZE); 492248759Sjimharris break; 493248759Sjimharris case NVME_LOG_HEALTH_INFORMATION: 494248759Sjimharris log_page_size = sizeof(struct nvme_health_information_page); 495248759Sjimharris break; 496248759Sjimharris case NVME_LOG_FIRMWARE_SLOT: 497248759Sjimharris log_page_size = sizeof(struct nvme_firmware_page); 498248759Sjimharris break; 499248759Sjimharris default: 500248759Sjimharris log_page_size = 0; 501248759Sjimharris break; 502248759Sjimharris } 503248759Sjimharris 504248759Sjimharris return (log_page_size); 505248759Sjimharris} 506248759Sjimharris 507240616Sjimharrisstatic void 508256154Sjimharrisnvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, 509256154Sjimharris union 
nvme_critical_warning_state state) 510256154Sjimharris{ 511256154Sjimharris 512256154Sjimharris if (state.bits.available_spare == 1) 513256154Sjimharris nvme_printf(ctrlr, "available spare space below threshold\n"); 514256154Sjimharris 515256154Sjimharris if (state.bits.temperature == 1) 516256154Sjimharris nvme_printf(ctrlr, "temperature above threshold\n"); 517256154Sjimharris 518256154Sjimharris if (state.bits.device_reliability == 1) 519256154Sjimharris nvme_printf(ctrlr, "device reliability degraded\n"); 520256154Sjimharris 521256154Sjimharris if (state.bits.read_only == 1) 522256154Sjimharris nvme_printf(ctrlr, "media placed in read only mode\n"); 523256154Sjimharris 524256154Sjimharris if (state.bits.volatile_memory_backup == 1) 525256154Sjimharris nvme_printf(ctrlr, "volatile memory backup device failed\n"); 526256154Sjimharris 527256154Sjimharris if (state.bits.reserved != 0) 528256154Sjimharris nvme_printf(ctrlr, 529256154Sjimharris "unknown critical warning(s): state = 0x%02x\n", state.raw); 530256154Sjimharris} 531256154Sjimharris 532256154Sjimharrisstatic void 533248759Sjimharrisnvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) 534248759Sjimharris{ 535256154Sjimharris struct nvme_async_event_request *aer = arg; 536256154Sjimharris struct nvme_health_information_page *health_info; 537248759Sjimharris 538248760Sjimharris /* 539248760Sjimharris * If the log page fetch for some reason completed with an error, 540248760Sjimharris * don't pass log page data to the consumers. In practice, this case 541248760Sjimharris * should never happen. 
542248760Sjimharris */ 543248760Sjimharris if (nvme_completion_is_error(cpl)) 544248760Sjimharris nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, 545248760Sjimharris aer->log_page_id, NULL, 0); 546256154Sjimharris else { 547256154Sjimharris if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { 548256154Sjimharris health_info = (struct nvme_health_information_page *) 549256154Sjimharris aer->log_page_buffer; 550256154Sjimharris nvme_ctrlr_log_critical_warnings(aer->ctrlr, 551256154Sjimharris health_info->critical_warning); 552256154Sjimharris /* 553256154Sjimharris * Critical warnings reported through the 554256154Sjimharris * SMART/health log page are persistent, so 555256154Sjimharris * clear the associated bits in the async event 556256154Sjimharris * config so that we do not receive repeated 557256154Sjimharris * notifications for the same event. 558256154Sjimharris */ 559256154Sjimharris aer->ctrlr->async_event_config.raw &= 560256154Sjimharris ~health_info->critical_warning.raw; 561256154Sjimharris nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, 562256154Sjimharris aer->ctrlr->async_event_config, NULL, NULL); 563256154Sjimharris } 564256154Sjimharris 565256154Sjimharris 566248760Sjimharris /* 567248760Sjimharris * Pass the cpl data from the original async event completion, 568248760Sjimharris * not the log page fetch. 569248760Sjimharris */ 570248760Sjimharris nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, 571248760Sjimharris aer->log_page_id, aer->log_page_buffer, aer->log_page_size); 572256154Sjimharris } 573248759Sjimharris 574248759Sjimharris /* 575248759Sjimharris * Repost another asynchronous event request to replace the one 576248759Sjimharris * that just completed. 
577248759Sjimharris */ 578248759Sjimharris nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); 579248759Sjimharris} 580248759Sjimharris 581248759Sjimharrisstatic void 582248737Sjimharrisnvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) 583248737Sjimharris{ 584248759Sjimharris struct nvme_async_event_request *aer = arg; 585248737Sjimharris 586253108Sjimharris if (nvme_completion_is_error(cpl)) { 587248737Sjimharris /* 588253108Sjimharris * Do not retry failed async event requests. This avoids 589253108Sjimharris * infinite loops where a new async event request is submitted 590253108Sjimharris * to replace the one just failed, only to fail again and 591253108Sjimharris * perpetuate the loop. 592248737Sjimharris */ 593248737Sjimharris return; 594248737Sjimharris } 595248737Sjimharris 596248759Sjimharris /* Associated log page is in bits 23:16 of completion entry dw0. */ 597248760Sjimharris aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 598248737Sjimharris 599248773Sjimharris nvme_printf(aer->ctrlr, "async event occurred (log page id=0x%x)\n", 600248773Sjimharris aer->log_page_id); 601248773Sjimharris 602248760Sjimharris if (is_log_page_id_valid(aer->log_page_id)) { 603248759Sjimharris aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, 604248760Sjimharris aer->log_page_id); 605248759Sjimharris memcpy(&aer->cpl, cpl, sizeof(*cpl)); 606248760Sjimharris nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, 607248759Sjimharris NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, 608248759Sjimharris aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, 609248759Sjimharris aer); 610248759Sjimharris /* Wait to notify consumers until after log page is fetched. 
*/ 611248759Sjimharris } else { 612248760Sjimharris nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, 613248760Sjimharris NULL, 0); 614248759Sjimharris 615248759Sjimharris /* 616248759Sjimharris * Repost another asynchronous event request to replace the one 617248759Sjimharris * that just completed. 618248759Sjimharris */ 619248759Sjimharris nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); 620248759Sjimharris } 621248737Sjimharris} 622248737Sjimharris 623248737Sjimharrisstatic void 624248737Sjimharrisnvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, 625248737Sjimharris struct nvme_async_event_request *aer) 626248737Sjimharris{ 627248737Sjimharris struct nvme_request *req; 628248737Sjimharris 629248737Sjimharris aer->ctrlr = ctrlr; 630248913Sjimharris req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); 631248737Sjimharris aer->req = req; 632248737Sjimharris 633248737Sjimharris /* 634248749Sjimharris * Disable timeout here, since asynchronous event requests should by 635248749Sjimharris * nature never be timed out. 
636248737Sjimharris */ 637248749Sjimharris req->timeout = FALSE; 638248737Sjimharris req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; 639248737Sjimharris nvme_ctrlr_submit_admin_request(ctrlr, req); 640248737Sjimharris} 641248737Sjimharris 642248737Sjimharrisstatic void 643240616Sjimharrisnvme_ctrlr_configure_aer(struct nvme_controller *ctrlr) 644240616Sjimharris{ 645256153Sjimharris struct nvme_completion_poll_status status; 646248737Sjimharris struct nvme_async_event_request *aer; 647248737Sjimharris uint32_t i; 648240616Sjimharris 649256154Sjimharris ctrlr->async_event_config.raw = 0xFF; 650256154Sjimharris ctrlr->async_event_config.bits.reserved = 0; 651256153Sjimharris 652256153Sjimharris status.done = FALSE; 653256153Sjimharris nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD, 654256153Sjimharris 0, NULL, 0, nvme_completion_poll_cb, &status); 655256153Sjimharris while (status.done == FALSE) 656256153Sjimharris pause("nvme", 1); 657256153Sjimharris if (nvme_completion_is_error(&status.cpl) || 658256153Sjimharris (status.cpl.cdw0 & 0xFFFF) == 0xFFFF || 659256153Sjimharris (status.cpl.cdw0 & 0xFFFF) == 0x0000) { 660256153Sjimharris nvme_printf(ctrlr, "temperature threshold not supported\n"); 661256154Sjimharris ctrlr->async_event_config.bits.temperature = 0; 662256153Sjimharris } 663256153Sjimharris 664256154Sjimharris nvme_ctrlr_cmd_set_async_event_config(ctrlr, 665256154Sjimharris ctrlr->async_event_config, NULL, NULL); 666240616Sjimharris 667240616Sjimharris /* aerl is a zero-based value, so we need to add 1 here. 
*/ 668248737Sjimharris ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1)); 669240616Sjimharris 670248737Sjimharris for (i = 0; i < ctrlr->num_aers; i++) { 671248737Sjimharris aer = &ctrlr->aer[i]; 672248737Sjimharris nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); 673248737Sjimharris } 674240616Sjimharris} 675240616Sjimharris 676240616Sjimharrisstatic void 677240616Sjimharrisnvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) 678240616Sjimharris{ 679240616Sjimharris 680240616Sjimharris ctrlr->int_coal_time = 0; 681240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.int_coal_time", 682240616Sjimharris &ctrlr->int_coal_time); 683240616Sjimharris 684240616Sjimharris ctrlr->int_coal_threshold = 0; 685240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold", 686240616Sjimharris &ctrlr->int_coal_threshold); 687240616Sjimharris 688240616Sjimharris nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time, 689240616Sjimharris ctrlr->int_coal_threshold, NULL, NULL); 690240616Sjimharris} 691240616Sjimharris 692248763Sjimharrisstatic void 693240616Sjimharrisnvme_ctrlr_start(void *ctrlr_arg) 694240616Sjimharris{ 695240616Sjimharris struct nvme_controller *ctrlr = ctrlr_arg; 696293671Sjimharris uint32_t old_num_io_queues; 697248746Sjimharris int i; 698240616Sjimharris 699293671Sjimharris /* 700293671Sjimharris * Only reset adminq here when we are restarting the 701293671Sjimharris * controller after a reset. During initialization, 702293671Sjimharris * we have already submitted admin commands to get 703293671Sjimharris * the number of I/O queues supported, so cannot reset 704293671Sjimharris * the adminq again here. 
705293671Sjimharris */ 706293671Sjimharris if (ctrlr->is_resetting) { 707293671Sjimharris nvme_qpair_reset(&ctrlr->adminq); 708293671Sjimharris } 709293671Sjimharris 710248761Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) 711248761Sjimharris nvme_qpair_reset(&ctrlr->ioq[i]); 712248761Sjimharris 713248746Sjimharris nvme_admin_qpair_enable(&ctrlr->adminq); 714248746Sjimharris 715248767Sjimharris if (nvme_ctrlr_identify(ctrlr) != 0) { 716248767Sjimharris nvme_ctrlr_fail(ctrlr); 717248763Sjimharris return; 718248767Sjimharris } 719240616Sjimharris 720293671Sjimharris /* 721293671Sjimharris * The number of qpairs are determined during controller initialization, 722293671Sjimharris * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the 723293671Sjimharris * HW limit. We call SET_FEATURES again here so that it gets called 724293671Sjimharris * after any reset for controllers that depend on the driver to 725293671Sjimharris * explicit specify how many queues it will use. This value should 726293671Sjimharris * never change between resets, so panic if somehow that does happen. 
727293671Sjimharris */ 728295704Sjimharris if (ctrlr->is_resetting) { 729295704Sjimharris old_num_io_queues = ctrlr->num_io_queues; 730295704Sjimharris if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { 731295704Sjimharris nvme_ctrlr_fail(ctrlr); 732295704Sjimharris return; 733295704Sjimharris } 734240616Sjimharris 735295704Sjimharris if (old_num_io_queues != ctrlr->num_io_queues) { 736295704Sjimharris panic("num_io_queues changed from %u to %u", 737295704Sjimharris old_num_io_queues, ctrlr->num_io_queues); 738295704Sjimharris } 739293671Sjimharris } 740293671Sjimharris 741248767Sjimharris if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { 742248767Sjimharris nvme_ctrlr_fail(ctrlr); 743248763Sjimharris return; 744248767Sjimharris } 745240616Sjimharris 746248767Sjimharris if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { 747248767Sjimharris nvme_ctrlr_fail(ctrlr); 748248763Sjimharris return; 749248767Sjimharris } 750240616Sjimharris 751240616Sjimharris nvme_ctrlr_configure_aer(ctrlr); 752240616Sjimharris nvme_ctrlr_configure_int_coalescing(ctrlr); 753240616Sjimharris 754248746Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) 755248746Sjimharris nvme_io_qpair_enable(&ctrlr->ioq[i]); 756248763Sjimharris} 757248746Sjimharris 758248763Sjimharrisvoid 759248763Sjimharrisnvme_ctrlr_start_config_hook(void *arg) 760248763Sjimharris{ 761248763Sjimharris struct nvme_controller *ctrlr = arg; 762240616Sjimharris 763293671Sjimharris nvme_qpair_reset(&ctrlr->adminq); 764293671Sjimharris nvme_admin_qpair_enable(&ctrlr->adminq); 765293671Sjimharris 766293671Sjimharris if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && 767293671Sjimharris nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) 768293671Sjimharris nvme_ctrlr_start(ctrlr); 769293671Sjimharris else 770293671Sjimharris nvme_ctrlr_fail(ctrlr); 771293671Sjimharris 772293671Sjimharris nvme_sysctl_initialize_ctrlr(ctrlr); 773248763Sjimharris config_intrhook_disestablish(&ctrlr->config_hook); 774265576Sjimharris 775265576Sjimharris 
ctrlr->is_initialized = 1; 776265576Sjimharris nvme_notify_new_controller(ctrlr); 777240616Sjimharris} 778240616Sjimharris 779240616Sjimharrisstatic void 780248754Sjimharrisnvme_ctrlr_reset_task(void *arg, int pending) 781248748Sjimharris{ 782248754Sjimharris struct nvme_controller *ctrlr = arg; 783248754Sjimharris int status; 784248748Sjimharris 785248773Sjimharris nvme_printf(ctrlr, "resetting controller\n"); 786248754Sjimharris status = nvme_ctrlr_hw_reset(ctrlr); 787248754Sjimharris /* 788248754Sjimharris * Use pause instead of DELAY, so that we yield to any nvme interrupt 789248754Sjimharris * handlers on this CPU that were blocked on a qpair lock. We want 790248754Sjimharris * all nvme interrupts completed before proceeding with restarting the 791248754Sjimharris * controller. 792248754Sjimharris * 793248754Sjimharris * XXX - any way to guarantee the interrupt handlers have quiesced? 794248754Sjimharris */ 795248754Sjimharris pause("nvmereset", hz / 10); 796248754Sjimharris if (status == 0) 797248754Sjimharris nvme_ctrlr_start(ctrlr); 798248767Sjimharris else 799248767Sjimharris nvme_ctrlr_fail(ctrlr); 800248755Sjimharris 801248755Sjimharris atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); 802248748Sjimharris} 803248748Sjimharris 804248748Sjimharrisstatic void 805244410Sjimharrisnvme_ctrlr_intx_handler(void *arg) 806240616Sjimharris{ 807240616Sjimharris struct nvme_controller *ctrlr = arg; 808240616Sjimharris 809244410Sjimharris nvme_mmio_write_4(ctrlr, intms, 1); 810244410Sjimharris 811240616Sjimharris nvme_qpair_process_completions(&ctrlr->adminq); 812240616Sjimharris 813296191Sjimharris if (ctrlr->ioq && ctrlr->ioq[0].cpl) 814240616Sjimharris nvme_qpair_process_completions(&ctrlr->ioq[0]); 815240616Sjimharris 816240616Sjimharris nvme_mmio_write_4(ctrlr, intmc, 1); 817240616Sjimharris} 818240616Sjimharris 819240616Sjimharrisstatic int 820240616Sjimharrisnvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) 821240616Sjimharris{ 822240616Sjimharris 
823293670Sjimharris ctrlr->msix_enabled = 0; 824240616Sjimharris ctrlr->num_io_queues = 1; 825293671Sjimharris ctrlr->num_cpus_per_ioq = mp_ncpus; 826240616Sjimharris ctrlr->rid = 0; 827240616Sjimharris ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, 828240616Sjimharris &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); 829240616Sjimharris 830240616Sjimharris if (ctrlr->res == NULL) { 831248773Sjimharris nvme_printf(ctrlr, "unable to allocate shared IRQ\n"); 832240616Sjimharris return (ENOMEM); 833240616Sjimharris } 834240616Sjimharris 835240616Sjimharris bus_setup_intr(ctrlr->dev, ctrlr->res, 836240616Sjimharris INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler, 837240616Sjimharris ctrlr, &ctrlr->tag); 838240616Sjimharris 839240616Sjimharris if (ctrlr->tag == NULL) { 840248773Sjimharris nvme_printf(ctrlr, "unable to setup intx handler\n"); 841240616Sjimharris return (ENOMEM); 842240616Sjimharris } 843240616Sjimharris 844240616Sjimharris return (0); 845240616Sjimharris} 846240616Sjimharris 847249421Sjimharrisstatic void 848249421Sjimharrisnvme_pt_done(void *arg, const struct nvme_completion *cpl) 849249421Sjimharris{ 850249421Sjimharris struct nvme_pt_command *pt = arg; 851249421Sjimharris 852249421Sjimharris bzero(&pt->cpl, sizeof(pt->cpl)); 853249421Sjimharris pt->cpl.cdw0 = cpl->cdw0; 854249421Sjimharris pt->cpl.status = cpl->status; 855249421Sjimharris pt->cpl.status.p = 0; 856249421Sjimharris 857249421Sjimharris mtx_lock(pt->driver_lock); 858249421Sjimharris wakeup(pt); 859249421Sjimharris mtx_unlock(pt->driver_lock); 860249421Sjimharris} 861249421Sjimharris 862249421Sjimharrisint 863249421Sjimharrisnvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, 864249421Sjimharris struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, 865249421Sjimharris int is_admin_cmd) 866249421Sjimharris{ 867249421Sjimharris struct nvme_request *req; 868249421Sjimharris struct mtx *mtx; 869249421Sjimharris struct buf *buf = NULL; 870249421Sjimharris int ret 
= 0; 871249421Sjimharris 872252272Sjimharris if (pt->len > 0) { 873252272Sjimharris if (pt->len > ctrlr->max_xfer_size) { 874252272Sjimharris nvme_printf(ctrlr, "pt->len (%d) " 875252272Sjimharris "exceeds max_xfer_size (%d)\n", pt->len, 876252272Sjimharris ctrlr->max_xfer_size); 877252272Sjimharris return EIO; 878252272Sjimharris } 879249421Sjimharris if (is_user_buffer) { 880249421Sjimharris /* 881249421Sjimharris * Ensure the user buffer is wired for the duration of 882249421Sjimharris * this passthrough command. 883249421Sjimharris */ 884249421Sjimharris PHOLD(curproc); 885249421Sjimharris buf = getpbuf(NULL); 886249421Sjimharris buf->b_saveaddr = buf->b_data; 887249421Sjimharris buf->b_data = pt->buf; 888249421Sjimharris buf->b_bufsize = pt->len; 889249421Sjimharris buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE; 890249421Sjimharris#ifdef NVME_UNMAPPED_BIO_SUPPORT 891249421Sjimharris if (vmapbuf(buf, 1) < 0) { 892249421Sjimharris#else 893249421Sjimharris if (vmapbuf(buf) < 0) { 894249421Sjimharris#endif 895249421Sjimharris ret = EFAULT; 896249421Sjimharris goto err; 897249421Sjimharris } 898249421Sjimharris req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 899249421Sjimharris nvme_pt_done, pt); 900249421Sjimharris } else 901249421Sjimharris req = nvme_allocate_request_vaddr(pt->buf, pt->len, 902249421Sjimharris nvme_pt_done, pt); 903252272Sjimharris } else 904249421Sjimharris req = nvme_allocate_request_null(nvme_pt_done, pt); 905249421Sjimharris 906249421Sjimharris req->cmd.opc = pt->cmd.opc; 907249421Sjimharris req->cmd.cdw10 = pt->cmd.cdw10; 908249421Sjimharris req->cmd.cdw11 = pt->cmd.cdw11; 909249421Sjimharris req->cmd.cdw12 = pt->cmd.cdw12; 910249421Sjimharris req->cmd.cdw13 = pt->cmd.cdw13; 911249421Sjimharris req->cmd.cdw14 = pt->cmd.cdw14; 912249421Sjimharris req->cmd.cdw15 = pt->cmd.cdw15; 913249421Sjimharris 914249421Sjimharris req->cmd.nsid = nsid; 915249421Sjimharris 916249421Sjimharris if (is_admin_cmd) 917249421Sjimharris mtx = 
&ctrlr->lock; 918249421Sjimharris else 919249421Sjimharris mtx = &ctrlr->ns[nsid-1].lock; 920249421Sjimharris 921249421Sjimharris mtx_lock(mtx); 922249421Sjimharris pt->driver_lock = mtx; 923249421Sjimharris 924249421Sjimharris if (is_admin_cmd) 925249421Sjimharris nvme_ctrlr_submit_admin_request(ctrlr, req); 926249421Sjimharris else 927249421Sjimharris nvme_ctrlr_submit_io_request(ctrlr, req); 928249421Sjimharris 929249421Sjimharris mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0); 930249421Sjimharris mtx_unlock(mtx); 931249421Sjimharris 932249421Sjimharris pt->driver_lock = NULL; 933249421Sjimharris 934249421Sjimharriserr: 935249421Sjimharris if (buf != NULL) { 936249421Sjimharris relpbuf(buf, NULL); 937249421Sjimharris PRELE(curproc); 938249421Sjimharris } 939249421Sjimharris 940249421Sjimharris return (ret); 941249421Sjimharris} 942249421Sjimharris 943240616Sjimharrisstatic int 944240616Sjimharrisnvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, 945240616Sjimharris struct thread *td) 946240616Sjimharris{ 947248769Sjimharris struct nvme_controller *ctrlr; 948249421Sjimharris struct nvme_pt_command *pt; 949240616Sjimharris 950240616Sjimharris ctrlr = cdev->si_drv1; 951240616Sjimharris 952240616Sjimharris switch (cmd) { 953248746Sjimharris case NVME_RESET_CONTROLLER: 954248746Sjimharris nvme_ctrlr_reset(ctrlr); 955248746Sjimharris break; 956249421Sjimharris case NVME_PASSTHROUGH_CMD: 957249421Sjimharris pt = (struct nvme_pt_command *)arg; 958249421Sjimharris return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, pt->cmd.nsid, 959249421Sjimharris 1 /* is_user_buffer */, 1 /* is_admin_cmd */)); 960240616Sjimharris default: 961240616Sjimharris return (ENOTTY); 962240616Sjimharris } 963240616Sjimharris 964240616Sjimharris return (0); 965240616Sjimharris} 966240616Sjimharris 967240616Sjimharrisstatic struct cdevsw nvme_ctrlr_cdevsw = { 968240616Sjimharris .d_version = D_VERSION, 969240616Sjimharris .d_flags = 0, 970240616Sjimharris .d_ioctl = nvme_ctrlr_ioctl 
971240616Sjimharris}; 972240616Sjimharris 973293670Sjimharrisstatic void 974293670Sjimharrisnvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) 975240616Sjimharris{ 976293670Sjimharris device_t dev; 977293670Sjimharris int per_cpu_io_queues; 978293672Sjimharris int min_cpus_per_ioq; 979293670Sjimharris int num_vectors_requested, num_vectors_allocated; 980293671Sjimharris int num_vectors_available; 981240616Sjimharris 982293670Sjimharris dev = ctrlr->dev; 983293672Sjimharris min_cpus_per_ioq = 1; 984293672Sjimharris TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); 985293672Sjimharris 986293672Sjimharris if (min_cpus_per_ioq < 1) { 987293672Sjimharris min_cpus_per_ioq = 1; 988293672Sjimharris } else if (min_cpus_per_ioq > mp_ncpus) { 989293672Sjimharris min_cpus_per_ioq = mp_ncpus; 990293672Sjimharris } 991293672Sjimharris 992240616Sjimharris per_cpu_io_queues = 1; 993240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); 994240616Sjimharris 995293672Sjimharris if (per_cpu_io_queues == 0) { 996293672Sjimharris min_cpus_per_ioq = mp_ncpus; 997293672Sjimharris } 998293672Sjimharris 999240616Sjimharris ctrlr->force_intx = 0; 1000240616Sjimharris TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); 1001240616Sjimharris 1002293671Sjimharris /* 1003293671Sjimharris * FreeBSD currently cannot allocate more than about 190 vectors at 1004293671Sjimharris * boot, meaning that systems with high core count and many devices 1005293671Sjimharris * requesting per-CPU interrupt vectors will not get their full 1006293671Sjimharris * allotment. So first, try to allocate as many as we may need to 1007293671Sjimharris * understand what is available, then immediately release them. 1008293671Sjimharris * Then figure out how many of those we will actually use, based on 1009293671Sjimharris * assigning an equal number of cores to each I/O queue. 
1010293671Sjimharris */ 1011293671Sjimharris 1012293671Sjimharris /* One vector for per core I/O queue, plus one vector for admin queue. */ 1013293671Sjimharris num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); 1014293671Sjimharris if (pci_alloc_msix(dev, &num_vectors_available) != 0) { 1015293671Sjimharris num_vectors_available = 0; 1016293671Sjimharris } 1017293671Sjimharris pci_release_msi(dev); 1018293671Sjimharris 1019293671Sjimharris if (ctrlr->force_intx || num_vectors_available < 2) { 1020293670Sjimharris nvme_ctrlr_configure_intx(ctrlr); 1021293670Sjimharris return; 1022293670Sjimharris } 1023248754Sjimharris 1024293672Sjimharris /* 1025293672Sjimharris * Do not use all vectors for I/O queues - one must be saved for the 1026293672Sjimharris * admin queue. 1027293672Sjimharris */ 1028293672Sjimharris ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq, 1029293673Sjimharris howmany(mp_ncpus, num_vectors_available - 1)); 1030240616Sjimharris 1031293673Sjimharris ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq); 1032285917Sjimharris num_vectors_requested = ctrlr->num_io_queues + 1; 1033293671Sjimharris num_vectors_allocated = num_vectors_requested; 1034240616Sjimharris 1035293671Sjimharris /* 1036293671Sjimharris * Now just allocate the number of vectors we need. This should 1037293671Sjimharris * succeed, since we previously called pci_alloc_msix() 1038293671Sjimharris * successfully returning at least this many vectors, but just to 1039293671Sjimharris * be safe, if something goes wrong just revert to INTx. 
1040293671Sjimharris */ 1041285917Sjimharris if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { 1042293670Sjimharris nvme_ctrlr_configure_intx(ctrlr); 1043293670Sjimharris return; 1044293667Sjimharris } 1045293667Sjimharris 1046293667Sjimharris if (num_vectors_allocated < num_vectors_requested) { 1047293667Sjimharris pci_release_msi(dev); 1048293671Sjimharris nvme_ctrlr_configure_intx(ctrlr); 1049293671Sjimharris return; 1050265577Sjimharris } 1051293671Sjimharris 1052293671Sjimharris ctrlr->msix_enabled = 1; 1053293670Sjimharris} 1054240616Sjimharris 1055293670Sjimharrisint 1056293670Sjimharrisnvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) 1057293670Sjimharris{ 1058293670Sjimharris union cap_lo_register cap_lo; 1059293670Sjimharris union cap_hi_register cap_hi; 1060293670Sjimharris int status, timeout_period; 1061240616Sjimharris 1062293670Sjimharris ctrlr->dev = dev; 1063240616Sjimharris 1064293670Sjimharris mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); 1065293670Sjimharris 1066293670Sjimharris status = nvme_ctrlr_allocate_bar(ctrlr); 1067293670Sjimharris 1068293670Sjimharris if (status != 0) 1069293670Sjimharris return (status); 1070293670Sjimharris 1071293670Sjimharris /* 1072293670Sjimharris * Software emulators may set the doorbell stride to something 1073293670Sjimharris * other than zero, but this driver is not set up to handle that. 1074293670Sjimharris */ 1075293670Sjimharris cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi); 1076293670Sjimharris if (cap_hi.bits.dstrd != 0) 1077293670Sjimharris return (ENXIO); 1078293670Sjimharris 1079293670Sjimharris ctrlr->min_page_size = 1 << (12 + cap_hi.bits.mpsmin); 1080293670Sjimharris 1081293670Sjimharris /* Get ready timeout value from controller, in units of 500ms. 
*/ 1082293670Sjimharris cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo); 1083293670Sjimharris ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500; 1084293670Sjimharris 1085293670Sjimharris timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD; 1086293670Sjimharris TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period); 1087293670Sjimharris timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); 1088293670Sjimharris timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); 1089293670Sjimharris ctrlr->timeout_period = timeout_period; 1090293670Sjimharris 1091293670Sjimharris nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; 1092293670Sjimharris TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); 1093293670Sjimharris 1094293670Sjimharris ctrlr->enable_aborts = 0; 1095293670Sjimharris TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); 1096293670Sjimharris 1097293670Sjimharris nvme_ctrlr_setup_interrupts(ctrlr); 1098293670Sjimharris 1099252271Sjimharris ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; 1100240616Sjimharris nvme_ctrlr_construct_admin_qpair(ctrlr); 1101240616Sjimharris 1102257707Sjimharris ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, device_get_unit(dev), 1103257707Sjimharris UID_ROOT, GID_WHEEL, 0600, "nvme%d", device_get_unit(dev)); 1104240616Sjimharris 1105240616Sjimharris if (ctrlr->cdev == NULL) 1106240616Sjimharris return (ENXIO); 1107240616Sjimharris 1108240616Sjimharris ctrlr->cdev->si_drv1 = (void *)ctrlr; 1109240616Sjimharris 1110248748Sjimharris ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, 1111248748Sjimharris taskqueue_thread_enqueue, &ctrlr->taskqueue); 1112248748Sjimharris taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq"); 1113248748Sjimharris 1114248755Sjimharris ctrlr->is_resetting = 0; 1115265576Sjimharris ctrlr->is_initialized = 0; 1116265576Sjimharris ctrlr->notification_sent = 0; 1117248767Sjimharris TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); 1118248755Sjimharris 
1119248767Sjimharris TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr); 1120248767Sjimharris STAILQ_INIT(&ctrlr->fail_req); 1121248767Sjimharris ctrlr->is_failed = FALSE; 1122248767Sjimharris 1123240616Sjimharris return (0); 1124240616Sjimharris} 1125241660Sjimharris 1126241660Sjimharrisvoid 1127248736Sjimharrisnvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) 1128248736Sjimharris{ 1129248737Sjimharris int i; 1130248736Sjimharris 1131254302Sjimharris /* 1132254302Sjimharris * Notify the controller of a shutdown, even though this is due to 1133254302Sjimharris * a driver unload, not a system shutdown (this path is not invoked 1134254302Sjimharris * during shutdown). This ensures the controller receives a 1135254302Sjimharris * shutdown notification in case the system is shutdown before 1136254302Sjimharris * reloading the driver. 1137254302Sjimharris */ 1138282926Sjimharris nvme_ctrlr_shutdown(ctrlr); 1139254302Sjimharris 1140248766Sjimharris nvme_ctrlr_disable(ctrlr); 1141248748Sjimharris taskqueue_free(ctrlr->taskqueue); 1142248748Sjimharris 1143248746Sjimharris for (i = 0; i < NVME_MAX_NAMESPACES; i++) 1144248746Sjimharris nvme_ns_destruct(&ctrlr->ns[i]); 1145248736Sjimharris 1146248736Sjimharris if (ctrlr->cdev) 1147248736Sjimharris destroy_dev(ctrlr->cdev); 1148248736Sjimharris 1149248736Sjimharris for (i = 0; i < ctrlr->num_io_queues; i++) { 1150248736Sjimharris nvme_io_qpair_destroy(&ctrlr->ioq[i]); 1151248736Sjimharris } 1152248736Sjimharris 1153248736Sjimharris free(ctrlr->ioq, M_NVME); 1154248736Sjimharris 1155248736Sjimharris nvme_admin_qpair_destroy(&ctrlr->adminq); 1156248736Sjimharris 1157248736Sjimharris if (ctrlr->resource != NULL) { 1158248736Sjimharris bus_release_resource(dev, SYS_RES_MEMORY, 1159248736Sjimharris ctrlr->resource_id, ctrlr->resource); 1160248736Sjimharris } 1161248736Sjimharris 1162248736Sjimharris if (ctrlr->bar4_resource != NULL) { 1163248736Sjimharris bus_release_resource(dev, SYS_RES_MEMORY, 
1164248736Sjimharris ctrlr->bar4_resource_id, ctrlr->bar4_resource); 1165248736Sjimharris } 1166248736Sjimharris 1167248736Sjimharris if (ctrlr->tag) 1168248736Sjimharris bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); 1169248736Sjimharris 1170248736Sjimharris if (ctrlr->res) 1171248736Sjimharris bus_release_resource(ctrlr->dev, SYS_RES_IRQ, 1172248736Sjimharris rman_get_rid(ctrlr->res), ctrlr->res); 1173248736Sjimharris 1174248736Sjimharris if (ctrlr->msix_enabled) 1175248736Sjimharris pci_release_msi(dev); 1176248736Sjimharris} 1177248736Sjimharris 1178248736Sjimharrisvoid 1179254302Sjimharrisnvme_ctrlr_shutdown(struct nvme_controller *ctrlr) 1180254302Sjimharris{ 1181254302Sjimharris union cc_register cc; 1182254302Sjimharris union csts_register csts; 1183254302Sjimharris int ticks = 0; 1184254302Sjimharris 1185254302Sjimharris cc.raw = nvme_mmio_read_4(ctrlr, cc); 1186254302Sjimharris cc.bits.shn = NVME_SHN_NORMAL; 1187254302Sjimharris nvme_mmio_write_4(ctrlr, cc, cc.raw); 1188254302Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 1189254302Sjimharris while ((csts.bits.shst != NVME_SHST_COMPLETE) && (ticks++ < 5*hz)) { 1190254302Sjimharris pause("nvme shn", 1); 1191254302Sjimharris csts.raw = nvme_mmio_read_4(ctrlr, csts); 1192254302Sjimharris } 1193254302Sjimharris if (csts.bits.shst != NVME_SHST_COMPLETE) 1194254302Sjimharris nvme_printf(ctrlr, "did not complete shutdown within 5 seconds " 1195254302Sjimharris "of notification\n"); 1196254302Sjimharris} 1197254302Sjimharris 1198254302Sjimharrisvoid 1199241660Sjimharrisnvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, 1200241660Sjimharris struct nvme_request *req) 1201241660Sjimharris{ 1202241660Sjimharris 1203241663Sjimharris nvme_qpair_submit_request(&ctrlr->adminq, req); 1204241660Sjimharris} 1205241660Sjimharris 1206241660Sjimharrisvoid 1207241660Sjimharrisnvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, 1208241660Sjimharris struct nvme_request *req) 1209241660Sjimharris{ 
1210241660Sjimharris struct nvme_qpair *qpair; 1211241660Sjimharris 1212293671Sjimharris qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; 1213241663Sjimharris nvme_qpair_submit_request(qpair, req); 1214241660Sjimharris} 1215248738Sjimharris 1216248738Sjimharrisdevice_t 1217248738Sjimharrisnvme_ctrlr_get_device(struct nvme_controller *ctrlr) 1218248738Sjimharris{ 1219248738Sjimharris 1220248738Sjimharris return (ctrlr->dev); 1221248738Sjimharris} 1222248747Sjimharris 1223248747Sjimharrisconst struct nvme_controller_data * 1224248747Sjimharrisnvme_ctrlr_get_data(struct nvme_controller *ctrlr) 1225248747Sjimharris{ 1226248747Sjimharris 1227248747Sjimharris return (&ctrlr->cdata); 1228248747Sjimharris} 1229