nvd.c revision 344033
/*-
 * Copyright (C) 2012-2016 Intel Corporation
 * All rights reserved.
 * Copyright (C) 2018 Alexander Motin <mav@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/nvd/nvd.c 344033 2019-02-12 00:53:43Z mav $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/atomic.h>

#include <geom/geom.h>
#include <geom/geom_disk.h>

#include <dev/nvme/nvme.h>

#define NVD_STR		"nvd"

struct nvd_disk;
struct nvd_controller;

static disk_ioctl_t nvd_ioctl;
static disk_strategy_t nvd_strategy;
static dumper_t nvd_dump;

static void nvd_done(void *arg, const struct nvme_completion *cpl);
static void nvd_gone(struct nvd_disk *ndisk);

static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);

static void *nvd_new_controller(struct nvme_controller *ctrlr);
static void nvd_controller_fail(void *ctrlr);

static int nvd_load(void);
static void nvd_unload(void);

MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");

struct nvme_consumer *consumer_handle;

struct nvd_disk {
	struct nvd_controller	*ctrlr;

	struct bio_queue_head	bioq;
	struct task		bioqtask;
	struct mtx		bioqlock;

	struct disk		*disk;
	struct taskqueue	*tq;
	struct nvme_namespace	*ns;

	uint32_t		cur_depth;
#define	NVD_ODEPTH	(1 << 30)
	uint32_t		ordered_in_flight;
	u_int			unit;

	TAILQ_ENTRY(nvd_disk)	global_tailq;
	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;
};

struct nvd_controller {

	TAILQ_ENTRY(nvd_controller)	tailq;
	TAILQ_HEAD(, nvd_disk)		disk_head;
};

static struct mtx			nvd_lock;
static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;

static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD, 0, "nvd driver parameters");
/*
 * The NVMe specification does not define a maximum or optimal delete size, so
 * technically max delete size is min(full size of the namespace, 2^32 - 1
 * LBAs).  A single delete for a multi-TB NVMe namespace though may take much
 * longer to complete than the nvme(4) I/O timeout period.  So choose a
 * sensible default here that is still suitably large to minimize the number
 * of overall delete operations.
 */
static uint64_t nvd_delete_max = (1024 * 1024 * 1024);  /* 1GB */
SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
	     "nvd maximum BIO_DELETE size in bytes");
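/*
 * Illustrative usage note: because delete_max is declared with CTLFLAG_RDTUN,
 * it is read-only at runtime and can only be changed as a boot-time tunable,
 * e.g. in /boot/loader.conf (the 4GB value below is an arbitrary example):
 *
 *	hw.nvd.delete_max="4294967296"
 *
 * The default of (1024 * 1024 * 1024) bytes is 1GB regardless of the
 * namespace LBA size, comfortably below the 2^32 - 1 LBA ceiling noted above.
 */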
static int nvd_modevent(module_t mod, int type, void *arg)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = nvd_load();
		break;
	case MOD_UNLOAD:
		nvd_unload();
		break;
	default:
		break;
	}

	return (error);
}

moduledata_t nvd_mod = {
	NVD_STR,
	(modeventhand_t)nvd_modevent,
	0
};

DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
MODULE_VERSION(nvd, 1);
MODULE_DEPEND(nvd, nvme, 1, 1, 1);

static int
nvd_load()
{
	if (!nvme_use_nvd)
		return 0;

	mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
	TAILQ_INIT(&ctrlr_head);
	TAILQ_INIT(&disk_head);

	consumer_handle = nvme_register_consumer(nvd_new_disk,
	    nvd_new_controller, NULL, nvd_controller_fail);

	return (consumer_handle != NULL ? 0 : -1);
}

static void
nvd_unload()
{
	struct nvd_controller	*ctrlr;
	struct nvd_disk		*ndisk;

	if (!nvme_use_nvd)
		return;

	mtx_lock(&nvd_lock);
	while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
		TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
			nvd_gone(ndisk);
		while (!TAILQ_EMPTY(&ctrlr->disk_head))
			msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload", 0);
		free(ctrlr, M_NVD);
	}
	mtx_unlock(&nvd_lock);

	nvme_unregister_consumer(consumer_handle);

	mtx_destroy(&nvd_lock);
}
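/*
 * Explanatory note on the I/O accounting used below: the driver folds two
 * counters into the single atomic cur_depth.  A normal bio adds 1, while a
 * BIO_ORDERED bio adds NVD_ODEPTH (1 << 30).  One counter then answers both
 * questions the ordering code needs to ask:
 *
 *	cur_depth > 0		-> some I/O is still outstanding
 *	cur_depth >= NVD_ODEPTH	-> an ordered I/O is outstanding
 *
 * ordered_in_flight separately counts ordered bios that have been queued but
 * not yet completed; it is what diverts nvd_strategy() onto the taskqueue
 * path while ordering must be enforced.
 */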
static void
nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
{
	int err;

	bp->bio_driver1 = NULL;
	if (__predict_false(bp->bio_flags & BIO_ORDERED))
		atomic_add_int(&ndisk->cur_depth, NVD_ODEPTH);
	else
		atomic_add_int(&ndisk->cur_depth, 1);
	err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
	if (err) {
		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
			atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
			atomic_add_int(&ndisk->ordered_in_flight, -1);
			wakeup(&ndisk->cur_depth);
		} else {
			if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
			    __predict_false(ndisk->ordered_in_flight != 0))
				wakeup(&ndisk->cur_depth);
		}
		bp->bio_error = err;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}

static void
nvd_strategy(struct bio *bp)
{
	struct nvd_disk *ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;

	/*
	 * bio with BIO_ORDERED flag must be executed after all previous
	 * bios in the queue, and before any successive bios.
	 */
	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
		if (atomic_fetchadd_int(&ndisk->ordered_in_flight, 1) == 0 &&
		    ndisk->cur_depth == 0 && bioq_first(&ndisk->bioq) == NULL) {
			nvd_bio_submit(ndisk, bp);
			return;
		}
	} else if (__predict_true(ndisk->ordered_in_flight == 0)) {
		nvd_bio_submit(ndisk, bp);
		return;
	}

	/*
	 * There are ordered bios in flight, so we need to submit
	 * bios through the task queue to enforce ordering.
	 */
	mtx_lock(&ndisk->bioqlock);
	bioq_insert_tail(&ndisk->bioq, bp);
	mtx_unlock(&ndisk->bioqlock);
	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
}

static void
nvd_gone(struct nvd_disk *ndisk)
{
	struct bio	*bp;

	printf(NVD_STR"%u: detached\n", ndisk->unit);
	mtx_lock(&ndisk->bioqlock);
	disk_gone(ndisk->disk);
	while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
		if (__predict_false(bp->bio_flags & BIO_ORDERED))
			atomic_add_int(&ndisk->ordered_in_flight, -1);
		bp->bio_error = ENXIO;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
	mtx_unlock(&ndisk->bioqlock);
}

static void
nvd_gonecb(struct disk *dp)
{
	struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;

	disk_destroy(ndisk->disk);
	mtx_lock(&nvd_lock);
	TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
	TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
	if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
		wakeup(&ndisk->ctrlr->disk_head);
	mtx_unlock(&nvd_lock);
	taskqueue_free(ndisk->tq);
	mtx_destroy(&ndisk->bioqlock);
	free(ndisk, M_NVD);
}

static int
nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
    struct thread *td)
{
	int ret = 0;

	switch (cmd) {
	default:
		ret = EIO;
	}

	return (ret);
}

static int
nvd_dump(void *arg, void *virt, vm_offset_t phys, off_t offset, size_t len)
{
	struct disk *dp = arg;
	struct nvd_disk *ndisk = dp->d_drv1;

	return (nvme_ns_dump(ndisk->ns, virt, offset, len));
}

static void
nvd_done(void *arg, const struct nvme_completion *cpl)
{
	struct bio *bp = (struct bio *)arg;
	struct nvd_disk *ndisk = bp->bio_disk->d_drv1;

	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
		atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
		atomic_add_int(&ndisk->ordered_in_flight, -1);
		wakeup(&ndisk->cur_depth);
	} else {
		if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
		    __predict_false(ndisk->ordered_in_flight != 0))
			wakeup(&ndisk->cur_depth);
	}

	biodone(bp);
}

static void
nvd_bioq_process(void *arg, int pending)
{
	struct nvd_disk *ndisk = arg;
	struct bio *bp;

	for (;;) {
		mtx_lock(&ndisk->bioqlock);
		bp = bioq_takefirst(&ndisk->bioq);
		mtx_unlock(&ndisk->bioqlock);
		if (bp == NULL)
			break;

		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
			/*
			 * bio with BIO_ORDERED flag set must be executed
			 * after all previous bios.
			 */
			while (ndisk->cur_depth > 0)
				tsleep(&ndisk->cur_depth, 0, "nvdorb", 1);
		} else {
			/*
			 * bio with BIO_ORDERED flag set must be completed
			 * before proceeding with additional bios.
			 */
			while (ndisk->cur_depth >= NVD_ODEPTH)
				tsleep(&ndisk->cur_depth, 0, "nvdora", 1);
		}

		nvd_bio_submit(ndisk, bp);
	}
}
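/*
 * Design note on the waits above: nvd_done(), and the error path of
 * nvd_bio_submit(), call wakeup(&ndisk->cur_depth) when an ordered bio
 * completes, or when the last normal bio completes while an ordered one is
 * pending.  The tsleep() calls nevertheless pass a 1-tick timeout, so the
 * depth checks are re-polled even if a wakeup is lost in the race between
 * reading cur_depth and going to sleep.
 */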
static void *
nvd_new_controller(struct nvme_controller *ctrlr)
{
	struct nvd_controller	*nvd_ctrlr;

	nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
	    M_ZERO | M_WAITOK);

	TAILQ_INIT(&nvd_ctrlr->disk_head);
	mtx_lock(&nvd_lock);
	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
	mtx_unlock(&nvd_lock);

	return (nvd_ctrlr);
}

static void *
nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
{
	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
	struct nvd_disk		*ndisk, *tnd;
	struct disk		*disk;
	struct nvd_controller	*ctrlr = ctrlr_arg;
	int unit;

	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
	ndisk->ctrlr = ctrlr;
	ndisk->ns = ns;
	ndisk->cur_depth = 0;
	ndisk->ordered_in_flight = 0;
	mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
	bioq_init(&ndisk->bioq);
	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);

	mtx_lock(&nvd_lock);
	unit = 0;
	TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
		if (tnd->unit > unit)
			break;
		unit = tnd->unit + 1;
	}
	ndisk->unit = unit;
	if (tnd != NULL)
		TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
	else
		TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
	mtx_unlock(&nvd_lock);

	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
	    taskqueue_thread_enqueue, &ndisk->tq);
	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");

	disk = ndisk->disk = disk_alloc();
	disk->d_strategy = nvd_strategy;
	disk->d_ioctl = nvd_ioctl;
	disk->d_dump = nvd_dump;
	disk->d_gone = nvd_gonecb;
	disk->d_name = NVD_STR;
	disk->d_unit = ndisk->unit;
	disk->d_drv1 = ndisk;

	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
	if (disk->d_delmaxsize > nvd_delete_max)
		disk->d_delmaxsize = nvd_delete_max;
	disk->d_stripesize = nvme_ns_get_stripesize(ns);
	disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANDELETE;
	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;

	/*
	 * d_ident and d_descr are both far bigger than the length of either
	 * the serial or model number strings.
	 */
	nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
	nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
	    NVME_MODEL_NUMBER_LENGTH);
	strlcpy(disk->d_descr, descr, sizeof(descr));

	disk->d_rotation_rate = DISK_RR_NON_ROTATING;

	disk_create(disk, DISK_VERSION);

	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
	printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
	    (uintmax_t)disk->d_mediasize / (1024*1024),
	    (uintmax_t)disk->d_mediasize / disk->d_sectorsize,
	    disk->d_sectorsize);

	return (ndisk);
}
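/*
 * For illustration, the two printf() calls above produce attach messages of
 * the following shape (the model string and sizes here are invented example
 * values, not output from any particular device):
 *
 *	nvd0: <Example NVMe Model> NVMe namespace
 *	nvd0: 763097MB (1562824368 512 byte sectors)
 */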
static void
nvd_controller_fail(void *ctrlr_arg)
{
	struct nvd_controller	*ctrlr = ctrlr_arg;
	struct nvd_disk		*ndisk;

	mtx_lock(&nvd_lock);
	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
	TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
		nvd_gone(ndisk);
	while (!TAILQ_EMPTY(&ctrlr->disk_head))
		msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
	mtx_unlock(&nvd_lock);
	free(ctrlr, M_NVD);
}
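/*
 * Note on teardown ordering: nvd_controller_fail() and nvd_unload() follow
 * the same pattern.  nvd_gone() orphans the GEOM disk and fails any queued
 * bios with ENXIO; once GEOM releases the disk, nvd_gonecb() removes it from
 * the controller's disk_head list and wakes the msleep() above, after which
 * the controller structure itself can be freed.
 */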