1/*-
2 * Copyright (C) 2012-2016 Intel Corporation
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/sys/dev/nvd/nvd.c 312406 2017-01-19 11:17:09Z mav $");
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/kernel.h>
33#include <sys/malloc.h>
34#include <sys/module.h>
35#include <sys/sysctl.h>
36#include <sys/systm.h>
37#include <sys/taskqueue.h>
38
39#include <geom/geom.h>
40#include <geom/geom_disk.h>
41
42#include <dev/nvme/nvme.h>
43
/* Device name prefix used for disk units (nvd0, nvd1, ...). */
#define NVD_STR		"nvd"

struct nvd_disk;

/* GEOM disk method implementations (typedefs from <geom/geom_disk.h>). */
static disk_ioctl_t nvd_ioctl;
static disk_strategy_t nvd_strategy;

/* I/O completion callback passed to nvme_ns_bio_process(). */
static void nvd_done(void *arg, const struct nvme_completion *cpl);

/* nvme(4) consumer callbacks: namespace attach and disk teardown. */
static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
static void destroy_geom_disk(struct nvd_disk *ndisk);

/* nvme(4) consumer callbacks: controller attach and failure. */
static void *nvd_new_controller(struct nvme_controller *ctrlr);
static void nvd_controller_fail(void *ctrlr);

/* Module load/unload helpers invoked from nvd_modevent(). */
static int nvd_load(void);
static void nvd_unload(void);

MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");

/* Opaque handle returned by nvme_register_consumer(); needed at unload. */
struct nvme_consumer *consumer_handle;
65
/*
 * Per-namespace state: one nvd_disk per NVMe namespace, tying the
 * GEOM disk to its backing namespace and ordering machinery.
 */
struct nvd_disk {

	struct bio_queue_head	bioq;		/* deferred bios awaiting ordered submit */
	struct task		bioqtask;	/* drains bioq via nvd_bioq_process() */
	struct mtx		bioqlock;	/* protects bioq */

	struct disk		*disk;		/* GEOM disk exported to consumers */
	struct taskqueue	*tq;		/* single-threaded queue running bioqtask */
	struct nvme_namespace	*ns;		/* backing NVMe namespace */

	uint32_t		cur_depth;	/* bios submitted but not yet completed */
	uint32_t		ordered_in_flight; /* outstanding BIO_ORDERED bios */

	TAILQ_ENTRY(nvd_disk)	global_tailq;	/* linkage on disk_head */
	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;	/* linkage on controller's disk_head */
};
82
/* Per-controller state: just the list of disks it owns. */
struct nvd_controller {

	TAILQ_ENTRY(nvd_controller)	tailq;		/* linkage on ctrlr_head */
	TAILQ_HEAD(, nvd_disk)		disk_head;	/* this controller's disks */
};

/* Global lists of all controllers and all disks; the named disk_list
 *  tag lets nvd_new_disk() use TAILQ_LAST for unit numbering. */
static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;
91
static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD, 0, "nvd driver parameters");
/*
 * The NVMe specification does not define a maximum or optimal delete size, so
 *  technically max delete size is min(full size of the namespace, 2^32 - 1
 *  LBAs).  A single delete for a multi-TB NVMe namespace though may take much
 *  longer to complete than the nvme(4) I/O timeout period.  So choose a sensible
 *  default here that is still suitably large to minimize the number of overall
 *  delete operations.
 */
/* Tunable/read-only via hw.nvd.delete_max; applied per-disk as d_delmaxsize. */
static uint64_t nvd_delete_max = (1024 * 1024 * 1024);  /* 1GB */
SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
	     "nvd maximum BIO_DELETE size in bytes");
104
105static int nvd_modevent(module_t mod, int type, void *arg)
106{
107	int error = 0;
108
109	switch (type) {
110	case MOD_LOAD:
111		error = nvd_load();
112		break;
113	case MOD_UNLOAD:
114		nvd_unload();
115		break;
116	default:
117		break;
118	}
119
120	return (error);
121}
122
123moduledata_t nvd_mod = {
124	NVD_STR,
125	(modeventhand_t)nvd_modevent,
126	0
127};
128
129DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
130MODULE_VERSION(nvd, 1);
131MODULE_DEPEND(nvd, nvme, 1, 1, 1);
132
133static int
134nvd_load()
135{
136
137	TAILQ_INIT(&ctrlr_head);
138	TAILQ_INIT(&disk_head);
139
140	consumer_handle = nvme_register_consumer(nvd_new_disk,
141	    nvd_new_controller, NULL, nvd_controller_fail);
142
143	return (consumer_handle != NULL ? 0 : -1);
144}
145
146static void
147nvd_unload()
148{
149	struct nvd_controller	*ctrlr;
150	struct nvd_disk		*disk;
151
152	while (!TAILQ_EMPTY(&ctrlr_head)) {
153		ctrlr = TAILQ_FIRST(&ctrlr_head);
154		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
155		free(ctrlr, M_NVD);
156	}
157
158	while (!TAILQ_EMPTY(&disk_head)) {
159		disk = TAILQ_FIRST(&disk_head);
160		TAILQ_REMOVE(&disk_head, disk, global_tailq);
161		destroy_geom_disk(disk);
162		free(disk, M_NVD);
163	}
164
165	nvme_unregister_consumer(consumer_handle);
166}
167
168static int
169nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
170{
171	int err;
172
173	bp->bio_driver1 = NULL;
174	atomic_add_int(&ndisk->cur_depth, 1);
175	err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
176	if (err) {
177		atomic_add_int(&ndisk->cur_depth, -1);
178		if (__predict_false(bp->bio_flags & BIO_ORDERED))
179			atomic_add_int(&ndisk->ordered_in_flight, -1);
180		bp->bio_error = err;
181		bp->bio_flags |= BIO_ERROR;
182		bp->bio_resid = bp->bio_bcount;
183		biodone(bp);
184		return (-1);
185	}
186
187	return (0);
188}
189
190static void
191nvd_strategy(struct bio *bp)
192{
193	struct nvd_disk *ndisk;
194
195	ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
196
197	if (__predict_false(bp->bio_flags & BIO_ORDERED))
198		atomic_add_int(&ndisk->ordered_in_flight, 1);
199
200	if (__predict_true(ndisk->ordered_in_flight == 0)) {
201		nvd_bio_submit(ndisk, bp);
202		return;
203	}
204
205	/*
206	 * There are ordered bios in flight, so we need to submit
207	 *  bios through the task queue to enforce ordering.
208	 */
209	mtx_lock(&ndisk->bioqlock);
210	bioq_insert_tail(&ndisk->bioq, bp);
211	mtx_unlock(&ndisk->bioqlock);
212	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
213}
214
215static int
216nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
217    struct thread *td)
218{
219	int ret = 0;
220
221	switch (cmd) {
222	default:
223		ret = EIO;
224	}
225
226	return (ret);
227}
228
229static void
230nvd_done(void *arg, const struct nvme_completion *cpl)
231{
232	struct bio *bp;
233	struct nvd_disk *ndisk;
234
235	bp = (struct bio *)arg;
236
237	ndisk = bp->bio_disk->d_drv1;
238
239	atomic_add_int(&ndisk->cur_depth, -1);
240	if (__predict_false(bp->bio_flags & BIO_ORDERED))
241		atomic_add_int(&ndisk->ordered_in_flight, -1);
242
243	biodone(bp);
244}
245
246static void
247nvd_bioq_process(void *arg, int pending)
248{
249	struct nvd_disk *ndisk = arg;
250	struct bio *bp;
251
252	for (;;) {
253		mtx_lock(&ndisk->bioqlock);
254		bp = bioq_takefirst(&ndisk->bioq);
255		mtx_unlock(&ndisk->bioqlock);
256		if (bp == NULL)
257			break;
258
259		if (nvd_bio_submit(ndisk, bp) != 0) {
260			continue;
261		}
262
263#ifdef BIO_ORDERED
264		/*
265		 * BIO_ORDERED flag dictates that the bio with BIO_ORDERED
266		 *  flag set must be completed before proceeding with
267		 *  additional bios.
268		 */
269		if (bp->bio_flags & BIO_ORDERED) {
270			while (ndisk->cur_depth > 0) {
271				pause("nvd flush", 1);
272			}
273		}
274#endif
275	}
276}
277
278static void *
279nvd_new_controller(struct nvme_controller *ctrlr)
280{
281	struct nvd_controller	*nvd_ctrlr;
282
283	nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
284	    M_ZERO | M_WAITOK);
285
286	TAILQ_INIT(&nvd_ctrlr->disk_head);
287	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
288
289	return (nvd_ctrlr);
290}
291
/*
 * nvme(4) consumer callback: a namespace has appeared.  Allocate an
 * nvd_disk and a GEOM disk, populate the disk parameters from the
 * namespace, link the disk onto the global and per-controller lists,
 * and publish it to GEOM.  Always returns NULL (no per-namespace
 * cookie is needed).
 */
static void *
nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
{
	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
	struct nvd_disk		*ndisk;
	struct disk		*disk;
	struct nvd_controller	*ctrlr = ctrlr_arg;

	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);

	disk = disk_alloc();
	disk->d_strategy = nvd_strategy;
	disk->d_ioctl = nvd_ioctl;
	disk->d_name = NVD_STR;
	disk->d_drv1 = ndisk;

	/* Geometry and transfer limits come straight from the namespace. */
	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
	/* Cap BIO_DELETE size at the hw.nvd.delete_max tunable. */
	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
	if (disk->d_delmaxsize > nvd_delete_max)
		disk->d_delmaxsize = nvd_delete_max;
	disk->d_stripesize = nvme_ns_get_stripesize(ns);

	/*
	 * Unit numbers grow monotonically: one past the last disk on the
	 *  global list, or 0 for the first disk.
	 */
	if (TAILQ_EMPTY(&disk_head))
		disk->d_unit = 0;
	else
		disk->d_unit =
		    TAILQ_LAST(&disk_head, disk_list)->disk->d_unit + 1;

	disk->d_flags = DISKFLAG_DIRECT_COMPLETION;

	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANDELETE;

	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;

/* ifdef used here to ease porting to stable branches at a later point. */
#ifdef DISKFLAG_UNMAPPED_BIO
	disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
#endif

	/*
	 * d_ident and d_descr are both far bigger than the length of either
	 *  the serial or model number strings.
	 */
	nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
	nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
	    NVME_MODEL_NUMBER_LENGTH);
	/* NOTE(review): bound here is the source buffer size, not the
	 *  destination's; safe only while d_descr >= sizeof(descr) holds,
	 *  per the comment above — confirm against struct disk. */
	strlcpy(disk->d_descr, descr, sizeof(descr));

	disk->d_rotation_rate = DISK_RR_NON_ROTATING;

	ndisk->ns = ns;
	ndisk->disk = disk;
	ndisk->cur_depth = 0;
	ndisk->ordered_in_flight = 0;

	/* Per-disk deferred queue and single-threaded taskqueue used by
	 *  nvd_strategy() to enforce BIO_ORDERED ordering. */
	mtx_init(&ndisk->bioqlock, "NVD bioq lock", NULL, MTX_DEF);
	bioq_init(&ndisk->bioq);

	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
	    taskqueue_thread_enqueue, &ndisk->tq);
	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");

	TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);

	/* Publish to GEOM only after all state above is initialized. */
	disk_create(disk, DISK_VERSION);

	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
	printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
		(uintmax_t)disk->d_mediasize / (1024*1024),
		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
		disk->d_sectorsize);

	return (NULL);
}
373
374static void
375destroy_geom_disk(struct nvd_disk *ndisk)
376{
377	struct bio	*bp;
378	struct disk	*disk;
379	uint32_t	unit;
380	int		cnt = 0;
381
382	disk = ndisk->disk;
383	unit = disk->d_unit;
384	taskqueue_free(ndisk->tq);
385
386	disk_destroy(ndisk->disk);
387
388	mtx_lock(&ndisk->bioqlock);
389	for (;;) {
390		bp = bioq_takefirst(&ndisk->bioq);
391		if (bp == NULL)
392			break;
393		bp->bio_error = EIO;
394		bp->bio_flags |= BIO_ERROR;
395		bp->bio_resid = bp->bio_bcount;
396		cnt++;
397		biodone(bp);
398	}
399
400	printf(NVD_STR"%u: lost device - %d outstanding\n", unit, cnt);
401	printf(NVD_STR"%u: removing device entry\n", unit);
402
403	mtx_unlock(&ndisk->bioqlock);
404
405	mtx_destroy(&ndisk->bioqlock);
406}
407
408static void
409nvd_controller_fail(void *ctrlr_arg)
410{
411	struct nvd_controller	*ctrlr = ctrlr_arg;
412	struct nvd_disk		*disk;
413
414	while (!TAILQ_EMPTY(&ctrlr->disk_head)) {
415		disk = TAILQ_FIRST(&ctrlr->disk_head);
416		TAILQ_REMOVE(&disk_head, disk, global_tailq);
417		TAILQ_REMOVE(&ctrlr->disk_head, disk, ctrlr_tailq);
418		destroy_geom_disk(disk);
419		free(disk, M_NVD);
420	}
421
422	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
423	free(ctrlr, M_NVD);
424}
425
426