nvme_da.c revision 350804
1/*-
2 * Copyright (c) 2015 Netflix, Inc
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer,
10 *    without modification, immediately at the beginning of the file.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 * Derived from ata_da.c:
27 * Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/11/sys/cam/nvme/nvme_da.c 350804 2019-08-08 22:16:19Z mav $");
32
33#include <sys/param.h>
34
35#ifdef _KERNEL
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/bio.h>
39#include <sys/sysctl.h>
40#include <sys/taskqueue.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/conf.h>
44#include <sys/devicestat.h>
45#include <sys/eventhandler.h>
46#include <sys/malloc.h>
47#include <sys/cons.h>
48#include <sys/proc.h>
49#include <sys/reboot.h>
50#include <geom/geom_disk.h>
51#endif /* _KERNEL */
52
53#ifndef _KERNEL
54#include <stdio.h>
55#include <string.h>
56#endif /* _KERNEL */
57
58#include <cam/cam.h>
59#include <cam/cam_ccb.h>
60#include <cam/cam_periph.h>
61#include <cam/cam_xpt_periph.h>
62#include <cam/cam_sim.h>
63#include <cam/cam_iosched.h>
64
65#include <cam/nvme/nvme_all.h>
66
/*
 * Periph state.  Only one state exists; ndaregister() enters it when the
 * attach completes, and ndaschedule()/ndastart() test for it.
 */
typedef enum {
	NDA_STATE_NORMAL	/* ready to schedule and start I/O */
} nda_state;
70
/*
 * Per-device flag bits kept in nda_softc.flags.
 */
typedef enum {
	/* Set by ndaopen(), cleared by ndaclose(). */
	NDA_FLAG_OPEN		= (1 << 0),
	/* Set when a BIO_WRITE is started; cleared after a good flush. */
	NDA_FLAG_DIRTY		= (1 << 1),
	/* sysctl context was initialised; ndacleanup() must free it. */
	NDA_FLAG_SCTX_INIT	= (1 << 2),
} nda_flags;
76
/*
 * Quirk bits.  ndaregister() starts from NDA_Q_NONE and lets the
 * kern.cam.nda.<unit>.quirks tunable override the value.
 */
typedef enum {
	NDA_Q_NONE = 0x00,
	NDA_Q_4K   = 0x01,	/* NOTE(review): not tested anywhere in this file */
} nda_quirks;

/*
 * Bit-name description handed to xpt_announce_quirks().
 * NOTE(review): appears to be a "%b"-style string (leading base byte, then
 * bit-number/name pairs) -- confirm against xpt_announce_quirks().
 */
#define NDA_Q_BIT_STRING	\
	"\020"			\
	"\001Bit 0"
85
/*
 * Kind of request a CCB carries.  The value is stored in the CCB's
 * ccb_state private field and decoded by ndadone().
 */
typedef enum {
	NDA_CCB_BUFFER_IO	= 0x01,	/* read, write or flush from ndastart() */
	NDA_CCB_DUMP		= 0x02,	/* polled request from ndadump() */
	NDA_CCB_TRIM		= 0x03,	/* BIO_DELETE from ndastart() */
	NDA_CCB_TYPE_MASK	= 0x0F,	/* mask applied in ndadone() */
} nda_ccb_state;

/*
 * Names for the peripheral-private CCB header fields used by this driver:
 * ccb_state holds an nda_ccb_state value, ccb_bp the originating bio.
 */
#define ccb_state	ppriv_field0
#define ccb_bp		ppriv_ptr1
96
97struct trim_request {
98	TAILQ_HEAD(, bio) bps;
99};
100struct nda_softc {
101	struct   cam_iosched_softc *cam_iosched;
102	int	 outstanding_cmds;	/* Number of active commands */
103	int	 refcount;		/* Active xpt_action() calls */
104	nda_state state;
105	nda_flags flags;
106	nda_quirks quirks;
107	int	 unmappedio;
108	uint32_t  nsid;			/* Namespace ID for this nda device */
109	struct disk *disk;
110	struct task		sysctl_task;
111	struct sysctl_ctx_list	sysctl_ctx;
112	struct sysctl_oid	*sysctl_tree;
113	struct trim_request	trim_req;
114#ifdef CAM_IO_STATS
115	struct sysctl_ctx_list	sysctl_stats_ctx;
116	struct sysctl_oid	*sysctl_stats_tree;
117	u_int	timeouts;
118	u_int	errors;
119	u_int	invalidations;
120#endif
121};
122
123/* Need quirk table */
124
125static	disk_strategy_t	ndastrategy;
126static	dumper_t	ndadump;
127static	periph_init_t	ndainit;
128static	void		ndaasync(void *callback_arg, u_int32_t code,
129				struct cam_path *path, void *arg);
130static	void		ndasysctlinit(void *context, int pending);
131static	periph_ctor_t	ndaregister;
132static	periph_dtor_t	ndacleanup;
133static	periph_start_t	ndastart;
134static	periph_oninv_t	ndaoninvalidate;
135static	void		ndadone(struct cam_periph *periph,
136			       union ccb *done_ccb);
137static  int		ndaerror(union ccb *ccb, u_int32_t cam_flags,
138				u_int32_t sense_flags);
139static void		ndashutdown(void *arg, int howto);
140static void		ndasuspend(void *arg);
141
/*
 * Compile-time defaults; each may be overridden from the build
 * configuration.
 */
#ifndef	NDA_DEFAULT_SEND_ORDERED
#define	NDA_DEFAULT_SEND_ORDERED	1
#endif

#ifndef	NDA_DEFAULT_TIMEOUT
#define	NDA_DEFAULT_TIMEOUT		30	/* Timeout in seconds */
#endif

#ifndef	NDA_DEFAULT_RETRY
#define	NDA_DEFAULT_RETRY		4
#endif

/*
 * NDA_DEFAULT_RETRY has no consumer in this file: the matching
 * nda_retry_count variable is not defined and every command is filled
 * with zero retries.
 */

/* Non-zero makes ndainit() register the suspend/shutdown flush handlers. */
static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED;

/* Command timeout in seconds; the nda_nvme_*() helpers scale it to ms. */
static int nda_default_timeout = NDA_DEFAULT_TIMEOUT;

/*
 * NVMe media never rotates, so one shared zero backs the per-device
 * "rotating" sysctl of every nda instance.
 */
static int nda_rotating_media = 0;
162
163static SYSCTL_NODE(_kern_cam, OID_AUTO, nda, CTLFLAG_RD, 0,
164            "CAM Direct Access Disk driver");
165
166static struct periph_driver ndadriver =
167{
168	ndainit, "nda",
169	TAILQ_HEAD_INITIALIZER(ndadriver.units), /* generation */ 0
170};
171
172PERIPHDRIVER_DECLARE(nda, ndadriver);
173
174static MALLOC_DEFINE(M_NVMEDA, "nvme_da", "nvme_da buffers");
175
176/*
177 * nice wrappers. Maybe these belong in nvme_all.c instead of
178 * here, but this is the only place that uses these. Should
179 * we ever grow another NVME periph, we should move them
180 * all there wholesale.
181 */
182
183static void
184nda_nvme_flush(struct nda_softc *softc, struct ccb_nvmeio *nvmeio)
185{
186	cam_fill_nvmeio(nvmeio,
187	    0,			/* retries */
188	    ndadone,		/* cbfcnp */
189	    CAM_DIR_NONE,	/* flags */
190	    NULL,		/* data_ptr */
191	    0,			/* dxfer_len */
192	    nda_default_timeout * 1000); /* timeout 30s */
193	nvme_ns_flush_cmd(&nvmeio->cmd, softc->nsid);
194}
195
196static void
197nda_nvme_trim(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
198    void *payload, uint32_t num_ranges)
199{
200	cam_fill_nvmeio(nvmeio,
201	    0,			/* retries */
202	    ndadone,		/* cbfcnp */
203	    CAM_DIR_OUT,	/* flags */
204	    payload,		/* data_ptr */
205	    num_ranges * sizeof(struct nvme_dsm_range), /* dxfer_len */
206	    nda_default_timeout * 1000); /* timeout 30s */
207	nvme_ns_trim_cmd(&nvmeio->cmd, softc->nsid, num_ranges);
208}
209
210static void
211nda_nvme_write(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
212    void *payload, uint64_t lba, uint32_t len, uint32_t count)
213{
214	cam_fill_nvmeio(nvmeio,
215	    0,			/* retries */
216	    ndadone,		/* cbfcnp */
217	    CAM_DIR_OUT,	/* flags */
218	    payload,		/* data_ptr */
219	    len,		/* dxfer_len */
220	    nda_default_timeout * 1000); /* timeout 30s */
221	nvme_ns_write_cmd(&nvmeio->cmd, softc->nsid, lba, count);
222}
223
224static void
225nda_nvme_rw_bio(struct nda_softc *softc, struct ccb_nvmeio *nvmeio,
226    struct bio *bp, uint32_t rwcmd)
227{
228	int flags = rwcmd == NVME_OPC_READ ? CAM_DIR_IN : CAM_DIR_OUT;
229	void *payload;
230	uint64_t lba;
231	uint32_t count;
232
233	if (bp->bio_flags & BIO_UNMAPPED) {
234		flags |= CAM_DATA_BIO;
235		payload = bp;
236	} else {
237		payload = bp->bio_data;
238	}
239
240	lba = bp->bio_pblkno;
241	count = bp->bio_bcount / softc->disk->d_sectorsize;
242
243	cam_fill_nvmeio(nvmeio,
244	    0,			/* retries */
245	    ndadone,		/* cbfcnp */
246	    flags,		/* flags */
247	    payload,		/* data_ptr */
248	    bp->bio_bcount,	/* dxfer_len */
249	    nda_default_timeout * 1000); /* timeout 30s */
250	nvme_ns_rw_cmd(&nvmeio->cmd, rwcmd, softc->nsid, lba, count);
251}
252
253static int
254ndaopen(struct disk *dp)
255{
256	struct cam_periph *periph;
257	struct nda_softc *softc;
258	int error;
259
260	periph = (struct cam_periph *)dp->d_drv1;
261	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
262		return(ENXIO);
263	}
264
265	cam_periph_lock(periph);
266	if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
267		cam_periph_unlock(periph);
268		cam_periph_release(periph);
269		return (error);
270	}
271
272	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
273	    ("ndaopen\n"));
274
275	softc = (struct nda_softc *)periph->softc;
276	softc->flags |= NDA_FLAG_OPEN;
277
278	cam_periph_unhold(periph);
279	cam_periph_unlock(periph);
280	return (0);
281}
282
283static int
284ndaclose(struct disk *dp)
285{
286	struct	cam_periph *periph;
287	struct	nda_softc *softc;
288	union ccb *ccb;
289	int error;
290
291	periph = (struct cam_periph *)dp->d_drv1;
292	softc = (struct nda_softc *)periph->softc;
293	cam_periph_lock(periph);
294
295	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
296	    ("ndaclose\n"));
297
298	if ((softc->flags & NDA_FLAG_DIRTY) != 0 &&
299	    (periph->flags & CAM_PERIPH_INVALID) == 0 &&
300	    cam_periph_hold(periph, PRIBIO) == 0) {
301
302		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
303		nda_nvme_flush(softc, &ccb->nvmeio);
304		error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0,
305		    /*sense_flags*/0, softc->disk->d_devstat);
306
307		if (error != 0)
308			xpt_print(periph->path, "Synchronize cache failed\n");
309		else
310			softc->flags &= ~NDA_FLAG_DIRTY;
311		xpt_release_ccb(ccb);
312		cam_periph_unhold(periph);
313	}
314
315	softc->flags &= ~NDA_FLAG_OPEN;
316
317	while (softc->refcount != 0)
318		cam_periph_sleep(periph, &softc->refcount, PRIBIO, "ndaclose", 1);
319	cam_periph_unlock(periph);
320	cam_periph_release(periph);
321	return (0);
322}
323
324static void
325ndaschedule(struct cam_periph *periph)
326{
327	struct nda_softc *softc = (struct nda_softc *)periph->softc;
328
329	if (softc->state != NDA_STATE_NORMAL)
330		return;
331
332	cam_iosched_schedule(softc->cam_iosched, periph);
333}
334
335/*
336 * Actually translate the requested transfer into one the physical driver
337 * can understand.  The transfer is described by a buf and will include
338 * only one physical transfer.
339 */
340static void
341ndastrategy(struct bio *bp)
342{
343	struct cam_periph *periph;
344	struct nda_softc *softc;
345
346	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
347	softc = (struct nda_softc *)periph->softc;
348
349	cam_periph_lock(periph);
350
351	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastrategy(%p)\n", bp));
352
353	/*
354	 * If the device has been made invalid, error out
355	 */
356	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
357		cam_periph_unlock(periph);
358		biofinish(bp, NULL, ENXIO);
359		return;
360	}
361
362	/*
363	 * Place it in the queue of disk activities for this disk
364	 */
365	cam_iosched_queue_work(softc->cam_iosched, bp);
366
367	/*
368	 * Schedule ourselves for performing the work.
369	 */
370	ndaschedule(periph);
371	cam_periph_unlock(periph);
372
373	return;
374}
375
376static int
377ndadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
378{
379	struct	    cam_periph *periph;
380	struct	    nda_softc *softc;
381	u_int	    secsize;
382	struct ccb_nvmeio nvmeio;
383	struct	    disk *dp;
384	uint64_t    lba;
385	uint32_t    count;
386	int	    error = 0;
387
388	dp = arg;
389	periph = dp->d_drv1;
390	softc = (struct nda_softc *)periph->softc;
391	cam_periph_lock(periph);
392	secsize = softc->disk->d_sectorsize;
393	lba = offset / secsize;
394	count = length / secsize;
395
396	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
397		cam_periph_unlock(periph);
398		return (ENXIO);
399	}
400
401	/* xpt_get_ccb returns a zero'd allocation for the ccb, mimic that here */
402	memset(&nvmeio, 0, sizeof(nvmeio));
403	if (length > 0) {
404		xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
405		nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP;
406		nda_nvme_write(softc, &nvmeio, virtual, lba, length, count);
407		xpt_polled_action((union ccb *)&nvmeio);
408
409		error = cam_periph_error((union ccb *)&nvmeio,
410		    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
411		if ((nvmeio.ccb_h.status & CAM_DEV_QFRZN) != 0)
412			cam_release_devq(nvmeio.ccb_h.path, /*relsim_flags*/0,
413			    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
414		if (error != 0)
415			printf("Aborting dump due to I/O error.\n");
416
417		cam_periph_unlock(periph);
418		return (error);
419	}
420
421	/* Flush */
422	xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
423
424	nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP;
425	nda_nvme_flush(softc, &nvmeio);
426	xpt_polled_action((union ccb *)&nvmeio);
427
428	error = cam_periph_error((union ccb *)&nvmeio,
429	    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
430	if ((nvmeio.ccb_h.status & CAM_DEV_QFRZN) != 0)
431		cam_release_devq(nvmeio.ccb_h.path, /*relsim_flags*/0,
432		    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
433	if (error != 0)
434		xpt_print(periph->path, "flush cmd failed\n");
435	cam_periph_unlock(periph);
436	return (error);
437}
438
439static void
440ndainit(void)
441{
442	cam_status status;
443
444	/*
445	 * Install a global async callback.  This callback will
446	 * receive async callbacks like "new device found".
447	 */
448	status = xpt_register_async(AC_FOUND_DEVICE, ndaasync, NULL, NULL);
449
450	if (status != CAM_REQ_CMP) {
451		printf("nda: Failed to attach master async callback "
452		       "due to status 0x%x!\n", status);
453	} else if (nda_send_ordered) {
454
455		/* Register our event handlers */
456		if ((EVENTHANDLER_REGISTER(power_suspend, ndasuspend,
457					   NULL, EVENTHANDLER_PRI_LAST)) == NULL)
458		    printf("ndainit: power event registration failed!\n");
459		if ((EVENTHANDLER_REGISTER(shutdown_post_sync, ndashutdown,
460					   NULL, SHUTDOWN_PRI_DEFAULT)) == NULL)
461		    printf("ndainit: shutdown event registration failed!\n");
462	}
463}
464
465/*
466 * Callback from GEOM, called when it has finished cleaning up its
467 * resources.
468 */
469static void
470ndadiskgonecb(struct disk *dp)
471{
472	struct cam_periph *periph;
473
474	periph = (struct cam_periph *)dp->d_drv1;
475
476	cam_periph_release(periph);
477}
478
479static void
480ndaoninvalidate(struct cam_periph *periph)
481{
482	struct nda_softc *softc;
483
484	softc = (struct nda_softc *)periph->softc;
485
486	/*
487	 * De-register any async callbacks.
488	 */
489	xpt_register_async(0, ndaasync, periph, periph->path);
490#ifdef CAM_IO_STATS
491	softc->invalidations++;
492#endif
493
494	/*
495	 * Return all queued I/O with ENXIO.
496	 * XXX Handle any transactions queued to the card
497	 *     with XPT_ABORT_CCB.
498	 */
499	cam_iosched_flush(softc->cam_iosched, NULL, ENXIO);
500
501	disk_gone(softc->disk);
502}
503
504static void
505ndacleanup(struct cam_periph *periph)
506{
507	struct nda_softc *softc;
508
509	softc = (struct nda_softc *)periph->softc;
510
511	cam_periph_unlock(periph);
512
513	cam_iosched_fini(softc->cam_iosched);
514
515	/*
516	 * If we can't free the sysctl tree, oh well...
517	 */
518	if ((softc->flags & NDA_FLAG_SCTX_INIT) != 0) {
519#ifdef CAM_IO_STATS
520		if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
521			xpt_print(periph->path,
522			    "can't remove sysctl stats context\n");
523#endif
524		if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
525			xpt_print(periph->path,
526			    "can't remove sysctl context\n");
527	}
528
529	disk_destroy(softc->disk);
530	free(softc, M_DEVBUF);
531	cam_periph_lock(periph);
532}
533
534static void
535ndaasync(void *callback_arg, u_int32_t code,
536	struct cam_path *path, void *arg)
537{
538	struct cam_periph *periph;
539
540	periph = (struct cam_periph *)callback_arg;
541	switch (code) {
542	case AC_FOUND_DEVICE:
543	{
544		struct ccb_getdev *cgd;
545		cam_status status;
546
547		cgd = (struct ccb_getdev *)arg;
548		if (cgd == NULL)
549			break;
550
551		if (cgd->protocol != PROTO_NVME)
552			break;
553
554		/*
555		 * Allocate a peripheral instance for
556		 * this device and start the probe
557		 * process.
558		 */
559		status = cam_periph_alloc(ndaregister, ndaoninvalidate,
560					  ndacleanup, ndastart,
561					  "nda", CAM_PERIPH_BIO,
562					  path, ndaasync,
563					  AC_FOUND_DEVICE, cgd);
564
565		if (status != CAM_REQ_CMP
566		 && status != CAM_REQ_INPROG)
567			printf("ndaasync: Unable to attach to new device "
568				"due to status 0x%x\n", status);
569		break;
570	}
571	case AC_ADVINFO_CHANGED:
572	{
573		uintptr_t buftype;
574
575		buftype = (uintptr_t)arg;
576		if (buftype == CDAI_TYPE_PHYS_PATH) {
577			struct nda_softc *softc;
578
579			softc = periph->softc;
580			disk_attr_changed(softc->disk, "GEOM::physpath",
581					  M_NOWAIT);
582		}
583		break;
584	}
585	case AC_LOST_DEVICE:
586	default:
587		cam_periph_async(periph, code, path, arg);
588		break;
589	}
590}
591
592static void
593ndasysctlinit(void *context, int pending)
594{
595	struct cam_periph *periph;
596	struct nda_softc *softc;
597	char tmpstr[32], tmpstr2[16];
598
599	periph = (struct cam_periph *)context;
600
601	/* periph was held for us when this task was enqueued */
602	if ((periph->flags & CAM_PERIPH_INVALID) != 0) {
603		cam_periph_release(periph);
604		return;
605	}
606
607	softc = (struct nda_softc *)periph->softc;
608	snprintf(tmpstr, sizeof(tmpstr), "CAM NDA unit %d", periph->unit_number);
609	snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
610
611	sysctl_ctx_init(&softc->sysctl_ctx);
612	softc->flags |= NDA_FLAG_SCTX_INIT;
613	softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx,
614		SYSCTL_STATIC_CHILDREN(_kern_cam_nda), OID_AUTO, tmpstr2,
615		CTLFLAG_RD, 0, tmpstr);
616	if (softc->sysctl_tree == NULL) {
617		printf("ndasysctlinit: unable to allocate sysctl tree\n");
618		cam_periph_release(periph);
619		return;
620	}
621
622	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
623		OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
624		&softc->unmappedio, 0, "Unmapped I/O leaf");
625
626	SYSCTL_ADD_INT(&softc->sysctl_ctx,
627		       SYSCTL_CHILDREN(softc->sysctl_tree),
628		       OID_AUTO,
629		       "rotating",
630		       CTLFLAG_RD | CTLFLAG_MPSAFE,
631		       &nda_rotating_media,
632		       0,
633		       "Rotating media");
634
635#ifdef CAM_IO_STATS
636	softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
637		SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
638		CTLFLAG_RD, 0, "Statistics");
639	if (softc->sysctl_stats_tree == NULL) {
640		printf("ndasysctlinit: unable to allocate sysctl tree for stats\n");
641		cam_periph_release(periph);
642		return;
643	}
644	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
645		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
646		OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
647		&softc->timeouts, 0,
648		"Device timeouts reported by the SIM");
649	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
650		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
651		OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
652		&softc->errors, 0,
653		"Transport errors reported by the SIM.");
654	SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
655		SYSCTL_CHILDREN(softc->sysctl_stats_tree),
656		OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
657		&softc->invalidations, 0,
658		"Device pack invalidations.");
659#endif
660
661	cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
662	    softc->sysctl_tree);
663
664	cam_periph_release(periph);
665}
666
667static int
668ndagetattr(struct bio *bp)
669{
670	int ret;
671	struct cam_periph *periph;
672
673	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
674	cam_periph_lock(periph);
675	ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
676	    periph->path);
677	cam_periph_unlock(periph);
678	if (ret == 0)
679		bp->bio_completed = bp->bio_length;
680	return ret;
681}
682
683static cam_status
684ndaregister(struct cam_periph *periph, void *arg)
685{
686	struct nda_softc *softc;
687	struct disk *disk;
688	struct ccb_pathinq cpi;
689	const struct nvme_namespace_data *nsd;
690	const struct nvme_controller_data *cd;
691	char   announce_buf[80];
692	u_int maxio;
693	int quirks;
694
695	nsd = nvme_get_identify_ns(periph);
696	cd = nvme_get_identify_cntrl(periph);
697
698	softc = (struct nda_softc *)malloc(sizeof(*softc), M_DEVBUF,
699	    M_NOWAIT | M_ZERO);
700
701	if (softc == NULL) {
702		printf("ndaregister: Unable to probe new device. "
703		    "Unable to allocate softc\n");
704		return(CAM_REQ_CMP_ERR);
705	}
706
707	if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
708		printf("ndaregister: Unable to probe new device. "
709		       "Unable to allocate iosched memory\n");
710		return(CAM_REQ_CMP_ERR);
711	}
712
713	/* ident_data parsing */
714
715	periph->softc = softc;
716
717	softc->quirks = NDA_Q_NONE;
718
719	xpt_path_inq(&cpi, periph->path);
720
721	TASK_INIT(&softc->sysctl_task, 0, ndasysctlinit, periph);
722
723	/*
724	 * The name space ID is the lun, save it for later I/O
725	 */
726	softc->nsid = (uint32_t)xpt_path_lun_id(periph->path);
727
728	/*
729	 * Register this media as a disk
730	 */
731	(void)cam_periph_hold(periph, PRIBIO);
732	cam_periph_unlock(periph);
733	snprintf(announce_buf, sizeof(announce_buf),
734	    "kern.cam.nda.%d.quirks", periph->unit_number);
735	quirks = softc->quirks;
736	TUNABLE_INT_FETCH(announce_buf, &quirks);
737	softc->quirks = quirks;
738	cam_iosched_set_sort_queue(softc->cam_iosched, 0);
739	softc->disk = disk = disk_alloc();
740	strlcpy(softc->disk->d_descr, cd->mn,
741	    MIN(sizeof(softc->disk->d_descr), sizeof(cd->mn)));
742	strlcpy(softc->disk->d_ident, cd->sn,
743	    MIN(sizeof(softc->disk->d_ident), sizeof(cd->sn)));
744	disk->d_rotation_rate = DISK_RR_NON_ROTATING;
745	disk->d_open = ndaopen;
746	disk->d_close = ndaclose;
747	disk->d_strategy = ndastrategy;
748	disk->d_getattr = ndagetattr;
749	disk->d_dump = ndadump;
750	disk->d_gone = ndadiskgonecb;
751	disk->d_name = "nda";
752	disk->d_drv1 = periph;
753	disk->d_unit = periph->unit_number;
754	maxio = cpi.maxio;		/* Honor max I/O size of SIM */
755	if (maxio == 0)
756		maxio = DFLTPHYS;	/* traditional default */
757	else if (maxio > MAXPHYS)
758		maxio = MAXPHYS;	/* for safety */
759	disk->d_maxsize = maxio;
760	disk->d_sectorsize = 1 << nsd->lbaf[nsd->flbas.format].lbads;
761	disk->d_mediasize = (off_t)(disk->d_sectorsize * nsd->nsze);
762	disk->d_delmaxsize = disk->d_mediasize;
763	disk->d_flags = DISKFLAG_DIRECT_COMPLETION;
764//	if (cd->oncs.dsm) // XXX broken?
765		disk->d_flags |= DISKFLAG_CANDELETE;
766	if (cd->vwc.present)
767		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
768	if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
769		disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
770		softc->unmappedio = 1;
771	}
772	/*
773	 * d_ident and d_descr are both far bigger than the length of either
774	 *  the serial or model number strings.
775	 */
776	nvme_strvis(disk->d_descr, cd->mn,
777	    sizeof(disk->d_descr), NVME_MODEL_NUMBER_LENGTH);
778	nvme_strvis(disk->d_ident, cd->sn,
779	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
780	disk->d_hba_vendor = cpi.hba_vendor;
781	disk->d_hba_device = cpi.hba_device;
782	disk->d_hba_subvendor = cpi.hba_subvendor;
783	disk->d_hba_subdevice = cpi.hba_subdevice;
784	disk->d_stripesize = disk->d_sectorsize;
785	disk->d_stripeoffset = 0;
786	disk->d_devstat = devstat_new_entry(periph->periph_name,
787	    periph->unit_number, disk->d_sectorsize,
788	    DEVSTAT_ALL_SUPPORTED,
789	    DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport),
790	    DEVSTAT_PRIORITY_DISK);
791
792	/*
793	 * Acquire a reference to the periph before we register with GEOM.
794	 * We'll release this reference once GEOM calls us back (via
795	 * ndadiskgonecb()) telling us that our provider has been freed.
796	 */
797	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
798		xpt_print(periph->path, "%s: lost periph during "
799			  "registration!\n", __func__);
800		cam_periph_lock(periph);
801		return (CAM_REQ_CMP_ERR);
802	}
803	disk_create(softc->disk, DISK_VERSION);
804	cam_periph_lock(periph);
805	cam_periph_unhold(periph);
806
807	snprintf(announce_buf, sizeof(announce_buf),
808		"%juMB (%ju %u byte sectors)",
809	    (uintmax_t)((uintmax_t)disk->d_mediasize / (1024*1024)),
810		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
811		disk->d_sectorsize);
812	xpt_announce_periph(periph, announce_buf);
813	xpt_announce_quirks(periph, softc->quirks, NDA_Q_BIT_STRING);
814
815	/*
816	 * Create our sysctl variables, now that we know
817	 * we have successfully attached.
818	 */
819	if (cam_periph_acquire(periph) == CAM_REQ_CMP)
820		taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task);
821
822	/*
823	 * Register for device going away and info about the drive
824	 * changing (though with NVMe, it can't)
825	 */
826	xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
827	    ndaasync, periph, periph->path);
828
829	softc->state = NDA_STATE_NORMAL;
830	return(CAM_REQ_CMP);
831}
832
833static void
834ndastart(struct cam_periph *periph, union ccb *start_ccb)
835{
836	struct nda_softc *softc = (struct nda_softc *)periph->softc;
837	struct ccb_nvmeio *nvmeio = &start_ccb->nvmeio;
838
839	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart\n"));
840
841	switch (softc->state) {
842	case NDA_STATE_NORMAL:
843	{
844		struct bio *bp;
845
846		bp = cam_iosched_next_bio(softc->cam_iosched);
847		CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart: bio %p\n", bp));
848		if (bp == NULL) {
849			xpt_release_ccb(start_ccb);
850			break;
851		}
852
853		switch (bp->bio_cmd) {
854		case BIO_WRITE:
855			softc->flags |= NDA_FLAG_DIRTY;
856			/* FALLTHROUGH */
857		case BIO_READ:
858		{
859#ifdef NDA_TEST_FAILURE
860			int fail = 0;
861
862			/*
863			 * Support the failure ioctls.  If the command is a
864			 * read, and there are pending forced read errors, or
865			 * if a write and pending write errors, then fail this
866			 * operation with EIO.  This is useful for testing
867			 * purposes.  Also, support having every Nth read fail.
868			 *
869			 * This is a rather blunt tool.
870			 */
871			if (bp->bio_cmd == BIO_READ) {
872				if (softc->force_read_error) {
873					softc->force_read_error--;
874					fail = 1;
875				}
876				if (softc->periodic_read_error > 0) {
877					if (++softc->periodic_read_count >=
878					    softc->periodic_read_error) {
879						softc->periodic_read_count = 0;
880						fail = 1;
881					}
882				}
883			} else {
884				if (softc->force_write_error) {
885					softc->force_write_error--;
886					fail = 1;
887				}
888			}
889			if (fail) {
890				biofinish(bp, NULL, EIO);
891				xpt_release_ccb(start_ccb);
892				ndaschedule(periph);
893				return;
894			}
895#endif
896			KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 ||
897			    round_page(bp->bio_bcount + bp->bio_ma_offset) /
898			    PAGE_SIZE == bp->bio_ma_n,
899			    ("Short bio %p", bp));
900			nda_nvme_rw_bio(softc, &start_ccb->nvmeio, bp, bp->bio_cmd == BIO_READ ?
901			    NVME_OPC_READ : NVME_OPC_WRITE);
902			break;
903		}
904		case BIO_DELETE:
905		{
906			struct nvme_dsm_range *dsm_range;
907
908			dsm_range =
909			    malloc(sizeof(*dsm_range), M_NVMEDA, M_ZERO | M_WAITOK);
910			dsm_range->length =
911			    bp->bio_bcount / softc->disk->d_sectorsize;
912			dsm_range->starting_lba =
913			    bp->bio_offset / softc->disk->d_sectorsize;
914			bp->bio_driver2 = dsm_range;
915			nda_nvme_trim(softc, &start_ccb->nvmeio, dsm_range, 1);
916			start_ccb->ccb_h.ccb_state = NDA_CCB_TRIM;
917			start_ccb->ccb_h.flags |= CAM_UNLOCKED;
918			/*
919			 * Note: We can have multiple TRIMs in flight, so we don't call
920			 * cam_iosched_submit_trim(softc->cam_iosched);
921			 * since that forces the I/O scheduler to only schedule one at a time.
922			 * On NVMe drives, this is a performance disaster.
923			 */
924			goto out;
925		}
926		case BIO_FLUSH:
927			nda_nvme_flush(softc, nvmeio);
928			break;
929		}
930		start_ccb->ccb_h.ccb_state = NDA_CCB_BUFFER_IO;
931		start_ccb->ccb_h.flags |= CAM_UNLOCKED;
932out:
933		start_ccb->ccb_h.ccb_bp = bp;
934		softc->outstanding_cmds++;
935		softc->refcount++;
936		cam_periph_unlock(periph);
937		xpt_action(start_ccb);
938		cam_periph_lock(periph);
939		softc->refcount--;
940
941		/* May have more work to do, so ensure we stay scheduled */
942		ndaschedule(periph);
943		break;
944		}
945	}
946}
947
948static void
949ndadone(struct cam_periph *periph, union ccb *done_ccb)
950{
951	struct nda_softc *softc;
952	struct ccb_nvmeio *nvmeio = &done_ccb->nvmeio;
953	struct cam_path *path;
954	int state;
955
956	softc = (struct nda_softc *)periph->softc;
957	path = done_ccb->ccb_h.path;
958
959	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n"));
960
961	state = nvmeio->ccb_h.ccb_state & NDA_CCB_TYPE_MASK;
962	switch (state) {
963	case NDA_CCB_BUFFER_IO:
964	case NDA_CCB_TRIM:
965	{
966		struct bio *bp;
967		int error;
968
969		cam_periph_lock(periph);
970		bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
971		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
972			error = ndaerror(done_ccb, 0, 0);
973			if (error == ERESTART) {
974				/* A retry was scheduled, so just return. */
975				cam_periph_unlock(periph);
976				return;
977			}
978			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
979				cam_release_devq(path,
980						 /*relsim_flags*/0,
981						 /*reduction*/0,
982						 /*timeout*/0,
983						 /*getcount_only*/0);
984		} else {
985			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
986				panic("REQ_CMP with QFRZN");
987			error = 0;
988		}
989		bp->bio_error = error;
990		if (error != 0) {
991			bp->bio_resid = bp->bio_bcount;
992			bp->bio_flags |= BIO_ERROR;
993		} else {
994			bp->bio_resid = 0;
995		}
996		if (state == NDA_CCB_TRIM)
997			free(bp->bio_driver2, M_NVMEDA);
998		softc->outstanding_cmds--;
999
1000		cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
1001		xpt_release_ccb(done_ccb);
1002		if (state == NDA_CCB_TRIM) {
1003#ifdef notyet
1004			TAILQ_HEAD(, bio) queue;
1005			struct bio *bp1;
1006
1007			TAILQ_INIT(&queue);
1008			TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue);
1009#endif
1010			/*
1011			 * Since we can have multiple trims in flight, we don't
1012			 * need to call this here.
1013			 * cam_iosched_trim_done(softc->cam_iosched);
1014			 */
1015			ndaschedule(periph);
1016			cam_periph_unlock(periph);
1017#ifdef notyet
1018/* Not yet collapsing several BIO_DELETE requests into one TRIM */
1019			while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
1020				TAILQ_REMOVE(&queue, bp1, bio_queue);
1021				bp1->bio_error = error;
1022				if (error != 0) {
1023					bp1->bio_flags |= BIO_ERROR;
1024					bp1->bio_resid = bp1->bio_bcount;
1025				} else
1026					bp1->bio_resid = 0;
1027				biodone(bp1);
1028			}
1029#else
1030			biodone(bp);
1031#endif
1032		} else {
1033			ndaschedule(periph);
1034			cam_periph_unlock(periph);
1035			biodone(bp);
1036		}
1037		return;
1038	}
1039	case NDA_CCB_DUMP:
1040		/* No-op.  We're polling */
1041		return;
1042	default:
1043		break;
1044	}
1045	xpt_release_ccb(done_ccb);
1046}
1047
1048static int
1049ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
1050{
1051	struct nda_softc *softc;
1052	struct cam_periph *periph;
1053
1054	periph = xpt_path_periph(ccb->ccb_h.path);
1055	softc = (struct nda_softc *)periph->softc;
1056
1057	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
1058	case CAM_CMD_TIMEOUT:
1059#ifdef CAM_IO_STATS
1060		softc->timeouts++;
1061#endif
1062		break;
1063	case CAM_REQ_ABORTED:
1064	case CAM_REQ_CMP_ERR:
1065	case CAM_REQ_TERMIO:
1066	case CAM_UNREC_HBA_ERROR:
1067	case CAM_DATA_RUN_ERR:
1068	case CAM_ATA_STATUS_ERROR:
1069#ifdef CAM_IO_STATS
1070		softc->errors++;
1071#endif
1072		break;
1073	default:
1074		break;
1075	}
1076
1077	return(cam_periph_error(ccb, cam_flags, sense_flags, NULL));
1078}
1079
1080/*
1081 * Step through all NDA peripheral drivers, and if the device is still open,
1082 * sync the disk cache to physical media.
1083 */
1084static void
1085ndaflush(void)
1086{
1087	struct cam_periph *periph;
1088	struct nda_softc *softc;
1089	union ccb *ccb;
1090	int error;
1091
1092	CAM_PERIPH_FOREACH(periph, &ndadriver) {
1093		softc = (struct nda_softc *)periph->softc;
1094		if (SCHEDULER_STOPPED()) {
1095			/* If we paniced with the lock held, do not recurse. */
1096			if (!cam_periph_owned(periph) &&
1097			    (softc->flags & NDA_FLAG_OPEN)) {
1098				ndadump(softc->disk, NULL, 0, 0, 0);
1099			}
1100			continue;
1101		}
1102		cam_periph_lock(periph);
1103		/*
1104		 * We only sync the cache if the drive is still open, and
1105		 * if the drive is capable of it..
1106		 */
1107		if ((softc->flags & NDA_FLAG_OPEN) == 0) {
1108			cam_periph_unlock(periph);
1109			continue;
1110		}
1111
1112		ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
1113		nda_nvme_flush(softc, &ccb->nvmeio);
1114		error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0,
1115		    /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY,
1116		    softc->disk->d_devstat);
1117		if (error != 0)
1118			xpt_print(periph->path, "Synchronize cache failed\n");
1119		xpt_release_ccb(ccb);
1120		cam_periph_unlock(periph);
1121	}
1122}
1123
/*
 * shutdown_post_sync event handler (registered in ndainit()): flush all
 * open nda devices.  Neither argument is used.
 */
static void
ndashutdown(void *arg, int howto)
{
	(void)arg;
	(void)howto;

	ndaflush();
}
1130
/*
 * power_suspend event handler (registered in ndainit()): flush all open
 * nda devices.  The argument is not used.
 */
static void
ndasuspend(void *arg)
{
	(void)arg;

	ndaflush();
}
1137