1/*	$NetBSD: nvme.c,v 1.69 2024/03/11 21:10:46 riastradh Exp $	*/
2/*	$OpenBSD: nvme.c,v 1.49 2016/04/18 05:59:50 dlg Exp $ */
3
4/*
5 * Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20#include <sys/cdefs.h>
21__KERNEL_RCSID(0, "$NetBSD: nvme.c,v 1.69 2024/03/11 21:10:46 riastradh Exp $");
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/kernel.h>
26#include <sys/atomic.h>
27#include <sys/bus.h>
28#include <sys/buf.h>
29#include <sys/conf.h>
30#include <sys/device.h>
31#include <sys/kmem.h>
32#include <sys/once.h>
33#include <sys/proc.h>
34#include <sys/queue.h>
35#include <sys/mutex.h>
36
37#include <uvm/uvm_extern.h>
38
39#include <dev/ic/nvmereg.h>
40#include <dev/ic/nvmevar.h>
41#include <dev/ic/nvmeio.h>
42
43#include "ioconf.h"
44#include "locators.h"
45
46#define	B4_CHK_RDY_DELAY_MS	2300	/* work around controller bug */
47
48int nvme_adminq_size = 32;
49int nvme_ioq_size = 1024;
50
51static int	nvme_print(void *, const char *);
52
53static int	nvme_ready(struct nvme_softc *, uint32_t);
54static int	nvme_enable(struct nvme_softc *, u_int);
55static int	nvme_disable(struct nvme_softc *);
56static int	nvme_shutdown(struct nvme_softc *);
57
58uint32_t	nvme_op_sq_enter(struct nvme_softc *,
59		    struct nvme_queue *, struct nvme_ccb *);
60void		nvme_op_sq_leave(struct nvme_softc *,
61		    struct nvme_queue *, struct nvme_ccb *);
62uint32_t	nvme_op_sq_enter_locked(struct nvme_softc *,
63		    struct nvme_queue *, struct nvme_ccb *);
64void		nvme_op_sq_leave_locked(struct nvme_softc *,
65		    struct nvme_queue *, struct nvme_ccb *);
66
67void		nvme_op_cq_done(struct nvme_softc *,
68		    struct nvme_queue *, struct nvme_ccb *);
69
70static const struct nvme_ops nvme_ops = {
71	.op_sq_enter		= nvme_op_sq_enter,
72	.op_sq_leave		= nvme_op_sq_leave,
73	.op_sq_enter_locked	= nvme_op_sq_enter_locked,
74	.op_sq_leave_locked	= nvme_op_sq_leave_locked,
75
76	.op_cq_done		= nvme_op_cq_done,
77};
78
79#ifdef NVME_DEBUG
80static void	nvme_dumpregs(struct nvme_softc *);
81#endif
82static int	nvme_identify(struct nvme_softc *, u_int);
83static void	nvme_fill_identify(struct nvme_queue *, struct nvme_ccb *,
84		    void *);
85
86static int	nvme_ccbs_alloc(struct nvme_queue *, uint16_t);
87static void	nvme_ccbs_free(struct nvme_queue *);
88
89static struct nvme_ccb *
90		nvme_ccb_get(struct nvme_queue *, bool);
91static struct nvme_ccb *
92		nvme_ccb_get_bio(struct nvme_softc *, struct buf *,
93		    struct nvme_queue **);
94static void	nvme_ccb_put(struct nvme_queue *, struct nvme_ccb *);
95
96static int	nvme_poll(struct nvme_softc *, struct nvme_queue *,
97		    struct nvme_ccb *, void (*)(struct nvme_queue *,
98		    struct nvme_ccb *, void *), int);
99static void	nvme_poll_fill(struct nvme_queue *, struct nvme_ccb *, void *);
100static void	nvme_poll_done(struct nvme_queue *, struct nvme_ccb *,
101		    struct nvme_cqe *);
102static void	nvme_sqe_fill(struct nvme_queue *, struct nvme_ccb *, void *);
103static void	nvme_empty_done(struct nvme_queue *, struct nvme_ccb *,
104		    struct nvme_cqe *);
105
106static struct nvme_queue *
107		nvme_q_alloc(struct nvme_softc *, uint16_t, u_int, u_int);
108static int	nvme_q_create(struct nvme_softc *, struct nvme_queue *);
109static void	nvme_q_reset(struct nvme_softc *, struct nvme_queue *);
110static int	nvme_q_delete(struct nvme_softc *, struct nvme_queue *);
111static void	nvme_q_submit(struct nvme_softc *, struct nvme_queue *,
112		    struct nvme_ccb *, void (*)(struct nvme_queue *,
113		    struct nvme_ccb *, void *));
114static int	nvme_q_complete(struct nvme_softc *, struct nvme_queue *q);
115static void	nvme_q_free(struct nvme_softc *, struct nvme_queue *);
116static void	nvme_q_wait_complete(struct nvme_softc *, struct nvme_queue *,
117		    bool (*)(void *), void *);
118
119static void	nvme_ns_io_fill(struct nvme_queue *, struct nvme_ccb *,
120		    void *);
121static void	nvme_ns_io_done(struct nvme_queue *, struct nvme_ccb *,
122		    struct nvme_cqe *);
123static void	nvme_ns_sync_fill(struct nvme_queue *, struct nvme_ccb *,
124		    void *);
125static void	nvme_ns_sync_done(struct nvme_queue *, struct nvme_ccb *,
126		    struct nvme_cqe *);
127static void	nvme_getcache_fill(struct nvme_queue *, struct nvme_ccb *,
128		    void *);
129static void	nvme_getcache_done(struct nvme_queue *, struct nvme_ccb *,
130		    struct nvme_cqe *);
131
132static void	nvme_pt_fill(struct nvme_queue *, struct nvme_ccb *,
133		    void *);
134static void	nvme_pt_done(struct nvme_queue *, struct nvme_ccb *,
135		    struct nvme_cqe *);
136static int	nvme_command_passthrough(struct nvme_softc *,
137		    struct nvme_pt_command *, uint32_t, struct lwp *, bool);
138
139static int	nvme_set_number_of_queues(struct nvme_softc *, u_int, u_int *,
140		    u_int *);
141
142#define NVME_TIMO_QOP		5	/* queue create and delete timeout */
143#define NVME_TIMO_IDENT		10	/* probe identify timeout */
144#define NVME_TIMO_PT		-1	/* passthrough cmd timeout */
145#define NVME_TIMO_SY		60	/* sync cache timeout */
146
147/*
148 * Some controllers, at least Apple NVMe, always require split
149 * transfers, so don't use bus_space_{read,write}_8() on LP64.
150 */
151uint64_t
152nvme_read8(struct nvme_softc *sc, bus_size_t r)
153{
154	uint64_t v;
155	uint32_t *a = (uint32_t *)&v;
156
157#if _BYTE_ORDER == _LITTLE_ENDIAN
158	a[0] = nvme_read4(sc, r);
159	a[1] = nvme_read4(sc, r + 4);
160#else /* _BYTE_ORDER == _LITTLE_ENDIAN */
161	a[1] = nvme_read4(sc, r);
162	a[0] = nvme_read4(sc, r + 4);
163#endif
164
165	return v;
166}
167
168void
169nvme_write8(struct nvme_softc *sc, bus_size_t r, uint64_t v)
170{
171	uint32_t *a = (uint32_t *)&v;
172
173#if _BYTE_ORDER == _LITTLE_ENDIAN
174	nvme_write4(sc, r, a[0]);
175	nvme_write4(sc, r + 4, a[1]);
176#else /* _BYTE_ORDER == _LITTLE_ENDIAN */
177	nvme_write4(sc, r, a[1]);
178	nvme_write4(sc, r + 4, a[0]);
179#endif
180}
181
182#ifdef NVME_DEBUG
183static __used void
184nvme_dumpregs(struct nvme_softc *sc)
185{
186	uint64_t r8;
187	uint32_t r4;
188
189#define	DEVNAME(_sc) device_xname((_sc)->sc_dev)
190	r8 = nvme_read8(sc, NVME_CAP);
191	printf("%s: cap  0x%016"PRIx64"\n", DEVNAME(sc), nvme_read8(sc, NVME_CAP));
192	printf("%s:  mpsmax %u (%u)\n", DEVNAME(sc),
193	    (u_int)NVME_CAP_MPSMAX(r8), (1 << NVME_CAP_MPSMAX(r8)));
194	printf("%s:  mpsmin %u (%u)\n", DEVNAME(sc),
195	    (u_int)NVME_CAP_MPSMIN(r8), (1 << NVME_CAP_MPSMIN(r8)));
196	printf("%s:  css %"PRIu64"\n", DEVNAME(sc), NVME_CAP_CSS(r8));
197	printf("%s:  nssrs %"PRIu64"\n", DEVNAME(sc), NVME_CAP_NSSRS(r8));
198	printf("%s:  dstrd %"PRIu64"\n", DEVNAME(sc), NVME_CAP_DSTRD(r8));
199	printf("%s:  to %"PRIu64" msec\n", DEVNAME(sc), NVME_CAP_TO(r8));
200	printf("%s:  ams %"PRIu64"\n", DEVNAME(sc), NVME_CAP_AMS(r8));
201	printf("%s:  cqr %"PRIu64"\n", DEVNAME(sc), NVME_CAP_CQR(r8));
202	printf("%s:  mqes %"PRIu64"\n", DEVNAME(sc), NVME_CAP_MQES(r8));
203
204	printf("%s: vs   0x%04x\n", DEVNAME(sc), nvme_read4(sc, NVME_VS));
205
206	r4 = nvme_read4(sc, NVME_CC);
207	printf("%s: cc   0x%04x\n", DEVNAME(sc), r4);
208	printf("%s:  iocqes %u (%u)\n", DEVNAME(sc), NVME_CC_IOCQES_R(r4),
209	    (1 << NVME_CC_IOCQES_R(r4)));
210	printf("%s:  iosqes %u (%u)\n", DEVNAME(sc), NVME_CC_IOSQES_R(r4),
211	    (1 << NVME_CC_IOSQES_R(r4)));
212	printf("%s:  shn %u\n", DEVNAME(sc), NVME_CC_SHN_R(r4));
213	printf("%s:  ams %u\n", DEVNAME(sc), NVME_CC_AMS_R(r4));
214	printf("%s:  mps %u (%u)\n", DEVNAME(sc), NVME_CC_MPS_R(r4),
215	    (1 << NVME_CC_MPS_R(r4)));
216	printf("%s:  css %u\n", DEVNAME(sc), NVME_CC_CSS_R(r4));
217	printf("%s:  en %u\n", DEVNAME(sc), ISSET(r4, NVME_CC_EN) ? 1 : 0);
218
219	r4 = nvme_read4(sc, NVME_CSTS);
220	printf("%s: csts 0x%08x\n", DEVNAME(sc), r4);
221	printf("%s:  rdy %u\n", DEVNAME(sc), r4 & NVME_CSTS_RDY);
222	printf("%s:  cfs %u\n", DEVNAME(sc), r4 & NVME_CSTS_CFS);
223	printf("%s:  shst %x\n", DEVNAME(sc), r4 & NVME_CSTS_SHST_MASK);
224
225	r4 = nvme_read4(sc, NVME_AQA);
226	printf("%s: aqa  0x%08x\n", DEVNAME(sc), r4);
227	printf("%s:  acqs %u\n", DEVNAME(sc), NVME_AQA_ACQS_R(r4));
228	printf("%s:  asqs %u\n", DEVNAME(sc), NVME_AQA_ASQS_R(r4));
229
230	printf("%s: asq  0x%016"PRIx64"\n", DEVNAME(sc), nvme_read8(sc, NVME_ASQ));
231	printf("%s: acq  0x%016"PRIx64"\n", DEVNAME(sc), nvme_read8(sc, NVME_ACQ));
232#undef	DEVNAME
233}
234#endif	/* NVME_DEBUG */
235
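/*
 * Wait for CSTS.RDY to reach the requested value, polling once per
 * millisecond for up to sc_rdy_to iterations (derived from CAP.TO at
 * attach time).  Returns ENXIO on timeout.
 */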
236static int
237nvme_ready(struct nvme_softc *sc, uint32_t rdy)
238{
239	u_int i = 0;
240
241	while ((nvme_read4(sc, NVME_CSTS) & NVME_CSTS_RDY) != rdy) {
242		if (i++ > sc->sc_rdy_to)
243			return ENXIO;
244
245		delay(1000);
246		nvme_barrier(sc, NVME_CSTS, 4, BUS_SPACE_BARRIER_READ);
247	}
248
249	return 0;
250}
251
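/*
 * Bring the controller out of reset: program the admin submission and
 * completion queue base addresses and sizes, configure CC (queue entry
 * sizes, page size, arbitration, command set), then set CC.EN and wait
 * for CSTS.RDY to assert.
 */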
252static int
253nvme_enable(struct nvme_softc *sc, u_int mps)
254{
255	uint32_t cc, csts;
256	int error;
257
258	cc = nvme_read4(sc, NVME_CC);
259	csts = nvme_read4(sc, NVME_CSTS);
260
261	/*
262	 * See note in nvme_disable. Short circuit if we're already enabled.
263	 */
264	if (ISSET(cc, NVME_CC_EN)) {
265		if (ISSET(csts, NVME_CSTS_RDY))
266			return 0;
267
268		goto waitready;
269	} else {
270		/* EN == 0 already; wait for RDY == 0 or fail */
271		error = nvme_ready(sc, 0);
272		if (error)
273			return error;
274	}
275
276	if (sc->sc_ops->op_enable != NULL)
277		sc->sc_ops->op_enable(sc);
278
279	nvme_write8(sc, NVME_ASQ, NVME_DMA_DVA(sc->sc_admin_q->q_sq_dmamem));
280	nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);
281	delay(5000);
282	nvme_write8(sc, NVME_ACQ, NVME_DMA_DVA(sc->sc_admin_q->q_cq_dmamem));
283	nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);
284	delay(5000);
285
286	nvme_write4(sc, NVME_AQA, NVME_AQA_ACQS(sc->sc_admin_q->q_entries) |
287	    NVME_AQA_ASQS(sc->sc_admin_q->q_entries));
288	nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);
289	delay(5000);
290
291	CLR(cc, NVME_CC_IOCQES_MASK | NVME_CC_IOSQES_MASK | NVME_CC_SHN_MASK |
292	    NVME_CC_AMS_MASK | NVME_CC_MPS_MASK | NVME_CC_CSS_MASK);
293	SET(cc, NVME_CC_IOSQES(ffs(64) - 1) | NVME_CC_IOCQES(ffs(16) - 1));
294	SET(cc, NVME_CC_SHN(NVME_CC_SHN_NONE));
295	SET(cc, NVME_CC_CSS(NVME_CC_CSS_NVM));
296	SET(cc, NVME_CC_AMS(NVME_CC_AMS_RR));
297	SET(cc, NVME_CC_MPS(mps));
298	SET(cc, NVME_CC_EN);
299
300	nvme_write4(sc, NVME_CC, cc);
301	nvme_barrier(sc, 0, sc->sc_ios,
302	    BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);
303
304    waitready:
305	return nvme_ready(sc, NVME_CSTS_RDY);
306}
307
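/*
 * Take the controller down: wait for any pending enable to settle,
 * clear CC.EN and wait for CSTS.RDY to deassert, honouring the
 * pre-check-ready delay quirk where required.
 */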
308static int
309nvme_disable(struct nvme_softc *sc)
310{
311	uint32_t cc, csts;
312	int error;
313
314	cc = nvme_read4(sc, NVME_CC);
315	csts = nvme_read4(sc, NVME_CSTS);
316
317	/*
318	 * Per 3.1.5 of the NVMe 1.3 spec, transitioning CC.EN from 0 to 1
319	 * while CSTS.RDY is 1, or from 1 to 0 while CSTS.RDY is 0, "has
320	 * undefined results".  So wait until CSTS.RDY reflects the current
321	 * CC.EN before toggling it.  Short circuit if we're already disabled.
322	 */
323	if (ISSET(cc, NVME_CC_EN)) {
324		if (!ISSET(csts, NVME_CSTS_RDY)) {
325			/* EN == 1, wait for RDY == 1 or fail */
326			error = nvme_ready(sc, NVME_CSTS_RDY);
327			if (error)
328				return error;
329		}
330	} else {
331		/* EN == 0 already; wait for RDY == 0 */
332		if (!ISSET(csts, NVME_CSTS_RDY))
333			return 0;
334
335		goto waitready;
336	}
337
338	CLR(cc, NVME_CC_EN);
339	nvme_write4(sc, NVME_CC, cc);
340	nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_READ);
341
342	/*
343	 * Some drives have issues with accessing the mmio after we disable,
344	 * so delay for a bit after we write the bit to cope with these issues.
345	 */
346	if (ISSET(sc->sc_quirks, NVME_QUIRK_DELAY_B4_CHK_RDY))
347		delay(B4_CHK_RDY_DELAY_MS);
348
349    waitready:
350	return nvme_ready(sc, 0);
351}
352
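/*
 * Common attach path shared by the bus front-ends: validate the register
 * mapping, allocate and enable the admin queue, identify the controller,
 * create the I/O queue pairs and attach the namespace children.
 */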
353int
354nvme_attach(struct nvme_softc *sc)
355{
356	uint64_t cap;
357	uint32_t reg;
358	u_int mps = PAGE_SHIFT;
359	u_int ncq, nsq;
360	uint16_t adminq_entries = nvme_adminq_size;
361	uint16_t ioq_entries = nvme_ioq_size;
362	int i;
363
364	if (sc->sc_ops == NULL)
365		sc->sc_ops = &nvme_ops;
366
367	reg = nvme_read4(sc, NVME_VS);
368	if (reg == 0xffffffff) {
369		aprint_error_dev(sc->sc_dev, "invalid mapping\n");
370		return 1;
371	}
372
373	if (NVME_VS_TER(reg) == 0)
374		aprint_normal_dev(sc->sc_dev, "NVMe %d.%d\n", NVME_VS_MJR(reg),
375		    NVME_VS_MNR(reg));
376	else
377		aprint_normal_dev(sc->sc_dev, "NVMe %d.%d.%d\n", NVME_VS_MJR(reg),
378		    NVME_VS_MNR(reg), NVME_VS_TER(reg));
379
380	cap = nvme_read8(sc, NVME_CAP);
381	sc->sc_dstrd = NVME_CAP_DSTRD(cap);
382	if (NVME_CAP_MPSMIN(cap) > PAGE_SHIFT) {
383		aprint_error_dev(sc->sc_dev, "NVMe minimum page size %u "
384		    "is greater than CPU page size %u\n",
385		    1 << NVME_CAP_MPSMIN(cap), 1 << PAGE_SHIFT);
386		return 1;
387	}
388	if (NVME_CAP_MPSMAX(cap) < mps)
389		mps = NVME_CAP_MPSMAX(cap);
390	if (ioq_entries > NVME_CAP_MQES(cap))
391		ioq_entries = NVME_CAP_MQES(cap);
392
393	/* set initial values to be used for admin queue during probe */
394	sc->sc_rdy_to = NVME_CAP_TO(cap);
395	sc->sc_mps = 1 << mps;
396	sc->sc_mdts = MAXPHYS;
397	sc->sc_max_sgl = btoc(round_page(sc->sc_mdts));
398
399	if (nvme_disable(sc) != 0) {
400		aprint_error_dev(sc->sc_dev, "unable to disable controller\n");
401		return 1;
402	}
403
404	sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, adminq_entries,
405	    sc->sc_dstrd);
406	if (sc->sc_admin_q == NULL) {
407		aprint_error_dev(sc->sc_dev,
408		    "unable to allocate admin queue\n");
409		return 1;
410	}
411	if (sc->sc_intr_establish(sc, NVME_ADMIN_Q, sc->sc_admin_q))
412		goto free_admin_q;
413
414	if (nvme_enable(sc, mps) != 0) {
415		aprint_error_dev(sc->sc_dev, "unable to enable controller\n");
416		goto disestablish_admin_q;
417	}
418
419	if (nvme_identify(sc, NVME_CAP_MPSMIN(cap)) != 0) {
420		aprint_error_dev(sc->sc_dev, "unable to identify controller\n");
421		goto disable;
422	}
423	if (sc->sc_nn == 0) {
424		aprint_error_dev(sc->sc_dev, "namespace not found\n");
425		goto disable;
426	}
427
428	/* we know how big things are now */
429	sc->sc_max_sgl = sc->sc_mdts / sc->sc_mps;
430
431	/* reallocate ccbs of admin queue with new max sgl. */
432	nvme_ccbs_free(sc->sc_admin_q);
433	nvme_ccbs_alloc(sc->sc_admin_q, sc->sc_admin_q->q_entries);
434
435	if (sc->sc_use_mq) {
436		/* Limit the number of queues to the number allocated in HW */
437		if (nvme_set_number_of_queues(sc, sc->sc_nq, &ncq, &nsq) != 0) {
438			aprint_error_dev(sc->sc_dev,
439			    "unable to get number of queues\n");
440			goto disable;
441		}
442		if (sc->sc_nq > ncq)
443			sc->sc_nq = ncq;
444		if (sc->sc_nq > nsq)
445			sc->sc_nq = nsq;
446	}
447
448	sc->sc_q = kmem_zalloc(sizeof(*sc->sc_q) * sc->sc_nq, KM_SLEEP);
449	for (i = 0; i < sc->sc_nq; i++) {
450		sc->sc_q[i] = nvme_q_alloc(sc, i + 1, ioq_entries,
451		    sc->sc_dstrd);
452		if (sc->sc_q[i] == NULL) {
453			aprint_error_dev(sc->sc_dev,
454			    "unable to allocate io queue\n");
455			goto free_q;
456		}
457		if (nvme_q_create(sc, sc->sc_q[i]) != 0) {
458			aprint_error_dev(sc->sc_dev,
459			    "unable to create io queue\n");
460			nvme_q_free(sc, sc->sc_q[i]);
461			goto free_q;
462		}
463	}
464
465	if (!sc->sc_use_mq)
466		nvme_write4(sc, NVME_INTMC, 1);
467
468	/* probe subdevices */
469	sc->sc_namespaces = kmem_zalloc(sizeof(*sc->sc_namespaces) * sc->sc_nn,
470	    KM_SLEEP);
471	nvme_rescan(sc->sc_dev, NULL, NULL);
472
473	return 0;
474
475free_q:
476	while (--i >= 0) {
477		nvme_q_delete(sc, sc->sc_q[i]);
478		nvme_q_free(sc, sc->sc_q[i]);
479	}
480disable:
481	nvme_disable(sc);
482disestablish_admin_q:
483	sc->sc_intr_disestablish(sc, NVME_ADMIN_Q);
484free_admin_q:
485	nvme_q_free(sc, sc->sc_admin_q);
486
487	return 1;
488}
489
490int
491nvme_rescan(device_t self, const char *ifattr, const int *locs)
492{
493	struct nvme_softc *sc = device_private(self);
494	struct nvme_attach_args naa;
495	struct nvm_namespace_format *f;
496	struct nvme_namespace *ns;
497	uint64_t cap;
498	int ioq_entries = nvme_ioq_size;
499	int i, mlocs[NVMECF_NLOCS];
500	int error;
501
502	cap = nvme_read8(sc, NVME_CAP);
503	if (ioq_entries > NVME_CAP_MQES(cap))
504		ioq_entries = NVME_CAP_MQES(cap);
505
506	for (i = 1; i <= sc->sc_nn; i++) {
507		if (sc->sc_namespaces[i - 1].dev)
508			continue;
509
510		/* identify to check for availability */
511		error = nvme_ns_identify(sc, i);
512		if (error) {
513			aprint_error_dev(self, "couldn't identify namespace #%d\n", i);
514			continue;
515		}
516
517		ns = nvme_ns_get(sc, i);
518		KASSERT(ns);
519
520		f = &ns->ident->lbaf[NVME_ID_NS_FLBAS(ns->ident->flbas)];
521
522		/*
523		 * NVMe 1.0e 6.11 Identify command
524		 *
525		 * LBADS values smaller than 9 are not supported; a value
526		 * of zero means that the format is not in use.
527		 */
528		if (f->lbads < 9) {
529			if (f->lbads > 0)
530				aprint_error_dev(self,
531						 "unsupported logical data size %u\n", f->lbads);
532			continue;
533		}
534
535		mlocs[NVMECF_NSID] = i;
536
537		memset(&naa, 0, sizeof(naa));
538		naa.naa_nsid = i;
539		naa.naa_qentries = (ioq_entries - 1) * sc->sc_nq;
540		naa.naa_maxphys = sc->sc_mdts;
541		naa.naa_typename = sc->sc_modelname;
542		sc->sc_namespaces[i - 1].dev =
543		    config_found(sc->sc_dev, &naa, nvme_print,
544				 CFARGS(.submatch = config_stdsubmatch,
545					.locators = mlocs));
546	}
547	return 0;
548}
549
550static int
551nvme_print(void *aux, const char *pnp)
552{
553	struct nvme_attach_args *naa = aux;
554
555	if (pnp)
556		aprint_normal("ld at %s", pnp);
557
558	if (naa->naa_nsid > 0)
559		aprint_normal(" nsid %d", naa->naa_nsid);
560
561	return UNCONF;
562}
563
564int
565nvme_detach(struct nvme_softc *sc, int flags)
566{
567	int i, error;
568
569	error = config_detach_children(sc->sc_dev, flags);
570	if (error)
571		return error;
572
573	error = nvme_shutdown(sc);
574	if (error)
575		return error;
576
577	/* from now on we are committed to detach; the following will never fail */
578	for (i = 0; i < sc->sc_nq; i++)
579		nvme_q_free(sc, sc->sc_q[i]);
580	kmem_free(sc->sc_q, sizeof(*sc->sc_q) * sc->sc_nq);
581	nvme_q_free(sc, sc->sc_admin_q);
582
583	return 0;
584}
585
586int
587nvme_suspend(struct nvme_softc *sc)
588{
589
590	return nvme_shutdown(sc);
591}
592
593int
594nvme_resume(struct nvme_softc *sc)
595{
596	int i, error;
597
598	error = nvme_disable(sc);
599	if (error) {
600		device_printf(sc->sc_dev, "unable to disable controller\n");
601		return error;
602	}
603
604	nvme_q_reset(sc, sc->sc_admin_q);
605	if (sc->sc_intr_establish(sc, NVME_ADMIN_Q, sc->sc_admin_q)) {
606		error = EIO;
607		device_printf(sc->sc_dev, "unable to establish admin q\n");
608		goto disable;
609	}
610
611	error = nvme_enable(sc, ffs(sc->sc_mps) - 1);
612	if (error) {
613		device_printf(sc->sc_dev, "unable to enable controller\n");
614		return error;
615	}
616
617	for (i = 0; i < sc->sc_nq; i++) {
618		nvme_q_reset(sc, sc->sc_q[i]);
619		if (nvme_q_create(sc, sc->sc_q[i]) != 0) {
620			error = EIO;
621			device_printf(sc->sc_dev, "unable to create io q %d"
622			    "\n", i);
623			goto disable;
624		}
625	}
626
627	if (!sc->sc_use_mq)
628		nvme_write4(sc, NVME_INTMC, 1);
629
630	return 0;
631
632disable:
633	(void)nvme_disable(sc);
634
635	return error;
636}
637
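/*
 * Orderly shutdown: delete the I/O queues, request a normal shutdown via
 * CC.SHN and wait up to about four seconds for CSTS.SHST to signal
 * completion; otherwise fall back to simply disabling the controller.
 */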
638static int
639nvme_shutdown(struct nvme_softc *sc)
640{
641	uint32_t cc, csts;
642	bool disabled = false;
643	int i;
644
645	if (!sc->sc_use_mq)
646		nvme_write4(sc, NVME_INTMS, 1);
647
648	for (i = 0; i < sc->sc_nq; i++) {
649		if (nvme_q_delete(sc, sc->sc_q[i]) != 0) {
650			aprint_error_dev(sc->sc_dev,
651			    "unable to delete io queue %d, disabling\n", i + 1);
652			disabled = true;
653		}
654	}
655	if (disabled)
656		goto disable;
657
658	sc->sc_intr_disestablish(sc, NVME_ADMIN_Q);
659
660	cc = nvme_read4(sc, NVME_CC);
661	CLR(cc, NVME_CC_SHN_MASK);
662	SET(cc, NVME_CC_SHN(NVME_CC_SHN_NORMAL));
663	nvme_write4(sc, NVME_CC, cc);
664
665	for (i = 0; i < 4000; i++) {
666		nvme_barrier(sc, 0, sc->sc_ios,
667		    BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);
668		csts = nvme_read4(sc, NVME_CSTS);
669		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_DONE)
670			return 0;
671
672		delay(1000);
673	}
674
675	aprint_error_dev(sc->sc_dev, "unable to shutdown, disabling\n");
676
677disable:
678	nvme_disable(sc);
679	return 0;
680}
681
682void
683nvme_childdet(device_t self, device_t child)
684{
685	struct nvme_softc *sc = device_private(self);
686	int i;
687
688	for (i = 0; i < sc->sc_nn; i++) {
689		if (sc->sc_namespaces[i].dev == child) {
690			/* Already freed ns->ident. */
691			sc->sc_namespaces[i].dev = NULL;
692			break;
693		}
694	}
695}
696
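/*
 * Issue IDENTIFY for the given namespace on the admin queue and cache a
 * host-endian copy of the returned data in ns->ident.
 */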
697int
698nvme_ns_identify(struct nvme_softc *sc, uint16_t nsid)
699{
700	struct nvme_sqe sqe;
701	struct nvm_identify_namespace *identify;
702	struct nvme_dmamem *mem;
703	struct nvme_ccb *ccb;
704	struct nvme_namespace *ns;
705	int rv;
706
707	KASSERT(nsid > 0);
708
709	ns = nvme_ns_get(sc, nsid);
710	KASSERT(ns);
711
712	if (ns->ident != NULL)
713		return 0;
714
715	ccb = nvme_ccb_get(sc->sc_admin_q, false);
716	KASSERT(ccb != NULL); /* it's a bug if we don't have spare ccb here */
717
718	mem = nvme_dmamem_alloc(sc, sizeof(*identify));
719	if (mem == NULL) {
720		nvme_ccb_put(sc->sc_admin_q, ccb);
721		return ENOMEM;
722	}
723
724	memset(&sqe, 0, sizeof(sqe));
725	sqe.opcode = NVM_ADMIN_IDENTIFY;
726	htolem32(&sqe.nsid, nsid);
727	htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));
728	htolem32(&sqe.cdw10, 0);
729
730	ccb->ccb_done = nvme_empty_done;
731	ccb->ccb_cookie = &sqe;
732
733	nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
734	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_IDENT);
735	nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);
736
737	nvme_ccb_put(sc->sc_admin_q, ccb);
738
739	if (rv != 0) {
740		rv = EIO;
741		goto done;
742	}
743
744	/* commit */
745
746	identify = kmem_zalloc(sizeof(*identify), KM_SLEEP);
747	*identify = *((volatile struct nvm_identify_namespace *)NVME_DMA_KVA(mem));
748
749	/* Convert data to host endian */
750	nvme_identify_namespace_swapbytes(identify);
751
752	ns->ident = identify;
753
754done:
755	nvme_dmamem_free(sc, mem);
756
757	return rv;
758}
759
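/*
 * Block I/O entry point for the namespace children: load the buffer for
 * DMA, build a PRP list when more than two segments are needed, and
 * submit the read/write either polled or asynchronously.
 */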
760int
761nvme_ns_dobio(struct nvme_softc *sc, uint16_t nsid, void *cookie,
762    struct buf *bp, void *data, size_t datasize,
763    int secsize, daddr_t blkno, int flags, nvme_nnc_done nnc_done)
764{
765	struct nvme_queue *q;
766	struct nvme_ccb *ccb;
767	bus_dmamap_t dmap;
768	int i, error;
769
770	ccb = nvme_ccb_get_bio(sc, bp, &q);
771	if (ccb == NULL)
772		return EAGAIN;
773
774	ccb->ccb_done = nvme_ns_io_done;
775	ccb->ccb_cookie = cookie;
776
777	/* namespace context */
778	ccb->nnc_nsid = nsid;
779	ccb->nnc_flags = flags;
780	ccb->nnc_buf = bp;
781	ccb->nnc_datasize = datasize;
782	ccb->nnc_secsize = secsize;
783	ccb->nnc_blkno = blkno;
784	ccb->nnc_done = nnc_done;
785
786	dmap = ccb->ccb_dmamap;
787	error = bus_dmamap_load(sc->sc_dmat, dmap, data,
788	    datasize, NULL,
789	    (ISSET(flags, NVME_NS_CTX_F_POLL) ?
790	      BUS_DMA_NOWAIT : BUS_DMA_WAITOK) |
791	    (ISSET(flags, NVME_NS_CTX_F_READ) ?
792	      BUS_DMA_READ : BUS_DMA_WRITE));
793	if (error) {
794		nvme_ccb_put(q, ccb);
795		return error;
796	}
797
798	bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,
799	    ISSET(flags, NVME_NS_CTX_F_READ) ?
800	    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
801
802	if (dmap->dm_nsegs > 2) {
803		for (i = 1; i < dmap->dm_nsegs; i++) {
804			htolem64(&ccb->ccb_prpl[i - 1],
805			    dmap->dm_segs[i].ds_addr);
806		}
807		bus_dmamap_sync(sc->sc_dmat,
808		    NVME_DMA_MAP(q->q_ccb_prpls),
809		    ccb->ccb_prpl_off,
810		    sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
811		    BUS_DMASYNC_PREWRITE);
812	}
813
814	if (ISSET(flags, NVME_NS_CTX_F_POLL)) {
815		if (nvme_poll(sc, q, ccb, nvme_ns_io_fill, NVME_TIMO_PT) != 0)
816			return EIO;
817		return 0;
818	}
819
820	nvme_q_submit(sc, q, ccb, nvme_ns_io_fill);
821	return 0;
822}
823
824static void
825nvme_ns_io_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
826{
827	struct nvme_sqe_io *sqe = slot;
828	bus_dmamap_t dmap = ccb->ccb_dmamap;
829
830	sqe->opcode = ISSET(ccb->nnc_flags, NVME_NS_CTX_F_READ) ?
831	    NVM_CMD_READ : NVM_CMD_WRITE;
832	htolem32(&sqe->nsid, ccb->nnc_nsid);
833
834	htolem64(&sqe->entry.prp[0], dmap->dm_segs[0].ds_addr);
835	switch (dmap->dm_nsegs) {
836	case 1:
837		break;
838	case 2:
839		htolem64(&sqe->entry.prp[1], dmap->dm_segs[1].ds_addr);
840		break;
841	default:
842		/* the prp list is already set up and synced */
843		htolem64(&sqe->entry.prp[1], ccb->ccb_prpl_dva);
844		break;
845	}
846
847	htolem64(&sqe->slba, ccb->nnc_blkno);
848
849	if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
850		htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
851
852	/* guaranteed by upper layers, but check just in case */
853	KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
854	htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
855}
856
857static void
858nvme_ns_io_done(struct nvme_queue *q, struct nvme_ccb *ccb,
859    struct nvme_cqe *cqe)
860{
861	struct nvme_softc *sc = q->q_sc;
862	bus_dmamap_t dmap = ccb->ccb_dmamap;
863	void *nnc_cookie = ccb->ccb_cookie;
864	nvme_nnc_done nnc_done = ccb->nnc_done;
865	struct buf *bp = ccb->nnc_buf;
866
867	if (dmap->dm_nsegs > 2) {
868		bus_dmamap_sync(sc->sc_dmat,
869		    NVME_DMA_MAP(q->q_ccb_prpls),
870		    ccb->ccb_prpl_off,
871		    sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
872		    BUS_DMASYNC_POSTWRITE);
873	}
874
875	bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,
876	    ISSET(ccb->nnc_flags, NVME_NS_CTX_F_READ) ?
877	    BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
878
879	bus_dmamap_unload(sc->sc_dmat, dmap);
880	nvme_ccb_put(q, ccb);
881
882	nnc_done(nnc_cookie, bp, lemtoh16(&cqe->flags), lemtoh32(&cqe->cdw0));
883}
884
885/*
886 * If there is no volatile write cache, it makes no sense to issue
887 * flush commands or query for the status.
888 */
889static bool
890nvme_has_volatile_write_cache(struct nvme_softc *sc)
891{
892	/* sc_identify is filled during attachment */
893	return  ((sc->sc_identify.vwc & NVME_ID_CTRLR_VWC_PRESENT) != 0);
894}
895
896static bool
897nvme_ns_sync_finished(void *cookie)
898{
899	int *result = cookie;
900
901	return (*result != 0);
902}
903
904int
905nvme_ns_sync(struct nvme_softc *sc, uint16_t nsid, int flags)
906{
907	struct nvme_queue *q = nvme_get_q(sc);
908	struct nvme_ccb *ccb;
909	int result = 0;
910
911	if (!nvme_has_volatile_write_cache(sc)) {
912		/* cache not present, no value in trying to flush it */
913		return 0;
914	}
915
916	ccb = nvme_ccb_get(q, true);
917	KASSERT(ccb != NULL);
918
919	ccb->ccb_done = nvme_ns_sync_done;
920	ccb->ccb_cookie = &result;
921
922	/* namespace context */
923	ccb->nnc_nsid = nsid;
924	ccb->nnc_flags = flags;
925	ccb->nnc_done = NULL;
926
927	if (ISSET(flags, NVME_NS_CTX_F_POLL)) {
928		if (nvme_poll(sc, q, ccb, nvme_ns_sync_fill, NVME_TIMO_SY) != 0)
929			return EIO;
930		return 0;
931	}
932
933	nvme_q_submit(sc, q, ccb, nvme_ns_sync_fill);
934
935	/* wait for completion */
936	nvme_q_wait_complete(sc, q, nvme_ns_sync_finished, &result);
937	KASSERT(result != 0);
938
939	return (result > 0) ? 0 : EIO;
940}
941
942static void
943nvme_ns_sync_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
944{
945	struct nvme_sqe *sqe = slot;
946
947	sqe->opcode = NVM_CMD_FLUSH;
948	htolem32(&sqe->nsid, ccb->nnc_nsid);
949}
950
951static void
952nvme_ns_sync_done(struct nvme_queue *q, struct nvme_ccb *ccb,
953    struct nvme_cqe *cqe)
954{
955	int *result = ccb->ccb_cookie;
956	uint16_t status = NVME_CQE_SC(lemtoh16(&cqe->flags));
957
958	if (status == NVME_CQE_SC_SUCCESS)
959		*result = 1;
960	else
961		*result = -1;
962
963	nvme_ccb_put(q, ccb);
964}
965
966static bool
967nvme_getcache_finished(void *xc)
968{
969	int *addr = xc;
970
971	return (*addr != 0);
972}
973
974/*
975 * Get status of volatile write cache. Always asynchronous.
976 */
977int
978nvme_admin_getcache(struct nvme_softc *sc, int *addr)
979{
980	struct nvme_ccb *ccb;
981	struct nvme_queue *q = sc->sc_admin_q;
982	int result = 0, error;
983
984	if (!nvme_has_volatile_write_cache(sc)) {
985		/* cache simply not present */
986		*addr = 0;
987		return 0;
988	}
989
990	ccb = nvme_ccb_get(q, true);
991	KASSERT(ccb != NULL);
992
993	ccb->ccb_done = nvme_getcache_done;
994	ccb->ccb_cookie = &result;
995
996	/* namespace context */
997	ccb->nnc_flags = 0;
998	ccb->nnc_done = NULL;
999
1000	nvme_q_submit(sc, q, ccb, nvme_getcache_fill);
1001
1002	/* wait for completion */
1003	nvme_q_wait_complete(sc, q, nvme_getcache_finished, &result);
1004	KASSERT(result != 0);
1005
1006	if (result > 0) {
1007		*addr = result;
1008		error = 0;
1009	} else
1010		error = EINVAL;
1011
1012	return error;
1013}
1014
1015static void
1016nvme_getcache_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1017{
1018	struct nvme_sqe *sqe = slot;
1019
1020	sqe->opcode = NVM_ADMIN_GET_FEATURES;
1021	htolem32(&sqe->cdw10, NVM_FEATURE_VOLATILE_WRITE_CACHE);
1022	htolem32(&sqe->cdw11, NVM_VOLATILE_WRITE_CACHE_WCE);
1023}
1024
1025static void
1026nvme_getcache_done(struct nvme_queue *q, struct nvme_ccb *ccb,
1027    struct nvme_cqe *cqe)
1028{
1029	int *addr = ccb->ccb_cookie;
1030	uint16_t status = NVME_CQE_SC(lemtoh16(&cqe->flags));
1031	uint32_t cdw0 = lemtoh32(&cqe->cdw0);
1032	int result;
1033
1034	if (status == NVME_CQE_SC_SUCCESS) {
1035		result = 0;
1036
1037		/*
1038		 * DPO not supported, Dataset Management (DSM) field doesn't
1039		 * specify the same semantics. FUA is always supported.
1040		 */
1041		result = DKCACHE_FUA;
1042
1043		if (cdw0 & NVM_VOLATILE_WRITE_CACHE_WCE)
1044			result |= DKCACHE_WRITE;
1045
1046		/*
1047		 * If volatile write cache is present, the flag shall also be
1048		 * settable.
1049		 */
1050		result |= DKCACHE_WCHANGE;
1051
1052		/*
1053		 * The ONCS field indicates whether the optional SAVE is also
1054		 * supported for Set Features.  However, according to spec v1.3,
1055		 * the Volatile Write Cache setting does not persist across
1056		 * power cycles/resets.
1057		 */
1058
1059	} else {
1060		result = -1;
1061	}
1062
1063	*addr = result;
1064
1065	nvme_ccb_put(q, ccb);
1066}
1067
1068struct nvme_setcache_state {
1069	int dkcache;
1070	int result;
1071};
1072
1073static bool
1074nvme_setcache_finished(void *xc)
1075{
1076	struct nvme_setcache_state *st = xc;
1077
1078	return (st->result != 0);
1079}
1080
1081static void
1082nvme_setcache_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1083{
1084	struct nvme_sqe *sqe = slot;
1085	struct nvme_setcache_state *st = ccb->ccb_cookie;
1086
1087	sqe->opcode = NVM_ADMIN_SET_FEATURES;
1088	htolem32(&sqe->cdw10, NVM_FEATURE_VOLATILE_WRITE_CACHE);
1089	if (st->dkcache & DKCACHE_WRITE)
1090		htolem32(&sqe->cdw11, NVM_VOLATILE_WRITE_CACHE_WCE);
1091}
1092
1093static void
1094nvme_setcache_done(struct nvme_queue *q, struct nvme_ccb *ccb,
1095    struct nvme_cqe *cqe)
1096{
1097	struct nvme_setcache_state *st = ccb->ccb_cookie;
1098	uint16_t status = NVME_CQE_SC(lemtoh16(&cqe->flags));
1099
1100	if (status == NVME_CQE_SC_SUCCESS) {
1101		st->result = 1;
1102	} else {
1103		st->result = -1;
1104	}
1105
1106	nvme_ccb_put(q, ccb);
1107}
1108
1109/*
1110 * Set status of volatile write cache. Always asynchronous.
1111 */
1112int
1113nvme_admin_setcache(struct nvme_softc *sc, int dkcache)
1114{
1115	struct nvme_ccb *ccb;
1116	struct nvme_queue *q = sc->sc_admin_q;
1117	int error;
1118	struct nvme_setcache_state st;
1119
1120	if (!nvme_has_volatile_write_cache(sc)) {
1121		/* cache simply not present */
1122		return EOPNOTSUPP;
1123	}
1124
1125	if (dkcache & ~(DKCACHE_WRITE)) {
1126		/* unsupported parameters */
1127		return EOPNOTSUPP;
1128	}
1129
1130	ccb = nvme_ccb_get(q, true);
1131	KASSERT(ccb != NULL);
1132
1133	memset(&st, 0, sizeof(st));
1134	st.dkcache = dkcache;
1135
1136	ccb->ccb_done = nvme_setcache_done;
1137	ccb->ccb_cookie = &st;
1138
1139	/* namespace context */
1140	ccb->nnc_flags = 0;
1141	ccb->nnc_done = NULL;
1142
1143	nvme_q_submit(sc, q, ccb, nvme_setcache_fill);
1144
1145	/* wait for completion */
1146	nvme_q_wait_complete(sc, q, nvme_setcache_finished, &st);
1147	KASSERT(st.result != 0);
1148
1149	if (st.result > 0)
1150		error = 0;
1151	else
1152		error = EINVAL;
1153
1154	return error;
1155}
1156
1157void
1158nvme_ns_free(struct nvme_softc *sc, uint16_t nsid)
1159{
1160	struct nvme_namespace *ns;
1161	struct nvm_identify_namespace *identify;
1162
1163	ns = nvme_ns_get(sc, nsid);
1164	KASSERT(ns);
1165
1166	identify = ns->ident;
1167	ns->ident = NULL;
1168	if (identify != NULL)
1169		kmem_free(identify, sizeof(*identify));
1170}
1171
1172struct nvme_pt_state {
1173	struct nvme_pt_command *pt;
1174	bool finished;
1175};
1176
1177static void
1178nvme_pt_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1179{
1180	struct nvme_softc *sc = q->q_sc;
1181	struct nvme_sqe *sqe = slot;
1182	struct nvme_pt_state *state = ccb->ccb_cookie;
1183	struct nvme_pt_command *pt = state->pt;
1184	bus_dmamap_t dmap = ccb->ccb_dmamap;
1185	int i;
1186
1187	sqe->opcode = pt->cmd.opcode;
1188	htolem32(&sqe->nsid, pt->cmd.nsid);
1189
1190	if (pt->buf != NULL && pt->len > 0) {
1191		htolem64(&sqe->entry.prp[0], dmap->dm_segs[0].ds_addr);
1192		switch (dmap->dm_nsegs) {
1193		case 1:
1194			break;
1195		case 2:
1196			htolem64(&sqe->entry.prp[1], dmap->dm_segs[1].ds_addr);
1197			break;
1198		default:
1199			for (i = 1; i < dmap->dm_nsegs; i++) {
1200				htolem64(&ccb->ccb_prpl[i - 1],
1201				    dmap->dm_segs[i].ds_addr);
1202			}
1203			bus_dmamap_sync(sc->sc_dmat,
1204			    NVME_DMA_MAP(q->q_ccb_prpls),
1205			    ccb->ccb_prpl_off,
1206			    sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
1207			    BUS_DMASYNC_PREWRITE);
1208			htolem64(&sqe->entry.prp[1], ccb->ccb_prpl_dva);
1209			break;
1210		}
1211	}
1212
1213	htolem32(&sqe->cdw10, pt->cmd.cdw10);
1214	htolem32(&sqe->cdw11, pt->cmd.cdw11);
1215	htolem32(&sqe->cdw12, pt->cmd.cdw12);
1216	htolem32(&sqe->cdw13, pt->cmd.cdw13);
1217	htolem32(&sqe->cdw14, pt->cmd.cdw14);
1218	htolem32(&sqe->cdw15, pt->cmd.cdw15);
1219}
1220
1221static void
1222nvme_pt_done(struct nvme_queue *q, struct nvme_ccb *ccb, struct nvme_cqe *cqe)
1223{
1224	struct nvme_softc *sc = q->q_sc;
1225	struct nvme_pt_state *state = ccb->ccb_cookie;
1226	struct nvme_pt_command *pt = state->pt;
1227	bus_dmamap_t dmap = ccb->ccb_dmamap;
1228
1229	if (pt->buf != NULL && pt->len > 0) {
1230		if (dmap->dm_nsegs > 2) {
1231			bus_dmamap_sync(sc->sc_dmat,
1232			    NVME_DMA_MAP(q->q_ccb_prpls),
1233			    ccb->ccb_prpl_off,
1234			    sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
1235			    BUS_DMASYNC_POSTWRITE);
1236		}
1237
1238		bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,
1239		    pt->is_read ? BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
1240		bus_dmamap_unload(sc->sc_dmat, dmap);
1241	}
1242
1243	pt->cpl.cdw0 = lemtoh32(&cqe->cdw0);
1244	pt->cpl.flags = lemtoh16(&cqe->flags) & ~NVME_CQE_PHASE;
1245
1246	state->finished = true;
1247
1248	nvme_ccb_put(q, ccb);
1249}
1250
1251static bool
1252nvme_pt_finished(void *cookie)
1253{
1254	struct nvme_pt_state *state = cookie;
1255
1256	return state->finished;
1257}
1258
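/*
 * Backend for the NVME_PASSTHROUGH_CMD ioctl: validate the transfer size,
 * copy in the user buffer if any, map it for DMA, submit the command on
 * the admin or an I/O queue and wait for completion before copying the
 * result back out.
 */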
1259static int
1260nvme_command_passthrough(struct nvme_softc *sc, struct nvme_pt_command *pt,
1261    uint32_t nsid, struct lwp *l, bool is_adminq)
1262{
1263	struct nvme_queue *q;
1264	struct nvme_ccb *ccb;
1265	void *buf = NULL;
1266	struct nvme_pt_state state;
1267	int error;
1268
1269	/* limit command size to maximum data transfer size */
1270	if ((pt->buf == NULL && pt->len > 0) ||
1271	    (pt->buf != NULL && (pt->len == 0 || pt->len > sc->sc_mdts)))
1272		return EINVAL;
1273
1274	q = is_adminq ? sc->sc_admin_q : nvme_get_q(sc);
1275	ccb = nvme_ccb_get(q, true);
1276	KASSERT(ccb != NULL);
1277
1278	if (pt->buf != NULL) {
1279		KASSERT(pt->len > 0);
1280		buf = kmem_alloc(pt->len, KM_SLEEP);
1281		if (!pt->is_read) {
1282			error = copyin(pt->buf, buf, pt->len);
1283			if (error)
1284				goto kmem_free;
1285		}
1286		error = bus_dmamap_load(sc->sc_dmat, ccb->ccb_dmamap, buf,
1287		    pt->len, NULL,
1288		    BUS_DMA_WAITOK |
1289		      (pt->is_read ? BUS_DMA_READ : BUS_DMA_WRITE));
1290		if (error)
1291			goto kmem_free;
1292		bus_dmamap_sync(sc->sc_dmat, ccb->ccb_dmamap,
1293		    0, ccb->ccb_dmamap->dm_mapsize,
1294		    pt->is_read ? BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
1295	}
1296
1297	memset(&state, 0, sizeof(state));
1298	state.pt = pt;
1299	state.finished = false;
1300
1301	ccb->ccb_done = nvme_pt_done;
1302	ccb->ccb_cookie = &state;
1303
1304	pt->cmd.nsid = nsid;
1305
1306	nvme_q_submit(sc, q, ccb, nvme_pt_fill);
1307
1308	/* wait for completion */
1309	nvme_q_wait_complete(sc, q, nvme_pt_finished, &state);
1310	KASSERT(state.finished);
1311
1312	error = 0;
1313
1314	if (buf != NULL) {
1315		if (error == 0 && pt->is_read)
1316			error = copyout(buf, pt->buf, pt->len);
1317kmem_free:
1318		kmem_free(buf, pt->len);
1319	}
1320
1321	return error;
1322}
1323
1324uint32_t
1325nvme_op_sq_enter(struct nvme_softc *sc,
1326    struct nvme_queue *q, struct nvme_ccb *ccb)
1327{
1328	mutex_enter(&q->q_sq_mtx);
1329
1330	return nvme_op_sq_enter_locked(sc, q, ccb);
1331}
1332
1333uint32_t
1334nvme_op_sq_enter_locked(struct nvme_softc *sc,
1335    struct nvme_queue *q, struct nvme_ccb *ccb)
1336{
1337	return q->q_sq_tail;
1338}
1339
1340void
1341nvme_op_sq_leave_locked(struct nvme_softc *sc,
1342    struct nvme_queue *q, struct nvme_ccb *ccb)
1343{
1344	uint32_t tail;
1345
1346	tail = ++q->q_sq_tail;
1347	if (tail >= q->q_entries)
1348		tail = 0;
1349	q->q_sq_tail = tail;
1350	nvme_write4(sc, q->q_sqtdbl, tail);
1351}
1352
1353void
1354nvme_op_sq_leave(struct nvme_softc *sc,
1355    struct nvme_queue *q, struct nvme_ccb *ccb)
1356{
1357	nvme_op_sq_leave_locked(sc, q, ccb);
1358
1359	mutex_exit(&q->q_sq_mtx);
1360}
1361
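/*
 * Claim the next submission queue slot via the op_sq_enter/op_sq_leave
 * hooks, let the fill callback construct the SQE, tag it with the ccb id
 * and ring the submission queue tail doorbell.
 */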
1362static void
1363nvme_q_submit(struct nvme_softc *sc, struct nvme_queue *q, struct nvme_ccb *ccb,
1364    void (*fill)(struct nvme_queue *, struct nvme_ccb *, void *))
1365{
1366	struct nvme_sqe *sqe = NVME_DMA_KVA(q->q_sq_dmamem);
1367	uint32_t tail;
1368
1369	tail = sc->sc_ops->op_sq_enter(sc, q, ccb);
1370
1371	sqe += tail;
1372
1373	bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
1374	    sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_POSTWRITE);
1375	memset(sqe, 0, sizeof(*sqe));
1376	(*fill)(q, ccb, sqe);
1377	htolem16(&sqe->cid, ccb->ccb_id);
1378	bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
1379	    sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_PREWRITE);
1380
1381	sc->sc_ops->op_sq_leave(sc, q, ccb);
1382}
1383
1384struct nvme_poll_state {
1385	struct nvme_sqe s;
1386	struct nvme_cqe c;
1387	void *cookie;
1388	void (*done)(struct nvme_queue *, struct nvme_ccb *, struct nvme_cqe *);
1389};
1390
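/*
 * Submit a command and busy-wait for its completion, reaping the queue
 * ourselves.  A negative timo_sec means wait forever.  Returns 0 on
 * success, the CQE status on command failure, or 1 after a timeout (in
 * which case a zeroed completion is faked to release the ccb).
 */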
1391static int
1392nvme_poll(struct nvme_softc *sc, struct nvme_queue *q, struct nvme_ccb *ccb,
1393    void (*fill)(struct nvme_queue *, struct nvme_ccb *, void *), int timo_sec)
1394{
1395	struct nvme_poll_state state;
1396	uint16_t flags;
1397	int step = 10;
1398	int maxloop = timo_sec * 1000000 / step;
1399	int error = 0;
1400
1401	memset(&state, 0, sizeof(state));
1402	(*fill)(q, ccb, &state.s);
1403
1404	state.done = ccb->ccb_done;
1405	state.cookie = ccb->ccb_cookie;
1406
1407	ccb->ccb_done = nvme_poll_done;
1408	ccb->ccb_cookie = &state;
1409
1410	nvme_q_submit(sc, q, ccb, nvme_poll_fill);
1411	while (!ISSET(state.c.flags, htole16(NVME_CQE_PHASE))) {
1412		if (nvme_q_complete(sc, q) == 0)
1413			delay(step);
1414
1415		if (timo_sec >= 0 && --maxloop <= 0) {
1416			error = ETIMEDOUT;
1417			break;
1418		}
1419	}
1420
1421	if (error == 0) {
1422		flags = lemtoh16(&state.c.flags);
1423		return flags & ~NVME_CQE_PHASE;
1424	} else {
1425		/*
1426		 * If it succeeds later, it would hit a ccb which will have
1427		 * been reused for something else by then. Not good. Cross
1428		 * fingers and hope for the best. XXX do controller reset?
1429		 */
1430		aprint_error_dev(sc->sc_dev, "polled command timed out\n");
1431
1432		/* Invoke the callback to clean state anyway */
1433		struct nvme_cqe cqe;
1434		memset(&cqe, 0, sizeof(cqe));
1435		ccb->ccb_done(q, ccb, &cqe);
1436
1437		return 1;
1438	}
1439}
1440
1441static void
1442nvme_poll_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1443{
1444	struct nvme_sqe *sqe = slot;
1445	struct nvme_poll_state *state = ccb->ccb_cookie;
1446
1447	*sqe = state->s;
1448}
1449
1450static void
1451nvme_poll_done(struct nvme_queue *q, struct nvme_ccb *ccb,
1452    struct nvme_cqe *cqe)
1453{
1454	struct nvme_poll_state *state = ccb->ccb_cookie;
1455
1456	state->c = *cqe;
1457	SET(state->c.flags, htole16(NVME_CQE_PHASE));
1458
1459	ccb->ccb_cookie = state->cookie;
1460	state->done(q, ccb, &state->c);
1461}
1462
1463static void
1464nvme_sqe_fill(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1465{
1466	struct nvme_sqe *src = ccb->ccb_cookie;
1467	struct nvme_sqe *dst = slot;
1468
1469	*dst = *src;
1470}
1471
1472static void
1473nvme_empty_done(struct nvme_queue *q, struct nvme_ccb *ccb,
1474    struct nvme_cqe *cqe)
1475{
1476}
1477
1478void
1479nvme_op_cq_done(struct nvme_softc *sc,
1480    struct nvme_queue *q, struct nvme_ccb *ccb)
1481{
1482	/* nop */
1483}
1484
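/*
 * Reap the completion queue: consume CQEs whose phase bit matches the
 * queue's current phase, call each ccb's done callback (dropping the CQ
 * mutex around the call), and finally update the CQ head doorbell.
 * Returns the number of entries processed.
 */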
1485static int
1486nvme_q_complete(struct nvme_softc *sc, struct nvme_queue *q)
1487{
1488	struct nvme_ccb *ccb;
1489	struct nvme_cqe *ring = NVME_DMA_KVA(q->q_cq_dmamem), *cqe;
1490	uint16_t flags;
1491	int rv = 0;
1492
1493	mutex_enter(&q->q_cq_mtx);
1494
1495	nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
1496	for (;;) {
1497		cqe = &ring[q->q_cq_head];
1498		flags = lemtoh16(&cqe->flags);
1499		if ((flags & NVME_CQE_PHASE) != q->q_cq_phase)
1500			break;
1501
1502		/*
1503		 * Make sure we have read the flags _before_ we read
1504		 * the cid.  Otherwise the CPU might speculatively read
1505		 * the cid before the entry has been assigned to our
1506		 * phase.
1507		 */
1508		nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
1509
1510		ccb = &q->q_ccbs[lemtoh16(&cqe->cid)];
1511
1512		if (++q->q_cq_head >= q->q_entries) {
1513			q->q_cq_head = 0;
1514			q->q_cq_phase ^= NVME_CQE_PHASE;
1515		}
1516
1517#ifdef DEBUG
1518		/*
1519		 * If we get a spurious completion notification, something
1520		 * is seriously hosed up. Very likely a DMA to some random
1521		 * memory location happened, so just bail out.
1522		 */
1523		if ((intptr_t)ccb->ccb_cookie == NVME_CCB_FREE) {
1524			panic("%s: invalid ccb detected",
1525			    device_xname(sc->sc_dev));
1526			/* NOTREACHED */
1527		}
1528#endif
1529
1530		rv++;
1531
1532		sc->sc_ops->op_cq_done(sc, q, ccb);
1533
1534		/*
1535		 * Unlock the mutex before calling the ccb_done callback
1536		 * and re-lock afterwards. The callback triggers lddone()
1537		 * which schedules another i/o, and also calls nvme_ccb_put().
1538		 * Unlock/relock avoids possibility of deadlock.
1539		 */
1540		mutex_exit(&q->q_cq_mtx);
1541		ccb->ccb_done(q, ccb, cqe);
1542		mutex_enter(&q->q_cq_mtx);
1543	}
1544	nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
1545
1546	if (rv)
1547		nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
1548
1549	mutex_exit(&q->q_cq_mtx);
1550
1551	return rv;
1552}
1553
1554static void
1555nvme_q_wait_complete(struct nvme_softc *sc,
1556    struct nvme_queue *q, bool (*finished)(void *), void *cookie)
1557{
1558	mutex_enter(&q->q_ccb_mtx);
1559	if (finished(cookie))
1560		goto out;
1561
1562	for(;;) {
1563		q->q_ccb_waiting = true;
1564		cv_wait(&q->q_ccb_wait, &q->q_ccb_mtx);
1565
1566		if (finished(cookie))
1567			break;
1568	}
1569
1570out:
1571	mutex_exit(&q->q_ccb_mtx);
1572}
1573
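/*
 * IDENTIFY CONTROLLER, issued once at attach: store a host-endian copy in
 * sc_identify, report model/firmware/serial, and derive the maximum data
 * transfer size and the namespace count.
 */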
1574static int
1575nvme_identify(struct nvme_softc *sc, u_int mps)
1576{
1577	char sn[41], mn[81], fr[17];
1578	struct nvm_identify_controller *identify;
1579	struct nvme_dmamem *mem;
1580	struct nvme_ccb *ccb;
1581	u_int mdts;
1582	int rv = 1;
1583
1584	ccb = nvme_ccb_get(sc->sc_admin_q, false);
1585	KASSERT(ccb != NULL); /* it's a bug if we don't have spare ccb here */
1586
1587	mem = nvme_dmamem_alloc(sc, sizeof(*identify));
1588	if (mem == NULL)
1589		return 1;
1590
1591	ccb->ccb_done = nvme_empty_done;
1592	ccb->ccb_cookie = mem;
1593
1594	nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
1595	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_fill_identify,
1596	    NVME_TIMO_IDENT);
1597	nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);
1598
1599	nvme_ccb_put(sc->sc_admin_q, ccb);
1600
1601	if (rv != 0)
1602		goto done;
1603
1604	identify = NVME_DMA_KVA(mem);
1605	sc->sc_identify = *identify;
1606	identify = NULL;
1607
1608	/* Convert data to host endian */
1609	nvme_identify_controller_swapbytes(&sc->sc_identify);
1610
1611	strnvisx(sn, sizeof(sn), (const char *)sc->sc_identify.sn,
1612	    sizeof(sc->sc_identify.sn), VIS_TRIM|VIS_SAFE|VIS_OCTAL);
1613	strnvisx(mn, sizeof(mn), (const char *)sc->sc_identify.mn,
1614	    sizeof(sc->sc_identify.mn), VIS_TRIM|VIS_SAFE|VIS_OCTAL);
1615	strnvisx(fr, sizeof(fr), (const char *)sc->sc_identify.fr,
1616	    sizeof(sc->sc_identify.fr), VIS_TRIM|VIS_SAFE|VIS_OCTAL);
1617	aprint_normal_dev(sc->sc_dev, "%s, firmware %s, serial %s\n", mn, fr,
1618	    sn);
1619
1620	strlcpy(sc->sc_modelname, mn, sizeof(sc->sc_modelname));
1621
1622	if (sc->sc_identify.mdts > 0) {
1623		mdts = (1 << sc->sc_identify.mdts) * (1 << mps);
1624		if (mdts < sc->sc_mdts)
1625			sc->sc_mdts = mdts;
1626	}
1627
1628	sc->sc_nn = sc->sc_identify.nn;
1629
1630done:
1631	nvme_dmamem_free(sc, mem);
1632
1633	return rv;
1634}
1635
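/*
 * Register an I/O queue pair with the controller: create the completion
 * queue first, then the submission queue bound to it.  With multiple
 * vectors the per-queue interrupt is established here as well.
 */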
1636static int
1637nvme_q_create(struct nvme_softc *sc, struct nvme_queue *q)
1638{
1639	struct nvme_sqe_q sqe;
1640	struct nvme_ccb *ccb;
1641	int rv;
1642
1643	if (sc->sc_use_mq && sc->sc_intr_establish(sc, q->q_id, q) != 0)
1644		return 1;
1645
1646	ccb = nvme_ccb_get(sc->sc_admin_q, false);
1647	KASSERT(ccb != NULL);
1648
1649	ccb->ccb_done = nvme_empty_done;
1650	ccb->ccb_cookie = &sqe;
1651
1652	memset(&sqe, 0, sizeof(sqe));
1653	sqe.opcode = NVM_ADMIN_ADD_IOCQ;
1654	htolem64(&sqe.prp1, NVME_DMA_DVA(q->q_cq_dmamem));
1655	htolem16(&sqe.qsize, q->q_entries - 1);
1656	htolem16(&sqe.qid, q->q_id);
1657	sqe.qflags = NVM_SQE_CQ_IEN | NVM_SQE_Q_PC;
1658	if (sc->sc_use_mq)
1659		htolem16(&sqe.cqid, q->q_id);	/* qid == vector */
1660
1661	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1662	if (rv != 0)
1663		goto fail;
1664
1665	ccb->ccb_done = nvme_empty_done;
1666	ccb->ccb_cookie = &sqe;
1667
1668	memset(&sqe, 0, sizeof(sqe));
1669	sqe.opcode = NVM_ADMIN_ADD_IOSQ;
1670	htolem64(&sqe.prp1, NVME_DMA_DVA(q->q_sq_dmamem));
1671	htolem16(&sqe.qsize, q->q_entries - 1);
1672	htolem16(&sqe.qid, q->q_id);
1673	htolem16(&sqe.cqid, q->q_id);
1674	sqe.qflags = NVM_SQE_Q_PC;
1675
1676	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1677	if (rv != 0)
1678		goto fail;
1679
1680	nvme_ccb_put(sc->sc_admin_q, ccb);
1681	return 0;
1682
1683fail:
1684	if (sc->sc_use_mq)
1685		sc->sc_intr_disestablish(sc, q->q_id);
1686
1687	nvme_ccb_put(sc->sc_admin_q, ccb);
1688	return rv;
1689}
1690
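/*
 * Remove an I/O queue pair from the controller, submission queue first,
 * then its completion queue, and disestablish the per-queue interrupt in
 * multi-queue mode.
 */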
1691static int
1692nvme_q_delete(struct nvme_softc *sc, struct nvme_queue *q)
1693{
1694	struct nvme_sqe_q sqe;
1695	struct nvme_ccb *ccb;
1696	int rv;
1697
1698	ccb = nvme_ccb_get(sc->sc_admin_q, false);
1699	KASSERT(ccb != NULL);
1700
1701	ccb->ccb_done = nvme_empty_done;
1702	ccb->ccb_cookie = &sqe;
1703
1704	memset(&sqe, 0, sizeof(sqe));
1705	sqe.opcode = NVM_ADMIN_DEL_IOSQ;
1706	htolem16(&sqe.qid, q->q_id);
1707
1708	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1709	if (rv != 0)
1710		goto fail;
1711
1712	ccb->ccb_done = nvme_empty_done;
1713	ccb->ccb_cookie = &sqe;
1714
1715	memset(&sqe, 0, sizeof(sqe));
1716	sqe.opcode = NVM_ADMIN_DEL_IOCQ;
1717	htolem16(&sqe.qid, q->q_id);
1718
1719	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1720	if (rv != 0)
1721		goto fail;
1722
1723fail:
1724	nvme_ccb_put(sc->sc_admin_q, ccb);
1725
1726	if (rv == 0 && sc->sc_use_mq) {
1727		if (sc->sc_intr_disestablish(sc, q->q_id))
1728			rv = 1;
1729	}
1730
1731	return rv;
1732}
1733
1734static void
1735nvme_fill_identify(struct nvme_queue *q, struct nvme_ccb *ccb, void *slot)
1736{
1737	struct nvme_sqe *sqe = slot;
1738	struct nvme_dmamem *mem = ccb->ccb_cookie;
1739
1740	sqe->opcode = NVM_ADMIN_IDENTIFY;
1741	htolem64(&sqe->entry.prp[0], NVME_DMA_DVA(mem));
1742	htolem32(&sqe->cdw10, 1);
1743}
1744
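/*
 * SET FEATURES / Number of Queues: request nq I/O queue pairs and report
 * back how many completion and submission queues the controller actually
 * granted.
 */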
1745static int
1746nvme_set_number_of_queues(struct nvme_softc *sc, u_int nq, u_int *ncqa,
1747    u_int *nsqa)
1748{
1749	struct nvme_pt_state state;
1750	struct nvme_pt_command pt;
1751	struct nvme_ccb *ccb;
1752	int rv;
1753
1754	ccb = nvme_ccb_get(sc->sc_admin_q, false);
1755	KASSERT(ccb != NULL); /* it's a bug if we don't have spare ccb here */
1756
1757	memset(&pt, 0, sizeof(pt));
1758	pt.cmd.opcode = NVM_ADMIN_SET_FEATURES;
1759	pt.cmd.cdw10 = NVM_FEATURE_NUMBER_OF_QUEUES;
1760	pt.cmd.cdw11 = ((nq - 1) << 16) | (nq - 1);
1761
1762	memset(&state, 0, sizeof(state));
1763	state.pt = &pt;
1764	state.finished = false;
1765
1766	ccb->ccb_done = nvme_pt_done;
1767	ccb->ccb_cookie = &state;
1768
1769	rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_pt_fill, NVME_TIMO_QOP);
1770
1771	if (rv != 0) {
1772		*ncqa = *nsqa = 0;
1773		return EIO;
1774	}
1775
1776	*ncqa = (pt.cpl.cdw0 >> 16) + 1;
1777	*nsqa = (pt.cpl.cdw0 & 0xffff) + 1;
1778
1779	return 0;
1780}
1781
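/*
 * Allocate the per-queue command control blocks, one DMA map each, and a
 * shared chunk of DMA memory holding their PRP lists.
 */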
1782static int
1783nvme_ccbs_alloc(struct nvme_queue *q, uint16_t nccbs)
1784{
1785	struct nvme_softc *sc = q->q_sc;
1786	struct nvme_ccb *ccb;
1787	bus_addr_t off;
1788	uint64_t *prpl;
1789	u_int i;
1790
1791	mutex_init(&q->q_ccb_mtx, MUTEX_DEFAULT, IPL_BIO);
1792	cv_init(&q->q_ccb_wait, "nvmeqw");
1793	q->q_ccb_waiting = false;
1794	SIMPLEQ_INIT(&q->q_ccb_list);
1795
1796	q->q_ccbs = kmem_alloc(sizeof(*ccb) * nccbs, KM_SLEEP);
1797
1798	q->q_nccbs = nccbs;
1799	q->q_ccb_prpls = nvme_dmamem_alloc(sc,
1800	    sizeof(*prpl) * sc->sc_max_sgl * nccbs);
1801
1802	prpl = NVME_DMA_KVA(q->q_ccb_prpls);
1803	off = 0;
1804
1805	for (i = 0; i < nccbs; i++) {
1806		ccb = &q->q_ccbs[i];
1807
1808		if (bus_dmamap_create(sc->sc_dmat, sc->sc_mdts,
1809		    sc->sc_max_sgl + 1 /* we get a free prp in the sqe */,
1810		    sc->sc_mps, sc->sc_mps, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
1811		    &ccb->ccb_dmamap) != 0)
1812			goto free_maps;
1813
1814		ccb->ccb_id = i;
1815		ccb->ccb_prpl = prpl;
1816		ccb->ccb_prpl_off = off;
1817		ccb->ccb_prpl_dva = NVME_DMA_DVA(q->q_ccb_prpls) + off;
1818
1819		SIMPLEQ_INSERT_TAIL(&q->q_ccb_list, ccb, ccb_entry);
1820
1821		prpl += sc->sc_max_sgl;
1822		off += sizeof(*prpl) * sc->sc_max_sgl;
1823	}
1824
1825	return 0;
1826
1827free_maps:
1828	nvme_ccbs_free(q);
1829	return 1;
1830}
1831
1832static struct nvme_ccb *
1833nvme_ccb_get(struct nvme_queue *q, bool wait)
1834{
1835	struct nvme_ccb *ccb = NULL;
1836
1837	mutex_enter(&q->q_ccb_mtx);
1838again:
1839	ccb = SIMPLEQ_FIRST(&q->q_ccb_list);
1840	if (ccb != NULL) {
1841		SIMPLEQ_REMOVE_HEAD(&q->q_ccb_list, ccb_entry);
1842#ifdef DEBUG
1843		ccb->ccb_cookie = NULL;
1844#endif
1845	} else {
1846		if (__predict_false(wait)) {
1847			q->q_ccb_waiting = true;
1848			cv_wait(&q->q_ccb_wait, &q->q_ccb_mtx);
1849			goto again;
1850		}
1851	}
1852	mutex_exit(&q->q_ccb_mtx);
1853
1854	return ccb;
1855}
1856
1857static struct nvme_ccb *
1858nvme_ccb_get_bio(struct nvme_softc *sc, struct buf *bp,
1859    struct nvme_queue **selq)
1860{
1861	u_int cpuindex = cpu_index((bp && bp->b_ci) ? bp->b_ci : curcpu());
1862
1863	/*
1864	 * Find a queue with available ccbs, preferring the originating
1865	 * CPU's queue.
1866	 */
1867
1868	for (u_int qoff = 0; qoff < sc->sc_nq; qoff++) {
1869		struct nvme_queue *q = sc->sc_q[(cpuindex + qoff) % sc->sc_nq];
1870		struct nvme_ccb *ccb;
1871
1872		mutex_enter(&q->q_ccb_mtx);
1873		ccb = SIMPLEQ_FIRST(&q->q_ccb_list);
1874		if (ccb != NULL) {
1875			SIMPLEQ_REMOVE_HEAD(&q->q_ccb_list, ccb_entry);
1876#ifdef DEBUG
1877			ccb->ccb_cookie = NULL;
1878#endif
1879		}
1880		mutex_exit(&q->q_ccb_mtx);
1881
1882		if (ccb != NULL) {
1883			*selq = q;
1884			return ccb;
1885		}
1886	}
1887
1888	return NULL;
1889}
1890
1891static void
1892nvme_ccb_put(struct nvme_queue *q, struct nvme_ccb *ccb)
1893{
1894
1895	mutex_enter(&q->q_ccb_mtx);
1896#ifdef DEBUG
1897	ccb->ccb_cookie = (void *)NVME_CCB_FREE;
1898#endif
1899	SIMPLEQ_INSERT_HEAD(&q->q_ccb_list, ccb, ccb_entry);
1900
1901	/* It's unlikely there are any waiters; this path is not used for regular I/O */
1902	if (__predict_false(q->q_ccb_waiting)) {
1903		q->q_ccb_waiting = false;
1904		cv_broadcast(&q->q_ccb_wait);
1905	}
1906
1907	mutex_exit(&q->q_ccb_mtx);
1908}
1909
1910static void
1911nvme_ccbs_free(struct nvme_queue *q)
1912{
1913	struct nvme_softc *sc = q->q_sc;
1914	struct nvme_ccb *ccb;
1915
1916	mutex_enter(&q->q_ccb_mtx);
1917	while ((ccb = SIMPLEQ_FIRST(&q->q_ccb_list)) != NULL) {
1918		SIMPLEQ_REMOVE_HEAD(&q->q_ccb_list, ccb_entry);
1919		/*
1920		 * bus_dmamap_destroy() may call vm_map_lock() and rw_enter()
1921		 * internally, so don't hold the spin mutex across it.
1922		 */
1923		mutex_exit(&q->q_ccb_mtx);
1924		bus_dmamap_destroy(sc->sc_dmat, ccb->ccb_dmamap);
1925		mutex_enter(&q->q_ccb_mtx);
1926	}
1927	mutex_exit(&q->q_ccb_mtx);
1928
1929	nvme_dmamem_free(sc, q->q_ccb_prpls);
1930	kmem_free(q->q_ccbs, sizeof(*ccb) * q->q_nccbs);
1931	q->q_ccbs = NULL;
1932	cv_destroy(&q->q_ccb_wait);
1933	mutex_destroy(&q->q_ccb_mtx);
1934}
1935
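/*
 * Allocate the host-side state for a queue pair: submission and completion
 * rings in DMA memory, mutexes, doorbell offsets, and (entries - 1) ccbs.
 */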
1936static struct nvme_queue *
1937nvme_q_alloc(struct nvme_softc *sc, uint16_t id, u_int entries, u_int dstrd)
1938{
1939	struct nvme_queue *q;
1940
1941	q = kmem_alloc(sizeof(*q), KM_SLEEP);
1942	q->q_sc = sc;
1943	q->q_sq_dmamem = nvme_dmamem_alloc(sc,
1944	    sizeof(struct nvme_sqe) * entries);
1945	if (q->q_sq_dmamem == NULL)
1946		goto free;
1947
1948	q->q_cq_dmamem = nvme_dmamem_alloc(sc,
1949	    sizeof(struct nvme_cqe) * entries);
1950	if (q->q_cq_dmamem == NULL)
1951		goto free_sq;
1952
1953	memset(NVME_DMA_KVA(q->q_sq_dmamem), 0, NVME_DMA_LEN(q->q_sq_dmamem));
1954	memset(NVME_DMA_KVA(q->q_cq_dmamem), 0, NVME_DMA_LEN(q->q_cq_dmamem));
1955
1956	mutex_init(&q->q_sq_mtx, MUTEX_DEFAULT, IPL_BIO);
1957	mutex_init(&q->q_cq_mtx, MUTEX_DEFAULT, IPL_BIO);
1958	q->q_sqtdbl = NVME_SQTDBL(id, dstrd);
1959	q->q_cqhdbl = NVME_CQHDBL(id, dstrd);
1960	q->q_id = id;
1961	q->q_entries = entries;
1962	q->q_sq_tail = 0;
1963	q->q_cq_head = 0;
1964	q->q_cq_phase = NVME_CQE_PHASE;
1965
1966	if (sc->sc_ops->op_q_alloc != NULL) {
1967		if (sc->sc_ops->op_q_alloc(sc, q) != 0)
1968			goto free_cq;
1969	}
1970
1971	nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_PREWRITE);
1972	nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
1973
1974	/*
1975	 * Due to the definition of full and empty queues (a queue is empty
1976	 * when head == tail and full when the tail is one less than the
1977	 * head), we can actually only have (entries - 1) commands in flight.
1978	 */
1979	if (nvme_ccbs_alloc(q, entries - 1) != 0) {
1980		aprint_error_dev(sc->sc_dev, "unable to allocate ccbs\n");
1981		goto free_cq;
1982	}
1983
1984	return q;
1985
1986free_cq:
1987	nvme_dmamem_free(sc, q->q_cq_dmamem);
1988free_sq:
1989	nvme_dmamem_free(sc, q->q_sq_dmamem);
1990free:
1991	kmem_free(q, sizeof(*q));
1992
1993	return NULL;
1994}
1995
1996static void
1997nvme_q_reset(struct nvme_softc *sc, struct nvme_queue *q)
1998{
1999
2000	memset(NVME_DMA_KVA(q->q_sq_dmamem), 0, NVME_DMA_LEN(q->q_sq_dmamem));
2001	memset(NVME_DMA_KVA(q->q_cq_dmamem), 0, NVME_DMA_LEN(q->q_cq_dmamem));
2002
2003	q->q_sq_tail = 0;
2004	q->q_cq_head = 0;
2005	q->q_cq_phase = NVME_CQE_PHASE;
2006
2007	nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_PREWRITE);
2008	nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
2009}
2010
2011static void
2012nvme_q_free(struct nvme_softc *sc, struct nvme_queue *q)
2013{
2014	nvme_ccbs_free(q);
2015	mutex_destroy(&q->q_sq_mtx);
2016	mutex_destroy(&q->q_cq_mtx);
2017	nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
2018	nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_POSTWRITE);
2019
2020	if (sc->sc_ops->op_q_alloc != NULL)
2021		sc->sc_ops->op_q_free(sc, q);
2022
2023	nvme_dmamem_free(sc, q->q_cq_dmamem);
2024	nvme_dmamem_free(sc, q->q_sq_dmamem);
2025	kmem_free(q, sizeof(*q));
2026}
2027
2028int
2029nvme_intr(void *xsc)
2030{
2031	struct nvme_softc *sc = xsc;
2032
2033	KASSERT(!sc->sc_use_mq);
2034
2035	/*
2036	 * INTx is level triggered; the controller deasserts the interrupt
2037	 * only when we advance the completion queue head via a write to the
2038	 * doorbell.  Tell the controller to mask the interrupt while we
2039	 * process the queue(s).
2040	 */
2041	nvme_write4(sc, NVME_INTMS, 1);
2042
2043	softint_schedule(sc->sc_softih[0]);
2044
2045	/* don't know, might not have been for us */
2046	return 1;
2047}
2048
2049void
2050nvme_softintr_intx(void *xq)
2051{
2052	struct nvme_queue *q = xq;
2053	struct nvme_softc *sc = q->q_sc;
2054
2055	KASSERT(!sc->sc_use_mq);
2056
2057	nvme_q_complete(sc, sc->sc_admin_q);
2058	if (sc->sc_q != NULL)
2059	        nvme_q_complete(sc, sc->sc_q[0]);
2060
2061	/*
2062	 * Processing done, tell controller to issue interrupts again. There
2063	 * is no race, as NVMe spec requires the controller to maintain state,
2064	 * and assert the interrupt whenever there are unacknowledged
2065	 * completion queue entries.
2066	 */
2067	nvme_write4(sc, NVME_INTMC, 1);
2068}
2069
2070int
2071nvme_intr_msi(void *xq)
2072{
2073	struct nvme_queue *q = xq;
2074
2075	KASSERT(q);
2076	KASSERT(q->q_sc);
2077	KASSERT(q->q_sc->sc_softih);
2078	KASSERT(q->q_sc->sc_softih[q->q_id]);
2079
2080	/*
2081	 * MSI/MSI-X are edge triggered, so we can hand processing over to
2082	 * the softint without masking the interrupt.
2083	 */
2084	softint_schedule(q->q_sc->sc_softih[q->q_id]);
2085
2086	return 1;
2087}
2088
2089void
2090nvme_softintr_msi(void *xq)
2091{
2092	struct nvme_queue *q = xq;
2093	struct nvme_softc *sc = q->q_sc;
2094
2095	nvme_q_complete(sc, q);
2096}
2097
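/*
 * Allocate, map and load a physically contiguous chunk of DMA-safe memory,
 * used for the queue rings, PRP lists and identify buffers.
 */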
2098struct nvme_dmamem *
2099nvme_dmamem_alloc(struct nvme_softc *sc, size_t size)
2100{
2101	struct nvme_dmamem *ndm;
2102	int nsegs;
2103
2104	ndm = kmem_zalloc(sizeof(*ndm), KM_SLEEP);
2105	if (ndm == NULL)
2106		return NULL;
2107
2108	ndm->ndm_size = size;
2109
2110	if (bus_dmamap_create(sc->sc_dmat, size, btoc(round_page(size)), size, 0,
2111	    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW, &ndm->ndm_map) != 0)
2112		goto ndmfree;
2113
2114	if (bus_dmamem_alloc(sc->sc_dmat, size, sc->sc_mps, 0, &ndm->ndm_seg,
2115	    1, &nsegs, BUS_DMA_WAITOK) != 0)
2116		goto destroy;
2117
2118	if (bus_dmamem_map(sc->sc_dmat, &ndm->ndm_seg, nsegs, size,
2119	    &ndm->ndm_kva, BUS_DMA_WAITOK) != 0)
2120		goto free;
2121
2122	if (bus_dmamap_load(sc->sc_dmat, ndm->ndm_map, ndm->ndm_kva, size,
2123	    NULL, BUS_DMA_WAITOK) != 0)
2124		goto unmap;
2125
2126	memset(ndm->ndm_kva, 0, size);
2127	bus_dmamap_sync(sc->sc_dmat, ndm->ndm_map, 0, size, BUS_DMASYNC_PREREAD);
2128
2129	return ndm;
2130
2131unmap:
2132	bus_dmamem_unmap(sc->sc_dmat, ndm->ndm_kva, size);
2133free:
2134	bus_dmamem_free(sc->sc_dmat, &ndm->ndm_seg, 1);
2135destroy:
2136	bus_dmamap_destroy(sc->sc_dmat, ndm->ndm_map);
2137ndmfree:
2138	kmem_free(ndm, sizeof(*ndm));
2139	return NULL;
2140}
2141
2142void
2143nvme_dmamem_sync(struct nvme_softc *sc, struct nvme_dmamem *mem, int ops)
2144{
2145	bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(mem),
2146	    0, NVME_DMA_LEN(mem), ops);
2147}
2148
2149void
2150nvme_dmamem_free(struct nvme_softc *sc, struct nvme_dmamem *ndm)
2151{
2152	bus_dmamap_unload(sc->sc_dmat, ndm->ndm_map);
2153	bus_dmamem_unmap(sc->sc_dmat, ndm->ndm_kva, ndm->ndm_size);
2154	bus_dmamem_free(sc->sc_dmat, &ndm->ndm_seg, 1);
2155	bus_dmamap_destroy(sc->sc_dmat, ndm->ndm_map);
2156	kmem_free(ndm, sizeof(*ndm));
2157}
2158
2159/*
2160 * ioctl
2161 */
2162
2163dev_type_open(nvmeopen);
2164dev_type_close(nvmeclose);
2165dev_type_ioctl(nvmeioctl);
2166
2167const struct cdevsw nvme_cdevsw = {
2168	.d_open = nvmeopen,
2169	.d_close = nvmeclose,
2170	.d_read = noread,
2171	.d_write = nowrite,
2172	.d_ioctl = nvmeioctl,
2173	.d_stop = nostop,
2174	.d_tty = notty,
2175	.d_poll = nopoll,
2176	.d_mmap = nommap,
2177	.d_kqfilter = nokqfilter,
2178	.d_discard = nodiscard,
2179	.d_flag = D_OTHER,
2180};
2181
2182/*
2183 * Accept an open operation on the control device.
2184 */
2185int
2186nvmeopen(dev_t dev, int flag, int mode, struct lwp *l)
2187{
2188	struct nvme_softc *sc;
2189	int unit = minor(dev) / 0x10000;
2190	int nsid = minor(dev) & 0xffff;
2191	int nsidx;
2192
2193	if ((sc = device_lookup_private(&nvme_cd, unit)) == NULL)
2194		return ENXIO;
2195	if ((sc->sc_flags & NVME_F_ATTACHED) == 0)
2196		return ENXIO;
2197
2198	if (nsid == 0) {
2199		/* controller */
2200		if (ISSET(sc->sc_flags, NVME_F_OPEN))
2201			return EBUSY;
2202		SET(sc->sc_flags, NVME_F_OPEN);
2203	} else {
2204		/* namespace */
2205		nsidx = nsid - 1;
2206		if (nsidx >= sc->sc_nn || sc->sc_namespaces[nsidx].dev == NULL)
2207			return ENXIO;
2208		if (ISSET(sc->sc_namespaces[nsidx].flags, NVME_NS_F_OPEN))
2209			return EBUSY;
2210		SET(sc->sc_namespaces[nsidx].flags, NVME_NS_F_OPEN);
2211	}
2212	return 0;
2213}
2214
2215/*
2216 * Accept the last close on the control device.
2217 */
2218int
2219nvmeclose(dev_t dev, int flag, int mode, struct lwp *l)
2220{
2221	struct nvme_softc *sc;
2222	int unit = minor(dev) / 0x10000;
2223	int nsid = minor(dev) & 0xffff;
2224	int nsidx;
2225
2226	sc = device_lookup_private(&nvme_cd, unit);
2227	if (sc == NULL)
2228		return ENXIO;
2229
2230	if (nsid == 0) {
2231		/* controller */
2232		CLR(sc->sc_flags, NVME_F_OPEN);
2233	} else {
2234		/* namespace */
2235		nsidx = nsid - 1;
2236		if (nsidx >= sc->sc_nn)
2237			return ENXIO;
2238		CLR(sc->sc_namespaces[nsidx].flags, NVME_NS_F_OPEN);
2239	}
2240
2241	return 0;
2242}
2243
2244/*
2245 * Handle control operations.
2246 */
2247int
2248nvmeioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
2249{
2250	struct nvme_softc *sc;
2251	int unit = minor(dev) / 0x10000;
2252	int nsid = minor(dev) & 0xffff;
2253	struct nvme_pt_command *pt;
2254
2255	sc = device_lookup_private(&nvme_cd, unit);
2256	if (sc == NULL)
2257		return ENXIO;
2258
2259	switch (cmd) {
2260	case NVME_PASSTHROUGH_CMD:
2261		pt = data;
2262		return nvme_command_passthrough(sc, data,
2263		    nsid == 0 ? pt->cmd.nsid : (uint32_t)nsid, l, nsid == 0);
2264	}
2265
2266	return ENOTTY;
2267}
2268