/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

static struct cdevsw nvmf_cdevsw;

bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

static void	nvmf_disconnect_task(void *arg, int pending);
static void	nvmf_shutdown_pre_sync(void *arg, int howto);
static void	nvmf_shutdown_post_sync(void *arg, int howto);

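/*
 * Helpers for synchronous command execution.  A caller initializes an
 * nvmf_completion_status on the stack, passes nvmf_complete (and
 * nvmf_io_complete for commands with a data transfer) as the
 * callback, and sleeps in nvmf_wait_for_reply until both the
 * completion and any data transfer have finished.  A pool mutex
 * (mtx_pool_find keyed on the status address) pairs the wakeup with
 * the sleep.
 */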
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->cqe = *cqe;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
	struct nvmf_completion_status *status = arg;
	struct mtx *mtx;

	status->io_error = error;
	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	status->io_done = true;
	mtx_unlock(mtx);
	wakeup(status);
}

void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *mtx;

	mtx = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(mtx);
	while (!status->done || !status->io_done)
		mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
	mtx_unlock(mtx);
}

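/*
 * Fabrics PROPERTY_GET/PROPERTY_SET wrappers.  Properties are the
 * fabrics analog of memory-mapped controller registers (CAP, VS, CC,
 * etc.); sizes are 4 or 8 bytes and values are converted from the
 * little-endian wire format.
 */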
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
	const struct nvmf_fabric_prop_get_rsp *rsp;
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
	    M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}

	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
	if (size == 8)
		*value = le64toh(rsp->value.u64);
	else
		*value = le32toh(rsp->value.u32.low);
	return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
	struct nvmf_completion_status status;

	nvmf_status_init(&status);
	if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
	    &status, M_WAITOK))
		return (ECONNABORTED);
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (EIO);
	}
	return (0);
}

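/*
 * Request a normal controller shutdown by setting the SHN field of
 * the CC property.  No attempt is made to wait for CSTS.SHST to
 * report shutdown completion.
 */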
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;
	int error;

	error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
	if (error != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}

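/*
 * KeepAlive handling.  The host sends a KeepAlive command every
 * KATO / 2 and disconnects if no traffic is received from the
 * controller within KATO.  When the controller advertises TBKAS
 * (traffic-based keep alive), any command traffic during the
 * interval substitutes for an explicit KeepAlive command.
 */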
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
	if (traffic == 0) {
		device_printf(sc->dev,
		    "disconnecting due to KeepAlive timeout\n");
		nvmf_disconnect(sc);
		return;
	}

	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc = arg;

	atomic_store_int(&sc->ka_active_rx_traffic, 1);
	if (cqe->status != 0) {
		device_printf(sc->dev,
		    "KeepAlive response reported status %#x\n",
		    le16toh(cqe->status));
	}
}

static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc = arg;
	int traffic;

	/*
	 * Don't bother sending a KeepAlive command if TKAS is active
	 * and another command has been sent during the interval.
	 */
	traffic = atomic_load_int(&sc->ka_active_tx_traffic);
	if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
	    sc, M_NOWAIT))
		device_printf(sc->dev,
		    "Failed to allocate KeepAlive command\n");

	/* Clear ka_active_tx_traffic after sending the keep alive command. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

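/*
 * Instance variables are constructed from a handoff structure passed
 * in from userland (typically by nvmecontrol(8) after it connects the
 * queue pairs).  The controller data and I/O queue parameters are
 * copied in from user memory and sanity checked.
 */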
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
	size_t len;
	u_int i;
	int error;

	memset(ivars, 0, sizeof(*ivars));

	if (!hh->admin.admin || hh->num_io_queues < 1)
		return (EINVAL);

	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
	if (error != 0)
		goto out;
	nvme_controller_data_swapbytes(ivars->cdata);

	len = hh->num_io_queues * sizeof(*ivars->io_params);
	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
	error = copyin(hh->io, ivars->io_params, len);
	if (error != 0)
		goto out;
	for (i = 0; i < hh->num_io_queues; i++) {
		if (ivars->io_params[i].admin) {
			error = EINVAL;
			goto out;
		}

		/* Require all I/O queues to be the same size. */
		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
			error = EINVAL;
			goto out;
		}
	}

	ivars->hh = hh;
	return (0);

out:
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
	return (error);
}

void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
	free(ivars->io_params, M_NVMF);
	free(ivars->cdata, M_NVMF);
}

static int
nvmf_probe(device_t dev)
{
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	char desc[260];

	if (ivars == NULL)
		return (ENXIO);

	snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
	device_set_desc_copy(dev, desc);
	return (BUS_PROBE_DEFAULT);
}

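/*
 * Adopt the admin and I/O queue pairs handed off from userland and
 * start the KeepAlive timers.  A KATO of zero disables KeepAlive.
 */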
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
	char name[16];

	/* Set up the admin queue. */
	sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
	    "admin queue");
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Set up I/O queues. */
	sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = ivars->hh->num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
		    &ivars->io_params[i], name);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i);
			return (ENXIO);
		}
	}

	/* Start KeepAlive timers. */
	if (ivars->hh->kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}

typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

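/*
 * Walk the active namespace list one IDENTIFY page at a time.  *nsidp
 * is the scan cursor: it holds the last NSID of the previous page (0
 * to start) and is advanced past the last NSID seen, or reset to 0
 * once the scan is complete.  The cursor stops before 0xfffffffe and
 * 0xffffffff, which are reserved/broadcast values rather than valid
 * namespace IDs.
 */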
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status status;
	uint32_t nsid;

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(status.cqe.status));
		return (false);
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    status.io_error);
		return (false);
	}

	for (u_int i = 0; i < nitems(nslist->ns); i++) {
		nsid = nslist->ns[i];
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&status);
		nvmf_status_wait_io(&status);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &status, nvmf_io_complete, &status, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&status);

		if (status.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(status.cqe.status));
			return (false);
		}

		if (status.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, status.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);
		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	if (nsid >= 0xfffffffd)
		*nsidp = 0;
	else
		*nsidp = nsid + 1;
	return (true);
}

static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *data;
	struct nvme_ns_list *nslist;
	uint32_t nsid;
	bool retval;

	nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nsid = 0;
	retval = true;
	for (;;) {
		if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
		    cb_arg)) {
			retval = false;
			break;
		}
		if (nsid == 0)
			break;
	}

	free(data, M_NVMF);
	free(nslist, M_NVMF);
	return (retval);
}

static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}

static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}

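/*
 * Attach sequence: adopt the queue pairs from the handoff, fetch the
 * CAP and VS properties, compute the maximum transfer size from MDTS,
 * create the CAM SIM, start AER handling, enumerate active
 * namespaces, and finally expose a control device and register
 * shutdown handlers.
 */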
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	struct nvmf_ivars *ivars = device_get_ivars(dev);
	uint64_t val;
	u_int i;
	int error;

	if (ivars == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = ivars->hh->trtype;
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	/* Claim the cdata pointer from ivars. */
	sc->cdata = ivars->cdata;
	ivars->cdata = NULL;

	nvmf_init_aer(sc);

	/* TODO: Multiqueue support. */
	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;

	error = nvmf_establish_connection(sc, ivars);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/* Honor MDTS if it is set. */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		error = ENXIO;
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);

	return (0);
out:
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}

void
nvmf_disconnect(struct nvmf_softc *sc)
{
	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}

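/*
 * Tear down the current association after a transport error or a
 * KeepAlive timeout.  Namespace consumers are quiesced and all queue
 * pairs are destroyed; the softc stays around so that a later
 * NVMF_RECONNECT_HOST ioctl can establish a new association.
 */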
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc = arg;
	u_int i;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL) {
		/*
		 * Ignore transport errors if there is no active
		 * association.
		 */
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->detaching) {
		if (sc->admin != NULL) {
			/*
			 * This unsticks the detach process if a
			 * transport error occurs during detach.
			 */
			nvmf_shutdown_qp(sc->admin);
		}
		sx_xunlock(&sc->connection_lock);
		return;
	}

	if (sc->cdev == NULL) {
		/*
		 * A transport error occurred during attach
		 * (nvmf_add_namespaces).  Shut down the admin queue.
		 */
		nvmf_shutdown_qp(sc->admin);
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_disconnect_ns(sc->ns[i]);
	}

	/* Shut down the existing qpairs. */
	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

	sx_xunlock(&sc->connection_lock);
}

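/*
 * Establish a new association using queue pairs handed off from
 * userland.  The new association must be with the same NVMe
 * subsystem; the controller ID is permitted to change when the
 * remote system uses the dynamic controller model.
 */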
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}

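/*
 * System shutdown hooks.  Before filesystems sync, abort pending I/O
 * if the association is already disconnected so that unmounts do not
 * hang.  After the sync, shut down the controller and destroy the
 * queue pairs of a still-connected association.
 */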
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is disconnected, abort any pending
	 * requests with an error to permit filesystems to unmount
	 * without hanging.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	for (u_int i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_shutdown_ns(sc->ns[i]);
	}
	nvmf_shutdown_sim(sc);
	sx_xunlock(&sc->connection_lock);
}

static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc = arg;

	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	/*
	 * If this association is connected, disconnect gracefully.
	 */
	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL || sc->detaching) {
		sx_xunlock(&sc->connection_lock);
		return;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	nvmf_shutdown_controller(sc);
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;
	sx_xunlock(&sc->connection_lock);
}

static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc = device_get_softc(dev);
	u_int i;

	destroy_dev(sc->cdev);

	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_destroy_ns(sc->ns[i]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (i = 0; i < sc->num_io_queues; i++) {
		nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}

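/*
 * Namespace rescan support, driven by AERs and by reconnects.  A
 * single namespace is refreshed from its IDENTIFY data; namespaces
 * that have disappeared (zero size or gaps in the active namespace
 * list) are destroyed.
 */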
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *ns;

	/* XXX: Needs locking around sc->ns[]. */
	ns = sc->ns[nsid - 1];
	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;
		}
	} else {
		/* XXX: Needs locking */
		if (ns == NULL) {
			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
		} else {
			if (!nvmf_update_ns(ns, data)) {
				nvmf_destroy_ns(ns);
				sc->ns[nsid - 1] = NULL;
			}
		}
	}

	nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
	struct nvmf_completion_status status;
	struct nvme_namespace_data *data;

	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

	nvmf_status_init(&status);
	nvmf_status_wait_io(&status);
	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
	    &status, nvmf_io_complete, &status, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY namespace %u command\n", nsid);
		free(data, M_NVMF);
		return;
	}
	nvmf_wait_for_reply(&status);

	if (status.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
		    le16toh(status.cqe.status));
		free(data, M_NVMF);
		return;
	}

	if (status.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY namespace %u failed with I/O error %d\n",
		    nsid, status.io_error);
		free(data, M_NVMF);
		return;
	}

	nvme_namespace_data_swapbytes(data);

	nvmf_rescan_ns_1(sc, nsid, data);

	free(data, M_NVMF);
}

static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *ns;

	for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
		/* XXX: Needs locking around sc->ns[]. */
		ns = sc->ns[nsid - 1];
		if (ns != NULL) {
			nvmf_destroy_ns(ns);
			sc->ns[nsid - 1] = NULL;

			nvmf_sim_rescan_ns(sc, nsid);
		}
	}
}

static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *last_nsid = arg;

	/* Check for any gaps prior to this namespace. */
	nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
	*last_nsid = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}

void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t last_nsid;

	last_nsid = 0;
	if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
		return;

	/*
	 * Check for any namespace devices after the last active
	 * namespace.
	 */
	nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}

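/*
 * Execute a passthrough command on behalf of an ioctl.  The user
 * buffer is bounced through a kernel allocation for the data
 * transfer.
 */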
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
	struct nvmf_completion_status status;
	struct nvme_command cmd;
	struct memdesc mem;
	struct nvmf_host_qpair *qp;
	struct nvmf_request *req;
	void *buf;
	int error;

	if (pt->len > sc->max_xfer_size)
		return (EINVAL);

	buf = NULL;
	if (pt->len != 0) {
		/*
		 * XXX: Depending on the size we may want to pin the
		 * user pages and use a memdesc with vm_page_t's
		 * instead.
		 */
		buf = malloc(pt->len, M_NVMF, M_WAITOK);
		if (pt->is_read == 0) {
			error = copyin(pt->buf, buf, pt->len);
			if (error != 0) {
				free(buf, M_NVMF);
				return (error);
			}
		} else {
			/* Ensure no kernel data is leaked to userland. */
			memset(buf, 0, pt->len);
		}
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opc = pt->cmd.opc;
	cmd.fuse = pt->cmd.fuse;
	cmd.nsid = pt->cmd.nsid;
	cmd.cdw10 = pt->cmd.cdw10;
	cmd.cdw11 = pt->cmd.cdw11;
	cmd.cdw12 = pt->cmd.cdw12;
	cmd.cdw13 = pt->cmd.cdw13;
	cmd.cdw14 = pt->cmd.cdw14;
	cmd.cdw15 = pt->cmd.cdw15;

	if (admin)
		qp = sc->admin;
	else
		qp = nvmf_select_io_queue(sc);
	nvmf_status_init(&status);
	req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
	if (req == NULL) {
		device_printf(sc->dev, "failed to send passthrough command\n");
		error = ECONNABORTED;
		goto error;
	}

	if (pt->len != 0) {
		mem = memdesc_vaddr(buf, pt->len);
		nvmf_capsule_append_data(req->nc, &mem, pt->len,
		    pt->is_read == 0, nvmf_io_complete, &status);
		nvmf_status_wait_io(&status);
	}

	nvmf_submit_request(req);
	nvmf_wait_for_reply(&status);

	memset(&pt->cpl, 0, sizeof(pt->cpl));
	pt->cpl.cdw0 = status.cqe.cdw0;
	pt->cpl.status = status.cqe.status;

	error = status.io_error;
	if (error == 0 && pt->len != 0 && pt->is_read != 0)
		error = copyout(buf, pt->buf, pt->len);
error:
	free(buf, M_NVMF);
	return (error);
}

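/*
 * Character device ioctls: NVMe passthrough and NSID queries shared
 * with nvme(4), plus fabrics-specific requests to fetch reconnect
 * parameters and hand off a new association.
 */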
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc = cdev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;
	struct nvmf_reconnect_params *rp;
	struct nvmf_handoff_host *hh;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvmf_passthrough_cmd(sc, pt, true));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		rp = (struct nvmf_reconnect_params *)arg;
		if ((sc->cdata->fcatt & 1) == 0)
			rp->cntlid = NVMF_CNTLID_DYNAMIC;
		else
			rp->cntlid = sc->cdata->ctrlr_id;
		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
		return (0);
	case NVMF_RECONNECT_HOST:
		hh = (struct nvmf_handoff_host *)arg;
		return (nvmf_reconnect_host(sc, hh));
	default:
		return (ENOTTY);
	}
}

static struct cdevsw nvmf_cdevsw = {
	.d_version = D_VERSION,
	.d_ioctl = nvmf_ioctl
};

static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	switch (what) {
	case MOD_LOAD:
		return (nvmf_ctl_load());
	case MOD_QUIESCE:
		return (0);
	case MOD_UNLOAD:
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static device_method_t nvmf_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,     nvmf_probe),
	DEVMETHOD(device_attach,    nvmf_attach),
	DEVMETHOD(device_detach,    nvmf_detach),
	DEVMETHOD_END
};

driver_t nvme_nvmf_driver = {
	"nvme",
	nvmf_methods,
	sizeof(struct nvmf_softc),
};

DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);