/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_subr.h>
#include <dev/nvmf/controller/nvmft_var.h>

static void	nvmft_controller_shutdown(void *arg, int pending);
static void	nvmft_controller_terminate(void *arg, int pending);

int
nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;
	size_t retval;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, &retval);

	sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);

	return (retval);
}

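/*
 * Allocate and initialize a controller for a new association and
 * link it into the port's list.  Called with the port lock held
 * exclusively since this inserts into np->controllers.
 */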
static struct nvmft_controller *
nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;

	ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
	ctrlr->cntlid = cntlid;
	nvmft_port_ref(np);
	TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
	ctrlr->np = np;
	mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
	callout_init(&ctrlr->ka_timer, 1);
	TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
	TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
	    nvmft_controller_terminate, ctrlr);

	ctrlr->cdata = np->cdata;
	ctrlr->cdata.ctrlr_id = htole16(cntlid);
	memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
	memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
	ctrlr->hip.power_cycles[0] = 1;
	ctrlr->create_time = sbinuptime();

	ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
	    M_WAITOK | M_ZERO);

	return (ctrlr);
}

static void
nvmft_controller_free(struct nvmft_controller *ctrlr)
{
	mtx_destroy(&ctrlr->lock);
	MPASS(ctrlr->io_qpairs == NULL);
	free(ctrlr->changed_ns, M_NVMFT);
	free(ctrlr, M_NVMFT);
}

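/*
 * KeepAlive timer: if no command traffic was observed during the
 * last period, treat it as a transport timeout; otherwise re-arm
 * the callout for another period.
 */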
static void
nvmft_keep_alive_timer(void *arg)
{
	struct nvmft_controller *ctrlr = arg;
	int traffic;

	if (ctrlr->shutdown)
		return;

	traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
	if (traffic == 0) {
		nvmft_printf(ctrlr,
		    "disconnecting due to KeepAlive timeout\n");
		nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
		return;
	}

	callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
}

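/*
 * Handle the CONNECT command for a new association's admin queue:
 * allocate a CNTLID and controller and start the KeepAlive timer
 * if the host requested one.
 */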
int
nvmft_handoff_admin_queue(struct nvmft_port *np,
    const struct nvmf_handoff_controller_qpair *handoff,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	uint32_t kato;
	int cntlid;

	if (cmd->qid != htole16(0))
		return (EINVAL);

	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
	    "admin queue");

	sx_xlock(&np->lock);
	cntlid = alloc_unr(np->ids);
	if (cntlid == -1) {
		sx_xunlock(&np->lock);
		printf("NVMFT: Unable to allocate controller for %.*s\n",
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_HOST);
		nvmft_qpair_destroy(qp);
		return (ENOMEM);
	}

#ifdef INVARIANTS
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		KASSERT(ctrlr->cntlid != cntlid,
		    ("%s: duplicate controllers with id %d", __func__, cntlid));
	}
#endif

	ctrlr = nvmft_controller_alloc(np, cntlid, data);
	nvmft_printf(ctrlr, "associated with %.*s\n",
	    (int)sizeof(data->hostnqn), data->hostnqn);
	ctrlr->admin = qp;
	ctrlr->trtype = handoff->trtype;

	/*
	 * The spec requires a non-zero KeepAlive timer, but allow a
	 * zero KATO value to match Linux.
	 */
	kato = le32toh(cmd->kato);
	if (kato != 0) {
		/*
		 * Round up to 1 second matching granularity
		 * advertised in cdata.
		 */
		ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
		callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
		    nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
	}

	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_xunlock(&np->lock);

	return (0);
}

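/*
 * Handle the CONNECT command for an I/O queue: look up the
 * controller by CNTLID, validate the host's identity and the queue
 * ID, and attach the queue pair to the controller.
 */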
int
nvmft_handoff_io_queue(struct nvmft_port *np,
    const struct nvmf_handoff_controller_qpair *handoff,
    const struct nvmf_fabric_connect_cmd *cmd,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvmft_controller *ctrlr;
	struct nvmft_qpair *qp;
	char name[16];
	uint16_t cntlid, qid;

	qid = le16toh(cmd->qid);
	if (qid == 0)
		return (EINVAL);
	cntlid = le16toh(data->cntlid);

	snprintf(name, sizeof(name), "I/O queue %u", qid);
	qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);

	sx_slock(&np->lock);
	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		if (ctrlr->cntlid == cntlid)
			break;
	}
	if (ctrlr == NULL) {
		sx_sunlock(&np->lock);
		printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
		    cntlid, qid, (int)sizeof(data->hostnqn),
		    data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (ENOENT);
	}

	if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostid mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	/* XXX: Require handoff->trtype == ctrlr->trtype? */

	mtx_lock(&ctrlr->lock);
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u on disabled controller from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->num_io_queues == 0) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create I/O queue %u without enabled queues from %.*s\n",
		    qid, (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (qid > ctrlr->num_io_queues) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to create invalid I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_invalid_parameters(qp, cmd, false,
		    offsetof(struct nvmf_fabric_connect_cmd, qid));
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}
	if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
		mtx_unlock(&ctrlr->lock);
		sx_sunlock(&np->lock);
		nvmft_printf(ctrlr,
		    "attempt to re-create I/O queue %u from %.*s\n", qid,
		    (int)sizeof(data->hostnqn), data->hostnqn);
		nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmft_qpair_destroy(qp);
		return (EINVAL);
	}

	ctrlr->io_qpairs[qid - 1].qp = qp;
	mtx_unlock(&ctrlr->lock);
	nvmft_finish_accept(qp, cmd, ctrlr);
	sx_sunlock(&np->lock);

	return (0);
}

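/*
 * Shutdown (or reset) task: quiesce and destroy the I/O queues,
 * drain pending CTL commands, and update CSTS.  Unless the host
 * re-enables the controller, the association is torn down later by
 * the terminate task.
 */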
static void
nvmft_controller_shutdown(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;

	MPASS(pending == 1);

	/*
	 * Shutdown all I/O queues to terminate pending datamoves and
	 * stop receiving new commands.
	 */
	mtx_lock(&ctrlr->lock);
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL) {
			ctrlr->io_qpairs[i].shutdown = true;
			mtx_unlock(&ctrlr->lock);
			nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
			mtx_lock(&ctrlr->lock);
		}
	}
	mtx_unlock(&ctrlr->lock);

	/* Terminate active CTL commands. */
	nvmft_terminate_commands(ctrlr);

	/* Wait for all pending CTL commands to complete. */
	mtx_lock(&ctrlr->lock);
	while (ctrlr->pending_commands != 0)
		mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
		    hz / 100);
	mtx_unlock(&ctrlr->lock);

	/* Delete all of the I/O queues. */
	for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
		if (ctrlr->io_qpairs[i].qp != NULL)
			nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
	}
	free(ctrlr->io_qpairs, M_NVMFT);
	ctrlr->io_qpairs = NULL;

	mtx_lock(&ctrlr->lock);
	ctrlr->num_io_queues = 0;

	/* Mark shutdown complete. */
	if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
	}

	if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
		ctrlr->shutdown = false;
	}
	mtx_unlock(&ctrlr->lock);

	/*
	 * If the admin queue was closed while shutting down or a
	 * fatal controller error has occurred, terminate the
	 * association immediately, otherwise wait up to 2 minutes
	 * (NVMe-over-Fabrics 1.1 4.6).
	 */
	if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
		nvmft_controller_terminate(ctrlr, 0);
	else
		taskqueue_enqueue_timeout(taskqueue_thread,
		    &ctrlr->terminate_task, hz * 60 * 2);
}

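/*
 * Terminate task: tear down the association by destroying the admin
 * queue pair, releasing the CNTLID, and freeing the controller,
 * unless the host re-enabled the controller first.
 */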
static void
nvmft_controller_terminate(void *arg, int pending)
{
	struct nvmft_controller *ctrlr = arg;
	struct nvmft_port *np;
	bool wakeup_np;

	/* If the controller has been re-enabled, nothing to do. */
	mtx_lock(&ctrlr->lock);
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
		mtx_unlock(&ctrlr->lock);

		if (ctrlr->ka_sbt != 0)
			callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
			    C_HARDCLOCK);
		return;
	}

	/* Disable updates to CC while destroying admin qpair. */
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_qpair_destroy(ctrlr->admin);

	/* Remove association (CNTLID). */
	np = ctrlr->np;
	sx_xlock(&np->lock);
	TAILQ_REMOVE(&np->controllers, ctrlr, link);
	free_unr(np->ids, ctrlr->cntlid);
	wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
	sx_xunlock(&np->lock);
	if (wakeup_np)
		wakeup(np);

	callout_drain(&ctrlr->ka_timer);

	nvmft_printf(ctrlr, "association terminated\n");
	nvmft_controller_free(ctrlr);
	nvmft_port_rele(np);
}

void
nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
    int error)
{
	/*
	 * If a queue pair is closed, that isn't an error per se.
	 * That just means additional commands cannot be received on
	 * that queue pair.
	 *
	 * If the admin queue pair is closed while idle or while
	 * shutting down, terminate the association immediately.
	 *
	 * If an I/O queue pair is closed, just ignore it.
	 */
	if (error == 0) {
		if (qp != ctrlr->admin)
			return;

		mtx_lock(&ctrlr->lock);
		if (ctrlr->shutdown) {
			ctrlr->admin_closed = true;
			mtx_unlock(&ctrlr->lock);
			return;
		}

		if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
			MPASS(ctrlr->num_io_queues == 0);
			mtx_unlock(&ctrlr->lock);

			/*
			 * Ok to drop lock here since ctrlr->cc can't
			 * change if the admin queue pair has closed.
			 * This also means no new queues can be handed
			 * off, etc.  Note that since there are no I/O
			 * queues, only the admin queue needs to be
			 * destroyed, so it is safe to skip
			 * nvmft_controller_shutdown and just schedule
			 * nvmft_controller_terminate.  Note that we
			 * cannot call nvmft_controller_terminate from
			 * here directly as this is called from the
			 * transport layer and freeing the admin qpair
			 * might deadlock waiting for the current
			 * thread to exit.
			 */
			if (taskqueue_cancel_timeout(taskqueue_thread,
			    &ctrlr->terminate_task, NULL) == 0)
				taskqueue_enqueue_timeout(taskqueue_thread,
				    &ctrlr->terminate_task, 0);
			return;
		}

		/*
		 * Treat closing of the admin queue pair while enabled
		 * as a transport error.  Note that the admin queue
		 * pair has been closed.
		 */
		ctrlr->admin_closed = true;
	} else
		mtx_lock(&ctrlr->lock);

	/* Ignore transport errors if we are already shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
	ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
	ctrlr->shutdown = true;
	mtx_unlock(&ctrlr->lock);

	callout_stop(&ctrlr->ka_timer);
	taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}

/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
static struct mbuf *
m_getml(size_t len, int how)
{
	struct mbuf *m, *n;

	m = m_getm2(NULL, len, how, MT_DATA, 0);
	if (m == NULL)
		return (NULL);
	for (n = m; len > 0; n = n->m_next) {
		n->m_len = M_SIZE(n);
		if (n->m_len >= len) {
			n->m_len = len;
			MPASS(n->m_next == NULL);
		}
		len -= n->m_len;
	}
	return (m);
}

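/* Zero 'len' bytes of an mbuf chain starting at byte 'offset'. */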
static void
m_zero(struct mbuf *m, u_int offset, u_int len)
{
	u_int todo;

	if (len == 0)
		return;

	while (m->m_len <= offset) {
		offset -= m->m_len;
		m = m->m_next;
	}

	todo = m->m_len - offset;
	if (todo > len)
		todo = len;
	memset(mtodo(m, offset), 0, todo);
	m = m->m_next;
	len -= todo;

	while (len > 0) {
		todo = m->m_len;
		if (todo > len)
			todo = len;
		memset(mtod(m, void *), 0, todo);
		m = m->m_next;
		len -= todo;
	}
}

static void
handle_get_log_page(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct mbuf *m;
	uint64_t offset;
	uint32_t numd;
	size_t len, todo;
	u_int status;
	uint8_t lid;
	bool rae;

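	/*
	 * GET LOG PAGE fields: the log page ID and the Retain
	 * Asynchronous Event bit are in CDW10, the zeroes-based
	 * dword count is split across CDW10/CDW11, and the byte
	 * offset is split across CDW12/CDW13.
	 */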
	lid = le32toh(cmd->cdw10) & 0xff;
	rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;

	if (offset % 4 != 0) {
		status = NVME_SC_INVALID_FIELD;
		goto done;
	}

	len = (numd + 1) * 4;

	switch (lid) {
	case NVME_LOG_ERROR:
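		/* No errors are recorded, so the log is all zeroes. */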
		todo = 0;

		m = m_getml(len, M_WAITOK);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
	{
		struct nvme_health_information_page hip;

		if (offset >= sizeof(hip)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(hip) - offset;
		if (todo > len)
			todo = len;

		mtx_lock(&ctrlr->lock);
		hip = ctrlr->hip;
		hip.controller_busy_time[0] =
		    sbintime_getsec(ctrlr->busy_total) / 60;
		hip.power_on_hours[0] =
		    sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
		mtx_unlock(&ctrlr->lock);

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&hip + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	}
	case NVME_LOG_FIRMWARE_SLOT:
		if (offset >= sizeof(ctrlr->np->fp)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(ctrlr->np->fp) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (offset >= sizeof(*ctrlr->changed_ns)) {
			status = NVME_SC_INVALID_FIELD;
			goto done;
		}
		todo = sizeof(*ctrlr->changed_ns) - offset;
		if (todo > len)
			todo = len;

		m = m_getml(len, M_WAITOK);
		mtx_lock(&ctrlr->lock);
		m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
		if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
		if (!rae)
			ctrlr->changed_ns_reported = false;
		mtx_unlock(&ctrlr->lock);
		if (todo != len)
			m_zero(m, todo, len - todo);
		status = nvmf_send_controller_data(nc, 0, m, len);
		MPASS(status != NVMF_MORE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
		    lid);
		status = NVME_SC_INVALID_FIELD;
		break;
	}

done:
	if (status == NVMF_SUCCESS_SENT)
		nvmft_command_completed(ctrlr->admin, nc);
	else
		nvmft_send_generic_error(ctrlr->admin, nc, status);
	nvmf_free_capsule(nc);
}

static void
m_free_nslist(struct mbuf *m)
{
	free(m->m_ext.ext_arg1, M_NVMFT);
}

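/*
 * IDENTIFY: controller data and the active namespace list are
 * answered here; namespace-specific CNS values are handed to CTL
 * via nvmft_dispatch_command.
 */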
static void
handle_identify_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct mbuf *m;
	size_t data_len;
	u_int status;
	uint8_t cns;

	cns = le32toh(cmd->cdw10) & 0xFF;
	data_len = nvmf_capsule_data_len(nc);
	if (data_len != sizeof(ctrlr->cdata)) {
		nvmft_printf(ctrlr,
		    "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
		    cns);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		return;
	}

	switch (cns) {
	case 0:	/* Namespace data. */
	case 3:	/* Namespace Identification Descriptor list. */
		nvmft_dispatch_command(ctrlr->admin, nc, true);
		return;
	case 1:
		/* Controller data. */
		m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
		m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
		status = nvmf_send_controller_data(nc, 0, m,
		    sizeof(ctrlr->cdata));
		MPASS(status != NVMF_MORE);
		break;
	case 2:
	{
		/* Active namespace list. */
		struct nvme_ns_list *nslist;
		uint32_t nsid;

		nsid = le32toh(cmd->nsid);
		if (nsid >= 0xfffffffe) {
			status = NVME_SC_INVALID_FIELD;
			break;
		}

		nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
		nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
		m = m_get(M_WAITOK, MT_DATA);
		m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
		    nslist, NULL, 0, EXT_CTL);
		m->m_len = sizeof(*nslist);
		status = nvmf_send_controller_data(nc, 0, m, m->m_len);
		MPASS(status != NVMF_MORE);
		break;
	}
	default:
		nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
		status = NVME_SC_INVALID_FIELD;
		break;
	}

	if (status == NVMF_SUCCESS_SENT)
		nvmft_command_completed(ctrlr->admin, nc);
	else
		nvmft_send_generic_error(ctrlr->admin, nc, status);
	nvmf_free_capsule(nc);
}

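/*
 * SET FEATURES: only the Number of Queues and Asynchronous Event
 * Configuration features are supported.
 */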
static void
handle_set_features(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	struct nvme_completion cqe;
	uint8_t fid;

	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
	switch (fid) {
	case NVME_FEAT_NUMBER_OF_QUEUES:
	{
		uint32_t num_queues;
		struct nvmft_io_qpair *io_qpairs;

		num_queues = le32toh(cmd->cdw11) & 0xffff;

		/* 5.12.1.7: 65535 is invalid. */
		if (num_queues == 65535)
			goto error;

		/* Fabrics requires the same number of SQs and CQs. */
		if (le32toh(cmd->cdw11) >> 16 != num_queues)
			goto error;

		/* Convert to 1's based */
		num_queues++;

		io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
		    M_NVMFT, M_WAITOK | M_ZERO);

		mtx_lock(&ctrlr->lock);
		if (ctrlr->num_io_queues != 0) {
			mtx_unlock(&ctrlr->lock);
			free(io_qpairs, M_NVMFT);
			nvmft_send_generic_error(ctrlr->admin, nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			nvmf_free_capsule(nc);
			return;
		}

		ctrlr->num_io_queues = num_queues;
		ctrlr->io_qpairs = io_qpairs;
		mtx_unlock(&ctrlr->lock);

		nvmft_init_cqe(&cqe, nc, 0);
		cqe.cdw0 = cmd->cdw11;
		nvmft_send_response(ctrlr->admin, &cqe);
		nvmf_free_capsule(nc);
		return;
	}
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
	{
		uint32_t aer_mask;

		aer_mask = le32toh(cmd->cdw11);

		/* Check for any reserved or unimplemented feature bits. */
		if ((aer_mask & 0xffffc000) != 0)
			goto error;

		mtx_lock(&ctrlr->lock);
		ctrlr->aer_mask = aer_mask;
		mtx_unlock(&ctrlr->lock);
		nvmft_send_success(ctrlr->admin, nc);
		return;
	}
	default:
		nvmft_printf(ctrlr,
		    "Unsupported feature ID %u for SET_FEATURES\n", fid);
		goto error;
	}

error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
	nvmf_free_capsule(nc);
}

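/*
 * Validate and apply a host write to the CC property.  Returns false
 * if the write is rejected.  *need_shutdown is set when the write
 * requests a shutdown or reset; the caller must then schedule the
 * shutdown task.
 */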
static bool
update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
{
	struct nvmft_port *np = ctrlr->np;
	uint32_t changes;

	*need_shutdown = false;

	mtx_lock(&ctrlr->lock);

	/* Don't allow any changes while shutting down. */
	if (ctrlr->shutdown) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
		mtx_unlock(&ctrlr->lock);
		return (false);
	}

	changes = ctrlr->cc ^ new_cc;
	ctrlr->cc = new_cc;

	/* Handle shutdown requests. */
	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
		ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
		ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
		ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
		ctrlr->shutdown = true;
		*need_shutdown = true;
		nvmft_printf(ctrlr, "shutdown requested\n");
	}

	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
			/* Controller reset. */
			nvmft_printf(ctrlr, "reset requested\n");
			ctrlr->shutdown = true;
			*need_shutdown = true;
		} else
			ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
	}
	mtx_unlock(&ctrlr->lock);

	return (true);
}

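/*
 * PROPERTY GET: only the mandatory CAP, VS, CC, and CSTS properties
 * are implemented.
 */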
static void
handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_get_cmd *pget)
{
	struct nvmf_fabric_prop_get_rsp rsp;

	nvmft_init_cqe(&rsp, nc, 0);

	switch (le32toh(pget->ofst)) {
	case NVMF_PROP_CAP:
		if (pget->attrib.size != NVMF_PROP_SIZE_8)
			goto error;
		rsp.value.u64 = htole64(ctrlr->np->cap);
		break;
	case NVMF_PROP_VS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = ctrlr->cdata.ver;
		break;
	case NVMF_PROP_CC:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->cc);
		break;
	case NVMF_PROP_CSTS:
		if (pget->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		rsp.value.u32.low = htole32(ctrlr->csts);
		break;
	default:
		goto error;
	}

	nvmft_send_response(ctrlr->admin, &rsp);
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

static void
handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
    const struct nvmf_fabric_prop_set_cmd *pset)
{
	bool need_shutdown;

	need_shutdown = false;
	switch (le32toh(pset->ofst)) {
	case NVMF_PROP_CC:
		if (pset->attrib.size != NVMF_PROP_SIZE_4)
			goto error;
		if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
		    &need_shutdown))
			goto error;
		break;
	default:
		goto error;
	}

	nvmft_send_success(ctrlr->admin, nc);
	if (need_shutdown) {
		callout_stop(&ctrlr->ka_timer);
		taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
	}
	return;
error:
	nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}

static void
handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
{
	switch (fc->fctype) {
	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
		handle_property_get(ctrlr, nc,
		    (const struct nvmf_fabric_prop_get_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
		handle_property_set(ctrlr, nc,
		    (const struct nvmf_fabric_prop_set_cmd *)fc);
		break;
	case NVMF_FABRIC_COMMAND_CONNECT:
		nvmft_printf(ctrlr,
		    "CONNECT command on connected admin queue\n");
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		break;
	case NVMF_FABRIC_COMMAND_DISCONNECT:
		nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
		nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
		    fc->fctype);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		break;
	}
	nvmf_free_capsule(nc);
}

void
nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
    struct nvmf_capsule *nc)
{
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	/* Only permit Fabrics commands while a controller is disabled. */
	if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
	    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
		nvmft_printf(ctrlr,
		    "Unsupported admin opcode %#x while disabled\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		nvmf_free_capsule(nc);
		return;
	}

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_GET_LOG_PAGE:
		handle_get_log_page(ctrlr, nc, cmd);
		break;
	case NVME_OPC_IDENTIFY:
		handle_identify_command(ctrlr, nc, cmd);
		break;
	case NVME_OPC_SET_FEATURES:
		handle_set_features(ctrlr, nc, cmd);
		break;
	case NVME_OPC_ASYNC_EVENT_REQUEST:
		mtx_lock(&ctrlr->lock);
		if (ctrlr->aer_pending == NVMFT_NUM_AER) {
			mtx_unlock(&ctrlr->lock);
			nvmft_send_error(ctrlr->admin, nc,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		} else {
			/* NB: Store the CID without byte-swapping. */
			ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
			ctrlr->aer_pending++;
			ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
			mtx_unlock(&ctrlr->lock);
		}
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_KEEP_ALIVE:
		nvmft_send_success(ctrlr->admin, nc);
		nvmf_free_capsule(nc);
		break;
	case NVME_OPC_FABRICS_COMMANDS:
		handle_admin_fabrics_command(ctrlr, nc,
		    (const struct nvmf_fabric_cmd *)cmd);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(ctrlr->admin, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

void
nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
    struct nvmf_capsule *nc)
{
	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);

	atomic_store_int(&ctrlr->ka_active_traffic, 1);

	switch (cmd->opc) {
	case NVME_OPC_FLUSH:
		if (cmd->nsid == htole32(0xffffffff)) {
			nvmft_send_generic_error(qp, nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			nvmf_free_capsule(nc);
			break;
		}
		/* FALLTHROUGH */
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_DATASET_MANAGEMENT:
	case NVME_OPC_VERIFY:
		nvmft_dispatch_command(qp, nc, false);
		break;
	default:
		nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
		nvmft_send_generic_error(qp, nc,
		    NVME_SC_INVALID_OPCODE);
		nvmf_free_capsule(nc);
		break;
	}
}

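/*
 * Complete a pending ASYNC EVENT REQUEST, if any, to report an
 * event of the given type.  Events that are masked off or that
 * arrive with no AER pending are dropped.
 */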
static void
nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
    u_int type, uint8_t info, uint8_t log_page_id)
{
	struct nvme_completion cpl;

	MPASS(type <= 7);

	/* Drop events that are not enabled. */
	mtx_lock(&ctrlr->lock);
	if ((ctrlr->aer_mask & aer_mask) == 0) {
		mtx_unlock(&ctrlr->lock);
		return;
	}

	/*
	 * If there is no pending AER command, drop it.
	 * XXX: Should we queue these?
	 */
	if (ctrlr->aer_pending == 0) {
		mtx_unlock(&ctrlr->lock);
		nvmft_printf(ctrlr,
		    "dropping AER type %u, info %#x, page %#x\n",
		    type, info, log_page_id);
		return;
	}

	memset(&cpl, 0, sizeof(cpl));
	cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
	ctrlr->aer_pending--;
	ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
	mtx_unlock(&ctrlr->lock);

	cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
	    NVMEF(NVME_ASYNC_EVENT_INFO, info) |
	    NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));

	nvmft_send_response(ctrlr->admin, &cpl);
}

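/*
 * Record a changed LUN in the Changed Namespace List log page and
 * raise a Namespace Attribute Changed event the first time the list
 * changes after it was last read.
 */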
void
nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
{
	struct nvme_ns_list *nslist;
	uint32_t new_nsid, nsid;
	u_int i;

	new_nsid = lun_id + 1;

	mtx_lock(&ctrlr->lock);
	nslist = ctrlr->changed_ns;

	/* If the first entry is 0xffffffff, the list is already full. */
	if (nslist->ns[0] != 0xffffffff) {
		/* Find the insertion point for this namespace ID. */
		for (i = 0; i < nitems(nslist->ns); i++) {
			nsid = le32toh(nslist->ns[i]);
			if (nsid == new_nsid) {
				/* Already reported, nothing to do. */
				mtx_unlock(&ctrlr->lock);
				return;
			}

			if (nsid == 0 || nsid > new_nsid)
				break;
		}

		if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
			/* List is full. */
			memset(ctrlr->changed_ns, 0,
			    sizeof(*ctrlr->changed_ns));
			ctrlr->changed_ns->ns[0] = 0xffffffff;
		} else if (nslist->ns[i] == htole32(0)) {
			/*
			 * Optimize case where this ID is appended to
			 * the end.
			 */
			nslist->ns[i] = htole32(new_nsid);
		} else {
			memmove(&nslist->ns[i + 1], &nslist->ns[i],
			    (nitems(nslist->ns) - i - 1) *
			    sizeof(nslist->ns[0]));
			nslist->ns[i] = htole32(new_nsid);
		}
	}

	if (ctrlr->changed_ns_reported) {
		mtx_unlock(&ctrlr->lock);
		return;
	}
	ctrlr->changed_ns_reported = true;
	mtx_unlock(&ctrlr->lock);

	nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
	    NVME_LOG_CHANGED_NAMESPACE);
}