1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright (c) Intel Corporation. All rights reserved.
5 *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include "nvme_internal.h"
35
36/*
37 * Host software shall wait a minimum of CAP.TO x 500 milliseconds for CSTS.RDY
38 * to be set to '1' after setting CC.EN to '1' from a previous value of '0'.
39 */
40static inline unsigned int
41nvme_ctrlr_get_ready_to_in_ms(struct nvme_ctrlr *ctrlr)
42{
43	union nvme_cap_register	cap;
44
45/* The CAP.TO unit in milliseconds */
46#define NVME_READY_TIMEOUT_UNIT 500
47
48	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
49
50	return (NVME_READY_TIMEOUT_UNIT * cap.bits.to);
51}
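
/*
 * Worked example (illustrative, not taken from the code above): a controller
 * reporting CAP.TO = 20 requires waiting up to 20 * 500 = 10000 ms
 * (10 seconds) for CSTS.RDY to reflect a transition of CC.EN.
 */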
52
53/*
54 * Create a queue pair.
55 */
56static int nvme_ctrlr_create_qpair(struct nvme_ctrlr *ctrlr,
57				   struct nvme_qpair *qpair)
58{
59	int ret;
60
61	/* Create the completion queue */
62	ret = nvme_admin_create_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
63	if (ret != 0) {
64		nvme_notice("Create completion queue %u failed\n",
65			    qpair->id);
66		return ret;
67	}
68
69	/* Create the submission queue */
70	ret = nvme_admin_create_ioq(ctrlr, qpair, NVME_IO_SUBMISSION_QUEUE);
71	if (ret != 0) {
72		/* Attempt to delete the completion queue */
73		nvme_notice("Create submission queue %u failed\n",
74			    qpair->id);
75		nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
76		return ret;
77	}
78
79	nvme_qpair_reset(qpair);
80
81	return 0;
82}
83
84/*
85 * Delete a queue pair.
86 */
87static int nvme_ctrlr_delete_qpair(struct nvme_ctrlr *ctrlr,
88				   struct nvme_qpair *qpair)
89{
90	int ret;
91
92	/* Delete the submission queue */
93	ret = nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_SUBMISSION_QUEUE);
94	if (ret != 0) {
95		nvme_notice("Delete submission queue %u failed\n",
96			    qpair->id);
97		return ret;
98	}
99
100	/* Delete the completion queue */
101	ret = nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
102	if (ret != 0) {
103		nvme_notice("Delete completion queue %u failed\n",
104			    qpair->id);
105		return ret;
106	}
107
108	return 0;
109}
110
111/*
112 * Build the list of supported Intel vendor-specific log pages.
113 */
114static void
115nvme_ctrlr_construct_intel_support_log_page_list(struct nvme_ctrlr *ctrlr,
116				struct nvme_intel_log_page_dir *log_page_dir)
117{
118
119	if (ctrlr->cdata.vid != NVME_PCI_VID_INTEL ||
120	    log_page_dir == NULL)
121		return;
122
123	ctrlr->log_page_supported[NVME_INTEL_LOG_PAGE_DIR] = true;
124
125	if (log_page_dir->read_latency_log_len ||
126	    (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY))
127		ctrlr->log_page_supported[NVME_INTEL_LOG_READ_CMD_LATENCY] = true;
128
129	if (log_page_dir->write_latency_log_len ||
130	    (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY))
131		ctrlr->log_page_supported[NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true;
132
133	if (log_page_dir->temperature_statistics_log_len)
134		ctrlr->log_page_supported[NVME_INTEL_LOG_TEMPERATURE] = true;
135
136	if (log_page_dir->smart_log_len)
137		ctrlr->log_page_supported[NVME_INTEL_LOG_SMART] = true;
138
139	if (log_page_dir->marketing_description_log_len)
140		ctrlr->log_page_supported[NVME_INTEL_MARKETING_DESCRIPTION] = true;
141}
142
143/*
144 * Read the Intel log page directory and record the supported Intel log pages.
145 */
146static int nvme_ctrlr_set_intel_support_log_pages(struct nvme_ctrlr *ctrlr)
147{
148	struct nvme_intel_log_page_dir *log_page_dir;
149	int ret;
150
151	log_page_dir = nvme_zmalloc(sizeof(struct nvme_intel_log_page_dir), 64);
152	if (!log_page_dir) {
153		nvme_err("Allocate log_page_directory failed\n");
154		return ENOMEM;
155	}
156
157	ret = nvme_admin_get_log_page(ctrlr, NVME_INTEL_LOG_PAGE_DIR,
158				      NVME_GLOBAL_NS_TAG,
159				      log_page_dir,
160				      sizeof(struct nvme_intel_log_page_dir));
161	if (ret != 0)
162		nvme_notice("Get NVME_INTEL_LOG_PAGE_DIR log page failed\n");
163	else
164		nvme_ctrlr_construct_intel_support_log_page_list(ctrlr,
165								 log_page_dir);
166
167	nvme_free(log_page_dir);
168
169	return ret;
170}
171
172/*
173 * Initialize log page support directory.
174 */
175static void nvme_ctrlr_set_supported_log_pages(struct nvme_ctrlr *ctrlr)
176{
177
178	memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported));
179
180	/* Mandatory pages */
181	ctrlr->log_page_supported[NVME_LOG_ERROR] = true;
182	ctrlr->log_page_supported[NVME_LOG_HEALTH_INFORMATION] = true;
183	ctrlr->log_page_supported[NVME_LOG_FIRMWARE_SLOT] = true;
184
185	if (ctrlr->cdata.lpa.celp)
186		ctrlr->log_page_supported[NVME_LOG_COMMAND_EFFECTS_LOG] = true;
187
188	if (ctrlr->cdata.vid == NVME_PCI_VID_INTEL)
189		nvme_ctrlr_set_intel_support_log_pages(ctrlr);
190}
191
192/*
193 * Set Intel device features.
194 */
195static void nvme_ctrlr_set_intel_supported_features(struct nvme_ctrlr *ctrlr)
196{
197	bool *supported_feature = ctrlr->feature_supported;
198
199	supported_feature[NVME_INTEL_FEAT_MAX_LBA] = true;
200	supported_feature[NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true;
201	supported_feature[NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true;
202	supported_feature[NVME_INTEL_FEAT_SMBUS_ADDRESS] = true;
203	supported_feature[NVME_INTEL_FEAT_LED_PATTERN] = true;
204	supported_feature[NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true;
205	supported_feature[NVME_INTEL_FEAT_LATENCY_TRACKING] = true;
206}
207
208/*
209 * Set device features.
210 */
211static void nvme_ctrlr_set_supported_features(struct nvme_ctrlr *ctrlr)
212{
213	bool *supported_feature = ctrlr->feature_supported;
214
215	memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported));
216
217	/* Mandatory features */
218	supported_feature[NVME_FEAT_ARBITRATION] = true;
219	supported_feature[NVME_FEAT_POWER_MANAGEMENT] = true;
220	supported_feature[NVME_FEAT_TEMPERATURE_THRESHOLD] = true;
221	supported_feature[NVME_FEAT_ERROR_RECOVERY] = true;
222	supported_feature[NVME_FEAT_NUMBER_OF_QUEUES] = true;
223	supported_feature[NVME_FEAT_INTERRUPT_COALESCING] = true;
224	supported_feature[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true;
225	supported_feature[NVME_FEAT_WRITE_ATOMICITY] = true;
226	supported_feature[NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true;
227
228	/* Optional features */
229	if (ctrlr->cdata.vwc.present)
230		supported_feature[NVME_FEAT_VOLATILE_WRITE_CACHE] = true;
231	if (ctrlr->cdata.apsta.supported)
232		supported_feature[NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION]
233			= true;
234	if (ctrlr->cdata.hmpre)
235		supported_feature[NVME_FEAT_HOST_MEM_BUFFER] = true;
236	if (ctrlr->cdata.vid == NVME_PCI_VID_INTEL)
237		nvme_ctrlr_set_intel_supported_features(ctrlr);
238}
239
240/*
241 * Initialize I/O queue pairs.
242 */
243static int nvme_ctrlr_init_io_qpairs(struct nvme_ctrlr *ctrlr)
244{
245	struct nvme_qpair *qpair;
246	union nvme_cap_register	cap;
247	uint32_t i;
248
249	if (ctrlr->ioq != NULL)
250		/*
251		 * io_qpairs were already constructed, so just return.
252		 * This typically happens when the controller is
253		 * initialized a second (or subsequent) time after a
254		 * controller reset.
255		 */
256		return 0;
257
258	/*
259	 * NVMe spec sets a hard limit of 64K max entries, but
260	 * devices may specify a smaller limit, so we need to check
261	 * the MQES field in the capabilities register.
262	 */
263	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
264	ctrlr->io_qpairs_max_entries =
265		nvme_min(NVME_IO_ENTRIES, (unsigned int)cap.bits.mqes + 1);
266
267	ctrlr->ioq = calloc(ctrlr->io_queues, sizeof(struct nvme_qpair));
268	if (!ctrlr->ioq)
269		return ENOMEM;
270
271	/* Keep queue pair ID 0 for the admin queue */
272	for (i = 0; i < ctrlr->io_queues; i++) {
273		qpair = &ctrlr->ioq[i];
274		qpair->id = i + 1;
275		TAILQ_INSERT_TAIL(&ctrlr->free_io_qpairs, qpair, tailq);
276	}
277
278	return 0;
279}
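
/*
 * Worked example (illustrative): a controller reporting CAP.MQES = 1023
 * supports I/O queues of up to 1024 entries, so io_qpairs_max_entries
 * becomes min(NVME_IO_ENTRIES, 1024).
 */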
280
281/*
282 * Shutdown a controller.
283 */
284static void nvme_ctrlr_shutdown(struct nvme_ctrlr *ctrlr)
285{
286	union nvme_cc_register	cc;
287	union nvme_csts_register csts;
288	int ms_waited = 0;
289
290	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
291	cc.bits.shn = NVME_SHN_NORMAL;
292	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
293
294	csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
295	/*
296	 * The NVMe spec does not define a timeout period for shutdown
297	 * notification, so we just pick 5 seconds as a reasonable amount
298	 * of time to wait before proceeding.
299	 */
300#define NVME_CTRLR_SHUTDOWN_TIMEOUT 5000
301	while (csts.bits.shst != NVME_SHST_COMPLETE) {
302		nvme_usleep(1000);
303		csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
304		if (ms_waited++ >= NVME_CTRLR_SHUTDOWN_TIMEOUT)
305			break;
306	}
307
308	if (csts.bits.shst != NVME_SHST_COMPLETE)
309		nvme_err("Controller did not shut down within %d seconds\n",
310			 NVME_CTRLR_SHUTDOWN_TIMEOUT / 1000);
311}
312
313/*
314 * Enable a controller.
315 */
316static int nvme_ctrlr_enable(struct nvme_ctrlr *ctrlr)
317{
318	union nvme_cc_register	cc;
319	union nvme_aqa_register	aqa;
320	union nvme_cap_register	cap;
321
322	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
323
324	if (cc.bits.en != 0) {
325		nvme_err("Controller enable called with CC.EN = 1\n");
326		return EINVAL;
327	}
328
329	nvme_reg_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
330	nvme_reg_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
331
332	aqa.raw = 0;
333	/* acqs and asqs are 0-based. */
334	aqa.bits.acqs = ctrlr->adminq.entries - 1;
335	aqa.bits.asqs = ctrlr->adminq.entries - 1;
336	nvme_reg_mmio_write_4(ctrlr, aqa.raw, aqa.raw);
337
338	cc.bits.en = 1;
339	cc.bits.css = 0;
340	cc.bits.shn = 0;
341	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
342	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
343
344	/* Page size is 2 ^ (12 + mps). */
345	cc.bits.mps = PAGE_SHIFT - 12;
346
347	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
348
349	switch (ctrlr->opts.arb_mechanism) {
350	case NVME_CC_AMS_RR:
351		break;
352	case NVME_CC_AMS_WRR:
353		if (NVME_CAP_AMS_WRR & cap.bits.ams)
354			break;
355		return EINVAL;
356	case NVME_CC_AMS_VS:
357		if (NVME_CAP_AMS_VS & cap.bits.ams)
358			break;
359		return EINVAL;
360	default:
361		return EINVAL;
362	}
363
364	cc.bits.ams = ctrlr->opts.arb_mechanism;
365
366	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
367
368	return 0;
369}
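
/*
 * Worked example (illustrative, assuming the standard NVMe 1.x CC field
 * layout and PAGE_SHIFT == 12): with round robin arbitration the value
 * written above is EN = 1, CSS = 0 (NVM command set), MPS = 0 (4 KiB pages),
 * AMS = 0, IOSQES = 6 (64 B entries) and IOCQES = 4 (16 B entries),
 * i.e. CC = 0x00460001.
 */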
370
371/*
372 * Disable a controller.
373 */
374static inline void nvme_ctrlr_disable(struct nvme_ctrlr *ctrlr)
375{
376	union nvme_cc_register cc;
377
378	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
379	cc.bits.en = 0;
380
381	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
382}
383
384/*
385 * Test if a controller is enabled.
386 */
387static inline int nvme_ctrlr_enabled(struct nvme_ctrlr *ctrlr)
388{
389	union nvme_cc_register cc;
390
391	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
392
393	return cc.bits.en;
394}
395
396/*
397 * Test if a controller is ready.
398 */
399static inline int nvme_ctrlr_ready(struct nvme_ctrlr *ctrlr)
400{
401	union nvme_csts_register csts;
402
403	csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
404
405	return csts.bits.rdy;
406}
407
408/*
409 * Set a controller state.
410 */
411static void nvme_ctrlr_set_state(struct nvme_ctrlr *ctrlr,
412				 enum nvme_ctrlr_state state,
413				 uint64_t timeout_in_ms)
414{
415	ctrlr->state = state;
416	if (timeout_in_ms == NVME_TIMEOUT_INFINITE)
417		ctrlr->state_timeout_ms = NVME_TIMEOUT_INFINITE;
418	else
419		ctrlr->state_timeout_ms = nvme_time_msec() + timeout_in_ms;
420}
421
422/*
423 * Get the controller identify data.
424 */
425static int nvme_ctrlr_identify(struct nvme_ctrlr *ctrlr)
426{
427	int ret;
428
429	ret = nvme_admin_identify_ctrlr(ctrlr, &ctrlr->cdata);
430	if (ret != 0) {
431		nvme_notice("Identify controller failed\n");
432		return ret;
433	}
434
435	/*
436	 * Use MDTS to ensure our default max_xfer_size doesn't
437	 * exceed what the controller supports.
438	 */
439	if (ctrlr->cdata.mdts > 0)
440		ctrlr->max_xfer_size = nvme_min(ctrlr->max_xfer_size,
441						ctrlr->min_page_size
442						* (1 << (ctrlr->cdata.mdts)));
443	return 0;
444}
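
/*
 * Worked example (illustrative): with min_page_size == 4096 (MPSMIN == 0)
 * and MDTS == 5, the controller limits transfers to 4096 * 2^5 = 128 KiB,
 * so max_xfer_size is clamped to 128 KiB if its default is larger.
 */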
445
446/*
447 * Get the maximum number of I/O queue pairs.
448 */
449static int nvme_ctrlr_get_max_io_qpairs(struct nvme_ctrlr *ctrlr)
450{
451	unsigned int cdw0, cq_allocated, sq_allocated;
452	int ret;
453
454	ret = nvme_admin_get_feature(ctrlr, NVME_FEAT_CURRENT,
455				     NVME_FEAT_NUMBER_OF_QUEUES,
456				     0, &cdw0);
457	if (ret != 0) {
458		nvme_notice("Get feature NVME_FEAT_NUMBER_OF_QUEUES failed\n");
459		return ret;
460	}
461
462	/*
463	 * Data in cdw0 is 0-based.
464	 * Lower 16-bits indicate number of submission queues allocated.
465	 * Upper 16-bits indicate number of completion queues allocated.
466	 */
467	sq_allocated = (cdw0 & 0xFFFF) + 1;
468	cq_allocated = (cdw0 >> 16) + 1;
469
470	ctrlr->max_io_queues = nvme_min(sq_allocated, cq_allocated);
471
472	return 0;
473}
474
475/*
476 * Set the number of I/O queue pairs.
477 */
478static int nvme_ctrlr_set_num_qpairs(struct nvme_ctrlr *ctrlr)
479{
480	unsigned int num_queues, cdw0;
481	unsigned int cq_allocated, sq_allocated;
482	int ret;
483
484	ret = nvme_ctrlr_get_max_io_qpairs(ctrlr);
485	if (ret != 0) {
486		nvme_notice("Failed to get the maximum number of I/O qpairs\n");
487		return ret;
488	}
489
490	/*
491	 * Format the number of I/O queues:
492	 * subtract 1 as the value is 0-based;
493	 * bits 31:16 represent the number of completion queues,
494	 * bits 15:0 represent the number of submission queues.
495	 */
496	num_queues = ((ctrlr->opts.io_queues - 1) << 16) |
497		(ctrlr->opts.io_queues - 1);
498
499	/*
500	 * Set the number of I/O queues.
501	 * Note: The value allocated may be smaller or larger than the number
502	 * of queues requested (see specifications).
503	 */
504	ret = nvme_admin_set_feature(ctrlr, false, NVME_FEAT_NUMBER_OF_QUEUES,
505				     num_queues, 0, &cdw0);
506	if (ret != 0) {
507		nvme_notice("Set feature NVME_FEAT_NUMBER_OF_QUEUES failed\n");
508		return ret;
509	}
510
511	/*
512	 * Data in cdw0 is 0-based.
513	 * Lower 16-bits indicate number of submission queues allocated.
514	 * Upper 16-bits indicate number of completion queues allocated.
515	 */
516	sq_allocated = (cdw0 & 0xFFFF) + 1;
517	cq_allocated = (cdw0 >> 16) + 1;
518	ctrlr->io_queues = nvme_min(sq_allocated, cq_allocated);
519
520	/*
521	 * Make sure the number of constructed qpairs listed in free_io_qpairs
522	 * does not exceed the number requested.
523	 */
524	ctrlr->io_queues = nvme_min(ctrlr->io_queues, ctrlr->opts.io_queues);
525
526	return 0;
527}
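
/*
 * Worked example (illustrative): requesting opts.io_queues == 8 gives
 * num_queues == 0x00070007. If the controller answers with
 * cdw0 == 0x001F0003, then sq_allocated == 4 and cq_allocated == 32, so
 * io_queues becomes min(4, 32) == 4, already below the requested 8.
 */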
528
529static void nvme_ctrlr_destruct_namespaces(struct nvme_ctrlr *ctrlr)
530{
531
532	if (ctrlr->ns) {
533		free(ctrlr->ns);
534		ctrlr->ns = NULL;
535		ctrlr->nr_ns = 0;
536	}
537
538	if (ctrlr->nsdata) {
539		nvme_free(ctrlr->nsdata);
540		ctrlr->nsdata = NULL;
541	}
542}
543
544static int nvme_ctrlr_construct_namespaces(struct nvme_ctrlr *ctrlr)
545{
546	unsigned int i, nr_ns = ctrlr->cdata.nn;
547	struct nvme_ns *ns = NULL;
548
549	/*
550	 * ctrlr->nr_ns may be 0 (startup) or a different number of
551	 * namespaces (reset), so check if we need to reallocate.
552	 */
553	if (nr_ns != ctrlr->nr_ns) {
554
555		nvme_ctrlr_destruct_namespaces(ctrlr);
556
557		ctrlr->ns = calloc(nr_ns, sizeof(struct nvme_ns));
558		if (!ctrlr->ns)
559			goto fail;
560
561		nvme_debug("Allocate %u namespace data\n", nr_ns);
562		ctrlr->nsdata = nvme_calloc(nr_ns, sizeof(struct nvme_ns_data),
563					    PAGE_SIZE);
564		if (!ctrlr->nsdata)
565			goto fail;
566
567		ctrlr->nr_ns = nr_ns;
568
569	}
570
571	for (i = 0; i < nr_ns; i++) {
572		ns = &ctrlr->ns[i];
573		if (nvme_ns_construct(ctrlr, ns, i + 1) != 0)
574			goto fail;
575	}
576
577	return 0;
578
579fail:
580	nvme_ctrlr_destruct_namespaces(ctrlr);
581
582	return -1;
583}
584
585/*
586 * Forward declaration.
587 */
588static int nvme_ctrlr_construct_and_submit_aer(struct nvme_ctrlr *ctrlr,
589				struct nvme_async_event_request *aer);
590
591/*
592 * Async event completion callback.
593 */
594static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_cpl *cpl)
595{
596	struct nvme_async_event_request	*aer = arg;
597	struct nvme_ctrlr *ctrlr = aer->ctrlr;
598
599	if (cpl->status.sc == NVME_SC_ABORTED_SQ_DELETION)
600		/*
601		 *  This is simulated when the controller is being shut down, to
602		 *  effectively abort outstanding asynchronous event requests
603		 *  and make sure all memory is freed. Do not repost the
604		 *  request in this case.
605		 */
606		return;
607
608	if (ctrlr->aer_cb_fn != NULL)
609		ctrlr->aer_cb_fn(ctrlr->aer_cb_arg, cpl);
610
611	/*
612	 * Repost another asynchronous event request to replace
613	 * the one that just completed.
614	 */
615	if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer))
616		/*
617		 * We can't do anything to recover from a failure here,
618		 * so just print a warning message and leave the
619		 * AER unsubmitted.
620		 */
621		nvme_err("Initialize AER failed\n");
622}
623
624/*
625 * Issue an async event request.
626 */
627static int nvme_ctrlr_construct_and_submit_aer(struct nvme_ctrlr *ctrlr,
628					       struct nvme_async_event_request *aer)
629{
630	struct nvme_request *req;
631
632	req = nvme_request_allocate_null(&ctrlr->adminq,
633					 nvme_ctrlr_async_event_cb, aer);
634	if (req == NULL)
635		return -1;
636
637	aer->ctrlr = ctrlr;
638	aer->req = req;
639	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
640
641	return nvme_qpair_submit_request(&ctrlr->adminq, req);
642}
643
644/*
645 * Configure async event management.
646 */
647static int nvme_ctrlr_configure_aer(struct nvme_ctrlr *ctrlr)
648{
649	union nvme_critical_warning_state state;
650	struct nvme_async_event_request	*aer;
651	unsigned int i;
652	int ret;
653
654	state.raw = 0xFF;
655	state.bits.reserved = 0;
656
657	ret =  nvme_admin_set_feature(ctrlr, false,
658				      NVME_FEAT_ASYNC_EVENT_CONFIGURATION,
659				      state.raw, 0, NULL);
660	if (ret != 0) {
661		nvme_notice("Set feature ASYNC_EVENT_CONFIGURATION failed\n");
662		return ret;
663	}
664
665	/* aerl is a zero-based value, so we need to add 1 here. */
666	ctrlr->num_aers = nvme_min(NVME_MAX_ASYNC_EVENTS,
667				   (ctrlr->cdata.aerl + 1));
668
669	for (i = 0; i < ctrlr->num_aers; i++) {
670		aer = &ctrlr->aer[i];
671		if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) {
672			nvme_notice("Construct AER failed\n");
673			return -1;
674		}
675	}
676
677	return 0;
678}
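
/*
 * Worked example (illustrative): a controller reporting cdata.aerl == 3
 * allows 4 outstanding asynchronous event requests, so num_aers becomes
 * min(NVME_MAX_ASYNC_EVENTS, 4) and that many AERs are posted above.
 */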
679
680/*
681 * Start a controller.
682 */
683static int nvme_ctrlr_start(struct nvme_ctrlr *ctrlr)
684{
685
686	nvme_qpair_reset(&ctrlr->adminq);
687	nvme_qpair_enable(&ctrlr->adminq);
688
689	if (nvme_ctrlr_identify(ctrlr) != 0)
690		return -1;
691
692	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
693		return -1;
694
695	if (nvme_ctrlr_init_io_qpairs(ctrlr))
696		return -1;
697
698	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
699		return -1;
700
701	if (nvme_ctrlr_configure_aer(ctrlr) != 0)
702		nvme_warning("controller does not support AER!\n");
703
704	nvme_ctrlr_set_supported_log_pages(ctrlr);
705	nvme_ctrlr_set_supported_features(ctrlr);
706
707	if (ctrlr->cdata.sgls.supported)
708		ctrlr->flags |= NVME_CTRLR_SGL_SUPPORTED;
709
710	return 0;
711}
712
713/*
714 * Memory map the controller memory buffer (CMB).
715 */
716static void nvme_ctrlr_map_cmb(struct nvme_ctrlr *ctrlr)
717{
718	int ret;
719	void *addr;
720	uint32_t bir;
721	union nvme_cmbsz_register cmbsz;
722	union nvme_cmbloc_register cmbloc;
723	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
724
725	cmbsz.raw = nvme_reg_mmio_read_4(ctrlr, cmbsz.raw);
726	cmbloc.raw = nvme_reg_mmio_read_4(ctrlr, cmbloc.raw);
727	if (!cmbsz.bits.sz)
728		goto out;
729
730	/* Valid BIR values are 0 and 2 to 5 */
731	bir = cmbloc.bits.bir;
732	if (bir > 5 || bir == 1)
733		goto out;
734
735	/* Unit size: 4KB, 64KB, 1MB, 16MB, 256MB, 4GB or 64GB, per CMBSZ.SZU */
736	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
737
738	/* controller memory buffer size in Bytes */
739	size = unit_size * cmbsz.bits.sz;
740
741	/* controller memory buffer offset from BAR in Bytes */
742	offset = unit_size * cmbloc.bits.ofst;
743
744	nvme_pcicfg_get_bar_addr_len(ctrlr->pci_dev, bir, &bar_phys_addr,
745				     &bar_size);
746
747	if (offset > bar_size)
748		goto out;
749
750	if (size > bar_size - offset)
751		goto out;
752
753	ret = nvme_pcicfg_map_bar_write_combine(ctrlr->pci_dev, bir, &addr);
754	if ((ret != 0) || addr == NULL)
755		goto out;
756
757	ctrlr->cmb_bar_virt_addr = addr;
758	ctrlr->cmb_bar_phys_addr = bar_phys_addr;
759	ctrlr->cmb_size = size;
760	ctrlr->cmb_current_offset = offset;
761
762	if (!cmbsz.bits.sqs)
763		ctrlr->opts.use_cmb_sqs = false;
764
765	return;
766
767out:
768	ctrlr->cmb_bar_virt_addr = NULL;
769	ctrlr->opts.use_cmb_sqs = false;
770
771	return;
772}
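
/*
 * Worked example (illustrative): CMBSZ.SZU == 2 selects 1 MiB units
 * (1 << (12 + 4 * 2)), so CMBSZ.SZ == 512 describes a 512 MiB buffer,
 * and CMBLOC.OFST == 16 places it 16 MiB into the BAR selected by
 * CMBLOC.BIR.
 */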
773
774/*
775 * Unmap the controller memory buffer (CMB).
776 */
777static int nvme_ctrlr_unmap_cmb(struct nvme_ctrlr *ctrlr)
778{
779	union nvme_cmbloc_register cmbloc;
780	void *addr = ctrlr->cmb_bar_virt_addr;
781	int ret = 0;
782
783	if (addr) {
784		cmbloc.raw = nvme_reg_mmio_read_4(ctrlr, cmbloc.raw);
785		ret = nvme_pcicfg_unmap_bar(ctrlr->pci_dev, cmbloc.bits.bir,
786					    addr);
787	}
788	return ret;
789}
790
791/*
792 * Map the controller PCI bars.
793 */
794static int nvme_ctrlr_map_bars(struct nvme_ctrlr *ctrlr)
795{
796	void *addr;
797	int ret;
798
799	ret = nvme_pcicfg_map_bar(ctrlr->pci_dev, 0, 0, &addr);
800	if (ret != 0 || addr == NULL) {
801		nvme_err("Map PCI device bar failed %d (%s)\n",
802			 ret, strerror(ret));
803		return ret;
804	}
805
806	nvme_debug("Controller BAR mapped at %p\n", addr);
807
808	ctrlr->regs = (volatile struct nvme_registers *)addr;
809	nvme_ctrlr_map_cmb(ctrlr);
810
811	return 0;
812}
813
814/*
815 * Unmap the controller PCI bars.
816 */
817static int nvme_ctrlr_unmap_bars(struct nvme_ctrlr *ctrlr)
818{
819	void *addr = (void *)ctrlr->regs;
820	int ret;
821
822	ret = nvme_ctrlr_unmap_cmb(ctrlr);
823	if (ret != 0) {
824		nvme_err("Unmap controller memory buffer failed %d\n", ret);
825		return ret;
826	}
827
828	if (addr) {
829		ret = nvme_pcicfg_unmap_bar(ctrlr->pci_dev, 0, addr);
830		if (ret != 0) {
831			nvme_err("Unmap PCI device bar failed %d\n", ret);
832			return ret;
833		}
834	}
835
836	return 0;
837}
838
839/*
840 * Set a controller in the failed state.
841 */
842static void nvme_ctrlr_fail(struct nvme_ctrlr *ctrlr)
843{
844	unsigned int i;
845
846	ctrlr->failed = true;
847
848	nvme_qpair_fail(&ctrlr->adminq);
849	if (ctrlr->ioq)
850		for (i = 0; i < ctrlr->io_queues; i++)
851			nvme_qpair_fail(&ctrlr->ioq[i]);
852}
853
854/*
855 * This function will be called repeatedly during initialization
856 * until the controller is ready.
857 */
858static int nvme_ctrlr_init(struct nvme_ctrlr *ctrlr)
859{
860	unsigned int ready_timeout_in_ms = nvme_ctrlr_get_ready_to_in_ms(ctrlr);
861	int ret;
862
863	/*
864	 * Check if the current initialization step is done or has timed out.
865	 */
866	switch (ctrlr->state) {
867
868	case NVME_CTRLR_STATE_INIT:
869
870		/* Begin the hardware initialization by making
871		 * sure the controller is disabled. */
872		if (nvme_ctrlr_enabled(ctrlr)) {
873			/*
874			 * Disable the controller to cause a reset.
875			 */
876			if (!nvme_ctrlr_ready(ctrlr)) {
877				/* Wait for the controller to be ready */
878				nvme_ctrlr_set_state(ctrlr,
879				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,
880				      ready_timeout_in_ms);
881				return 0;
882			}
883
884			/*
885			 * The controller is enabled and ready.
886			 * It can be disabled immediately.
887			 */
888			nvme_ctrlr_disable(ctrlr);
889			nvme_ctrlr_set_state(ctrlr,
890				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
891				      ready_timeout_in_ms);
892
893			if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
894				nvme_msleep(2000);
895
896			return 0;
897		}
898
899		if (nvme_ctrlr_ready(ctrlr)) {
900			/*
901			 * Controller is in the process of shutting down.
902			 * We need to wait for CSTS.RDY to become 0.
903			 */
904			nvme_ctrlr_set_state(ctrlr,
905				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
906				      ready_timeout_in_ms);
907			return 0;
908		}
909
910		/*
911		 * Controller is currently disabled.
912		 * We can jump straight to enabling it.
913		 */
914		ret = nvme_ctrlr_enable(ctrlr);
915		if (ret)
916			nvme_err("Enable controller failed\n");
917		else
918			nvme_ctrlr_set_state(ctrlr,
919				       NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
920				       ready_timeout_in_ms);
921		return ret;
922
923	case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
924
925		if (nvme_ctrlr_ready(ctrlr)) {
926			/* CC.EN = 1 && CSTS.RDY = 1,
927			 * so we can disable the controller now. */
928			nvme_ctrlr_disable(ctrlr);
929			nvme_ctrlr_set_state(ctrlr,
930				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
931				      ready_timeout_in_ms);
932			return 0;
933		}
934
935		break;
936
937	case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
938
939		if (!nvme_ctrlr_ready(ctrlr)) {
940			/* CC.EN = 0 && CSTS.RDY = 0,
941			 * so we can enable the controller now. */
942			ret = nvme_ctrlr_enable(ctrlr);
943			if (ret)
944				nvme_err("Enable controller failed\n");
945			else
946				nvme_ctrlr_set_state(ctrlr,
947				       NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
948				       ready_timeout_in_ms);
949			return ret;
950		}
951		break;
952
953	case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
954
955		if (nvme_ctrlr_ready(ctrlr)) {
956			if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_RDY)
957				nvme_msleep(2000);
958
959			ret = nvme_ctrlr_start(ctrlr);
960			if (ret)
961				nvme_err("Start controller failed\n");
962			else
963				nvme_ctrlr_set_state(ctrlr,
964						     NVME_CTRLR_STATE_READY,
965						     NVME_TIMEOUT_INFINITE);
966			return ret;
967		}
968		break;
969
970	default:
971		nvme_panic("Unhandled ctrlr state %d\n", ctrlr->state);
972		nvme_ctrlr_fail(ctrlr);
973		return -1;
974	}
975
976	if ((ctrlr->state_timeout_ms != NVME_TIMEOUT_INFINITE) &&
977	    (nvme_time_msec() > ctrlr->state_timeout_ms)) {
978		nvme_err("Initialization timed out in state %d\n",
979			 ctrlr->state);
980		nvme_ctrlr_fail(ctrlr);
981		return -1;
982	}
983
984	return 0;
985}
986
987/*
988 * Reset a controller.
989 */
990static int nvme_ctrlr_reset(struct nvme_ctrlr *ctrlr)
991{
992	struct nvme_qpair *qpair;
993	unsigned int i;
994
995	if (ctrlr->resetting || ctrlr->failed)
996		/*
997		 * Controller is already resetting or has failed. Return
998		 * immediately since there is no need to kick off another
999		 * reset in these cases.
1000		 */
1001		return 0;
1002
1003	ctrlr->resetting = true;
1004
1005	/* Disable all queues before disabling the controller hardware. */
1006	nvme_qpair_disable(&ctrlr->adminq);
1007	for (i = 0; i < ctrlr->io_queues; i++)
1008		nvme_qpair_disable(&ctrlr->ioq[i]);
1009
1010	/* Set the state back to INIT to cause a full hardware reset. */
1011	nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT,
1012			     NVME_TIMEOUT_INFINITE);
1013
1014	while (ctrlr->state != NVME_CTRLR_STATE_READY) {
1015		if (nvme_ctrlr_init(ctrlr) != 0) {
1016			nvme_crit("Controller reset failed\n");
1017			nvme_ctrlr_fail(ctrlr);
1018			goto out;
1019		}
1020	}
1021
1022	/* Reinitialize qpairs */
1023	TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
1024		if (nvme_ctrlr_create_qpair(ctrlr, qpair) != 0)
1025			nvme_ctrlr_fail(ctrlr);
1026	}
1027
1028out:
1029	ctrlr->resetting = false;
1030
1031	return ctrlr->failed ? -1 : 0;
1032}
1033
1034/*
1035 * Set the controller options.
1036 */
1037static void nvme_ctrlr_set_opts(struct nvme_ctrlr *ctrlr,
1038				struct nvme_ctrlr_opts *opts)
1039{
1040	if (opts)
1041		memcpy(&ctrlr->opts, opts, sizeof(struct nvme_ctrlr_opts));
1042	else
1043		memset(&ctrlr->opts, 0, sizeof(struct nvme_ctrlr_opts));
1044
1045	if (ctrlr->opts.io_queues == 0)
1046		ctrlr->opts.io_queues = DEFAULT_MAX_IO_QUEUES;
1047
1048	if (ctrlr->opts.io_queues > NVME_MAX_IO_QUEUES) {
1049		nvme_info("Limiting requested I/O queues %u to %d\n",
1050			  ctrlr->opts.io_queues, NVME_MAX_IO_QUEUES);
1051		ctrlr->opts.io_queues = NVME_MAX_IO_QUEUES;
1052	}
1053}
1054
1055/*
1056 * Attach a PCI controller.
1057 */
1058struct nvme_ctrlr *
1059nvme_ctrlr_attach(struct pci_device *pci_dev,
1060		  struct nvme_ctrlr_opts *opts)
1061{
1062	struct nvme_ctrlr *ctrlr;
1063	union nvme_cap_register	cap;
1064	uint32_t cmd_reg;
1065	int ret;
1066
1067	/* Get a new controller handle */
1068	ctrlr = malloc(sizeof(struct nvme_ctrlr));
1069	if (!ctrlr) {
1070		nvme_err("Allocate controller handle failed\n");
1071		return NULL;
1072	}
1073
1074	nvme_debug("New controller handle %p\n", ctrlr);
1075
1076	/* Initialize the handle */
1077	memset(ctrlr, 0, sizeof(struct nvme_ctrlr));
1078	ctrlr->pci_dev = pci_dev;
1079	ctrlr->resetting = false;
1080	ctrlr->failed = false;
1081	TAILQ_INIT(&ctrlr->free_io_qpairs);
1082	TAILQ_INIT(&ctrlr->active_io_qpairs);
1083	pthread_mutex_init(&ctrlr->lock, NULL);
1084	ctrlr->quirks = nvme_ctrlr_get_quirks(pci_dev);
1085
1086	nvme_ctrlr_set_state(ctrlr,
1087			     NVME_CTRLR_STATE_INIT,
1088			     NVME_TIMEOUT_INFINITE);
1089
1090	ret = nvme_ctrlr_map_bars(ctrlr);
1091	if (ret != 0) {
1092		nvme_err("Map controller BAR failed\n");
1093		pthread_mutex_destroy(&ctrlr->lock);
1094		free(ctrlr);
1095		return NULL;
1096	}
1097
1098	/* Enable PCI busmaster and disable INTx */
1099	nvme_pcicfg_read32(pci_dev, &cmd_reg, 4);
1100	cmd_reg |= 0x0404;
1101	nvme_pcicfg_write32(pci_dev, cmd_reg, 4);
1102
1103	/*
1104	 * The doorbell stride is 2 ^ (2 + DSTRD) bytes. Doorbells are addressed
1105	 * as 32-bit words, so drop the + 2 to get the stride in words.
1106	 */
1107	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
1108	ctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
1109	ctrlr->min_page_size = 1 << (12 + cap.bits.mpsmin);
1110
1111	/* Set default transfer size */
1112	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
1113
1114	/* Create the admin queue pair */
1115	ret = nvme_qpair_construct(ctrlr, &ctrlr->adminq, 0,
1116				   NVME_ADMIN_ENTRIES, NVME_ADMIN_TRACKERS);
1117	if (ret != 0) {
1118		nvme_err("Initialize admin queue pair failed\n");
1119		goto err;
1120	}
1121
1122	/* Set options and then initialize */
1123	nvme_ctrlr_set_opts(ctrlr, opts);
1124	do {
1125		ret = nvme_ctrlr_init(ctrlr);
1126		if (ret)
1127			goto err;
1128	} while (ctrlr->state != NVME_CTRLR_STATE_READY);
1129
1130	return ctrlr;
1131
1132err:
1133	nvme_ctrlr_detach(ctrlr);
1134
1135	return NULL;
1136}
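
/*
 * Usage sketch (illustrative only; how pci_dev is obtained from the
 * library's PCI enumeration is outside the scope of this file and is
 * assumed here):
 *
 *	struct nvme_ctrlr_opts opts = { .io_queues = 4 };
 *	struct nvme_ctrlr *ctrlr;
 *
 *	ctrlr = nvme_ctrlr_attach(pci_dev, &opts);
 *	if (!ctrlr)
 *		return -1;
 *	...
 *	nvme_ctrlr_detach(ctrlr);
 *
 * Passing a NULL opts pointer selects the defaults applied by
 * nvme_ctrlr_set_opts().
 */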
1137
1138/*
1139 * Detach a PCI controller.
1140 */
1141void nvme_ctrlr_detach(struct nvme_ctrlr *ctrlr)
1142{
1143	struct nvme_qpair *qpair;
1144	uint32_t i;
1145
1146	while (!TAILQ_EMPTY(&ctrlr->active_io_qpairs)) {
1147		qpair = TAILQ_FIRST(&ctrlr->active_io_qpairs);
1148		nvme_ioqp_release(qpair);
1149	}
1150
1151	nvme_ctrlr_shutdown(ctrlr);
1152
1153	nvme_ctrlr_destruct_namespaces(ctrlr);
1154	if (ctrlr->ioq) {
1155		for (i = 0; i < ctrlr->io_queues; i++)
1156			nvme_qpair_destroy(&ctrlr->ioq[i]);
1157		free(ctrlr->ioq);
1158	}
1159
1160	nvme_qpair_destroy(&ctrlr->adminq);
1161
1162	nvme_ctrlr_unmap_bars(ctrlr);
1163
1164	pthread_mutex_destroy(&ctrlr->lock);
1165	free(ctrlr);
1166}
1167
1168/*
1169 * Get a controller feature.
1170 */
1171int nvme_ctrlr_get_feature(struct nvme_ctrlr *ctrlr,
1172			   enum nvme_feat_sel sel, enum nvme_feat feature,
1173			   uint32_t cdw11,
1174			   uint32_t *attributes)
1175{
1176	int ret;
1177
1178	pthread_mutex_lock(&ctrlr->lock);
1179
1180	ret = nvme_admin_get_feature(ctrlr, sel, feature, cdw11, attributes);
1181	if (ret != 0)
1182		nvme_notice("Get feature 0x%08x failed\n",
1183			    (unsigned int) feature);
1184
1185	pthread_mutex_unlock(&ctrlr->lock);
1186
1187	return ret;
1188}
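
/*
 * Usage sketch (illustrative): reading the current temperature threshold.
 * The interpretation of cdw0 (threshold in bits 15:0, in Kelvin) follows
 * the NVMe specification and is an assumption here, not something this
 * file defines.
 *
 *	uint32_t cdw0;
 *
 *	ret = nvme_ctrlr_get_feature(ctrlr, NVME_FEAT_CURRENT,
 *				     NVME_FEAT_TEMPERATURE_THRESHOLD,
 *				     0, &cdw0);
 *	if (ret == 0)
 *		threshold_kelvin = cdw0 & 0xFFFF;
 */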
1189
1190/*
1191 * Set a controller feature.
1192 */
1193int nvme_ctrlr_set_feature(struct nvme_ctrlr *ctrlr,
1194			   bool save, enum nvme_feat feature,
1195			   uint32_t cdw11, uint32_t cdw12,
1196			   uint32_t *attributes)
1197{
1198	int ret;
1199
1200	pthread_mutex_lock(&ctrlr->lock);
1201
1202	ret = nvme_admin_set_feature(ctrlr, save, feature,
1203				     cdw11, cdw12, attributes);
1204	if (ret != 0)
1205		nvme_notice("Set feature 0x%08x failed\n",
1206			    (unsigned int) feature);
1207
1208	pthread_mutex_unlock(&ctrlr->lock);
1209
1210	return ret;
1211}
1212
1213/*
1214 * Attach a namespace.
1215 */
1216int nvme_ctrlr_attach_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1217			 struct nvme_ctrlr_list *clist)
1218{
1219	int ret;
1220
1221	pthread_mutex_lock(&ctrlr->lock);
1222
1223	ret = nvme_admin_attach_ns(ctrlr, nsid, clist);
1224	if (ret) {
1225		nvme_notice("Attach namespace %u failed\n", nsid);
1226		goto out;
1227	}
1228
1229	ret = nvme_ctrlr_reset(ctrlr);
1230	if (ret != 0)
1231		nvme_notice("Reset controller failed\n");
1232
1233out:
1234	pthread_mutex_unlock(&ctrlr->lock);
1235
1236	return ret;
1237}
1238
1239/*
1240 * Detach a namespace.
1241 */
1242int nvme_ctrlr_detach_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1243			 struct nvme_ctrlr_list *clist)
1244{
1245	int ret;
1246
1247	pthread_mutex_lock(&ctrlr->lock);
1248
1249	ret = nvme_admin_detach_ns(ctrlr, nsid, clist);
1250	if (ret != 0) {
1251		nvme_notice("Detach namespace %u failed\n", nsid);
1252		goto out;
1253	}
1254
1255	ret = nvme_ctrlr_reset(ctrlr);
1256	if (ret)
1257		nvme_notice("Reset controller failed\n");
1258
1259out:
1260	pthread_mutex_unlock(&ctrlr->lock);
1261
1262	return ret;
1263}
1264
1265/*
1266 * Create a namespace.
1267 */
1268unsigned int nvme_ctrlr_create_ns(struct nvme_ctrlr *ctrlr,
1269				  struct nvme_ns_data *nsdata)
1270{
1271	unsigned int nsid;
1272	int ret;
1273
1274	pthread_mutex_lock(&ctrlr->lock);
1275
1276	ret = nvme_admin_create_ns(ctrlr, nsdata, &nsid);
1277	if (ret != 0) {
1278		nvme_notice("Create namespace failed\n");
1279		nsid = 0;
1280	}
1281
1282	pthread_mutex_unlock(&ctrlr->lock);
1283
1284	return nsid;
1285}
1286
1287/*
1288 * Delete a namespace.
1289 */
1290int nvme_ctrlr_delete_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid)
1291{
1292	int ret;
1293
1294	pthread_mutex_lock(&ctrlr->lock);
1295
1296	ret = nvme_admin_delete_ns(ctrlr, nsid);
1297	if (ret != 0) {
1298		nvme_notice("Delete namespace %u failed\n", nsid);
1299		goto out;
1300	}
1301
1302	ret = nvme_ctrlr_reset(ctrlr);
1303	if (ret)
1304		nvme_notice("Reset controller failed\n");
1305
1306out:
1307	pthread_mutex_unlock(&ctrlr->lock);
1308
1309	return ret;
1310}
1311
1312/*
1313 * Format NVM media.
1314 */
1315int nvme_ctrlr_format_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1316			 struct nvme_format *format)
1317{
1318	int ret;
1319
1320	pthread_mutex_lock(&ctrlr->lock);
1321
1322	ret = nvme_admin_format_nvm(ctrlr, nsid, format);
1323	if (ret != 0) {
1324		if (nsid == NVME_GLOBAL_NS_TAG)
1325			nvme_notice("Format device failed\n");
1326		else
1327			nvme_notice("Format namespace %u failed\n", nsid);
1328		goto out;
1329	}
1330
1331	ret = nvme_ctrlr_reset(ctrlr);
1332	if (ret)
1333		nvme_notice("Reset controller failed\n");
1334
1335out:
1336	pthread_mutex_unlock(&ctrlr->lock);
1337
1338	return ret;
1339}
1340
1341/*
1342 * Update a device firmware.
1343 */
1344int nvme_ctrlr_update_firmware(struct nvme_ctrlr *ctrlr,
1345			       void *fw, size_t size, int slot)
1346{
1347	struct nvme_fw_commit fw_commit;
1348	unsigned int size_remaining = size, offset = 0, transfer;
1349	void *f = fw;
1350	int ret;
1351
1352	if (size & 0x3) {
1353		nvme_err("Invalid firmware size\n");
1354		return EINVAL;
1355	}
1356
1357	pthread_mutex_lock(&ctrlr->lock);
1358
1359	/* Download firmware */
1360	while (size_remaining > 0) {
1361
1362		transfer = nvme_min(size_remaining, ctrlr->min_page_size);
1363
1364		ret = nvme_admin_fw_image_dl(ctrlr, f, transfer, offset);
1365		if (ret != 0) {
1366			nvme_err("Download FW (%u B at %u) failed\n",
1367				 transfer, offset);
1368			goto out;
1369		}
1370
1371		f += transfer;
1372		offset += transfer;
1373		size_remaining -= transfer;
1374
1375	}
1376
1377	/* Commit firmware */
1378	memset(&fw_commit, 0, sizeof(struct nvme_fw_commit));
1379	fw_commit.fs = slot;
1380	fw_commit.ca = NVME_FW_COMMIT_REPLACE_IMG;
1381
1382	ret = nvme_admin_fw_commit(ctrlr, &fw_commit);
1383	if (ret != 0) {
1384		nvme_err("Commit downloaded FW (%zu B) failed\n",
1385			 size);
1386		goto out;
1387	}
1388
1389	ret = nvme_ctrlr_reset(ctrlr);
1390	if (ret)
1391		nvme_notice("Reset controller failed\n");
1392
1393out:
1394	pthread_mutex_unlock(&ctrlr->lock);
1395
1396	return ret;
1397}
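
/*
 * Usage sketch (illustrative): fw_buf must hold the complete image and its
 * size must be a multiple of 4 bytes; passing slot 0 is assumed here to let
 * the controller pick the firmware slot, per the NVMe Firmware Commit
 * definition.
 *
 *	ret = nvme_ctrlr_update_firmware(ctrlr, fw_buf, fw_size, 0);
 *	if (ret)
 *		nvme_err("Firmware update failed\n");
 */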
1398
1399/*
1400 * Get an unused I/O queue pair.
1401 */
1402struct nvme_qpair *nvme_ioqp_get(struct nvme_ctrlr *ctrlr,
1403				 enum nvme_qprio qprio, unsigned int qd)
1404{
1405	struct nvme_qpair *qpair = NULL;
1406	union nvme_cc_register cc;
1407	uint32_t trackers;
1408	int ret;
1409
1410	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
1411
1412	/* Only the low 2 bits (values 0, 1, 2, 3) of QPRIO are valid. */
1413	if ((qprio & 3) != qprio)
1414		return NULL;
1415
1416	/*
1417	 * Only value NVME_QPRIO_URGENT(0) is valid for the
1418	 * default round robin arbitration method.
1419	 */
1420	if ((cc.bits.ams == NVME_CC_AMS_RR) && (qprio != NVME_QPRIO_URGENT)) {
1421		nvme_err("Invalid queue priority for default round "
1422			 "robin arbitration method\n");
1423		return NULL;
1424	}
1425
1426	/* The number of entries of an I/O qpair must be in [2, io_qpairs_max_entries] */
1427	if (qd == 1) {
1428		nvme_err("Invalid queue depth\n");
1429		return NULL;
1430	}
1431
1432	if (qd == 0 || qd > ctrlr->io_qpairs_max_entries)
1433		qd = ctrlr->io_qpairs_max_entries;
1434
1435	/*
1436	 * No need to have more trackers than entries in the submit queue.
1437	 * Note also that for a queue size of N, we can only have (N-1)
1438	 * commands outstanding, hence the "-1" here.
1439	 */
1440	trackers = nvme_min(NVME_IO_TRACKERS, (qd - 1));
1441
1442	pthread_mutex_lock(&ctrlr->lock);
1443
1444	/* Get the first available qpair structure */
1445	qpair = TAILQ_FIRST(&ctrlr->free_io_qpairs);
1446	if (qpair == NULL) {
1447		/* No free queue IDs */
1448		nvme_err("No free I/O queue pairs\n");
1449		goto out;
1450	}
1451
1452	/* Construct the qpair */
1453	ret = nvme_qpair_construct(ctrlr, qpair, qprio, qd, trackers);
1454	if (ret != 0) {
1455		nvme_qpair_destroy(qpair);
1456		qpair = NULL;
1457		goto out;
1458	}
1459
1460	/*
1461	 * At this point, qpair contains a preallocated submission
1462	 * and completion queue and a unique queue ID, but it is not
1463	 * yet created on the controller.
1464	 * Fill out the submission queue priority and send out the
1465	 * Create I/O Queue commands.
1466	 */
1467	if (nvme_ctrlr_create_qpair(ctrlr, qpair) != 0) {
1468		nvme_err("Create queue pair on the controller failed\n");
1469		nvme_qpair_destroy(qpair);
1470		qpair = NULL;
1471		goto out;
1472	}
1473
1474	TAILQ_REMOVE(&ctrlr->free_io_qpairs, qpair, tailq);
1475	TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq);
1476
1477out:
1478	pthread_mutex_unlock(&ctrlr->lock);
1479
1480	return qpair;
1481}
1482
1483/*
1484 * Free an I/O queue pair.
1485 */
1486int nvme_ioqp_release(struct nvme_qpair *qpair)
1487{
1488	struct nvme_ctrlr *ctrlr;
1489	int ret;
1490
1491	if (qpair == NULL)
1492		return 0;
1493
1494	ctrlr = qpair->ctrlr;
1495
1496	pthread_mutex_lock(&ctrlr->lock);
1497
1498	/* Delete the I/O submission and completion queues */
1499	ret = nvme_ctrlr_delete_qpair(ctrlr, qpair);
1500	if (ret != 0) {
1501		nvme_notice("Delete queue pair %u failed\n", qpair->id);
1502	} else {
1503		TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq);
1504		TAILQ_INSERT_HEAD(&ctrlr->free_io_qpairs, qpair, tailq);
1505	}
1506
1507	pthread_mutex_unlock(&ctrlr->lock);
1508
1509	return ret;
1510}
1511
1512/*
1513 * Submit an NVMe command using the specified I/O queue pair.
1514 */
1515int nvme_ioqp_submit_cmd(struct nvme_qpair *qpair,
1516			 struct nvme_cmd *cmd,
1517			 void *buf, size_t len,
1518			 nvme_cmd_cb cb_fn, void *cb_arg)
1519{
1520	struct nvme_request *req;
1521	int ret = ENOMEM;
1522
1523	req = nvme_request_allocate_contig(qpair, buf, len, cb_fn, cb_arg);
1524	if (req) {
1525		memcpy(&req->cmd, cmd, sizeof(req->cmd));
1526		ret = nvme_qpair_submit_request(qpair, req);
1527	}
1528
1529	return ret;
1530}
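
/*
 * Usage sketch (illustrative; io_done_cb is a hypothetical nvme_cmd_cb
 * callback, the command fields to fill depend on the opcode used, and
 * completion processing is handled by the qpair code, not shown here):
 *
 *	struct nvme_qpair *qpair;
 *	struct nvme_cmd cmd;
 *
 *	qpair = nvme_ioqp_get(ctrlr, NVME_QPRIO_URGENT, 0);
 *	if (!qpair)
 *		return -1;
 *
 *	memset(&cmd, 0, sizeof(cmd));
 *	cmd.opc = ...;
 *	ret = nvme_ioqp_submit_cmd(qpair, &cmd, buf, len, io_done_cb, NULL);
 *	...
 *	nvme_ioqp_release(qpair);
 */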
1531