/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "nvme_internal.h"

struct nvme_qpair_string {
	uint16_t	value;
	const char	*str;
};
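
/*
 * Opcode and status code name tables. Each table ends with a 0xFFFF
 * sentinel entry whose string is used as the fallback name when a value
 * is not found (see nvme_qpair_get_string()).
 */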

static const struct nvme_qpair_string admin_opcode[] = {
	{ NVME_OPC_DELETE_IO_SQ,	"DELETE IO SQ" },
	{ NVME_OPC_CREATE_IO_SQ,	"CREATE IO SQ" },
	{ NVME_OPC_GET_LOG_PAGE,	"GET LOG PAGE" },
	{ NVME_OPC_DELETE_IO_CQ,	"DELETE IO CQ" },
	{ NVME_OPC_CREATE_IO_CQ,	"CREATE IO CQ" },
	{ NVME_OPC_IDENTIFY,		"IDENTIFY" },
	{ NVME_OPC_ABORT,		"ABORT" },
	{ NVME_OPC_SET_FEATURES,	"SET FEATURES" },
	{ NVME_OPC_GET_FEATURES,	"GET FEATURES" },
	{ NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
	{ NVME_OPC_NS_MANAGEMENT,	"NAMESPACE MANAGEMENT" },
	{ NVME_OPC_FIRMWARE_COMMIT,	"FIRMWARE COMMIT" },
	{ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
	{ NVME_OPC_NS_ATTACHMENT,	"NAMESPACE ATTACHMENT" },
	{ NVME_OPC_FORMAT_NVM,		"FORMAT NVM" },
	{ NVME_OPC_SECURITY_SEND,	"SECURITY SEND" },
	{ NVME_OPC_SECURITY_RECEIVE,	"SECURITY RECEIVE" },
	{ 0xFFFF,			"ADMIN COMMAND" }
};

static const struct nvme_qpair_string io_opcode[] = {
	{ NVME_OPC_FLUSH,		"FLUSH" },
	{ NVME_OPC_WRITE,		"WRITE" },
	{ NVME_OPC_READ,		"READ" },
	{ NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
	{ NVME_OPC_COMPARE,		"COMPARE" },
	{ NVME_OPC_WRITE_ZEROES,	"WRITE ZEROES" },
	{ NVME_OPC_DATASET_MANAGEMENT,	"DATASET MANAGEMENT" },
	{ NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
	{ NVME_OPC_RESERVATION_REPORT,	"RESERVATION REPORT" },
	{ NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
	{ NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
	{ 0xFFFF,			"IO COMMAND" }
};

static const struct nvme_qpair_string generic_status[] = {
	{ NVME_SC_SUCCESS,			"SUCCESS" },
	{ NVME_SC_INVALID_OPCODE,		"INVALID OPCODE" },
	{ NVME_SC_INVALID_FIELD,		"INVALID FIELD" },
	{ NVME_SC_COMMAND_ID_CONFLICT,		"COMMAND ID CONFLICT" },
	{ NVME_SC_DATA_TRANSFER_ERROR,		"DATA TRANSFER ERROR" },
	{ NVME_SC_ABORTED_POWER_LOSS,		"ABORTED - POWER LOSS" },
	{ NVME_SC_INTERNAL_DEVICE_ERROR,	"INTERNAL DEVICE ERROR" },
	{ NVME_SC_ABORTED_BY_REQUEST,		"ABORTED - BY REQUEST" },
	{ NVME_SC_ABORTED_SQ_DELETION,		"ABORTED - SQ DELETION" },
	{ NVME_SC_ABORTED_FAILED_FUSED,		"ABORTED - FAILED FUSED" },
	{ NVME_SC_ABORTED_MISSING_FUSED,	"ABORTED - MISSING FUSED" },
	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT,	"INVALID NAMESPACE OR FORMAT" },
	{ NVME_SC_COMMAND_SEQUENCE_ERROR,	"COMMAND SEQUENCE ERROR" },
	{ NVME_SC_INVALID_SGL_SEG_DESCRIPTOR,	"INVALID SGL SEGMENT DESCRIPTOR" },
	{ NVME_SC_INVALID_NUM_SGL_DESCIRPTORS,	"INVALID NUMBER OF SGL DESCRIPTORS" },
	{ NVME_SC_DATA_SGL_LENGTH_INVALID,	"DATA SGL LENGTH INVALID" },
	{ NVME_SC_METADATA_SGL_LENGTH_INVALID,	"METADATA SGL LENGTH INVALID" },
	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID,	"SGL DESCRIPTOR TYPE INVALID" },
	{ NVME_SC_INVALID_CONTROLLER_MEM_BUF,	"INVALID CONTROLLER MEMORY BUFFER" },
	{ NVME_SC_INVALID_PRP_OFFSET,		"INVALID PRP OFFSET" },
	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED,	"ATOMIC WRITE UNIT EXCEEDED" },
	{ NVME_SC_LBA_OUT_OF_RANGE,		"LBA OUT OF RANGE" },
	{ NVME_SC_CAPACITY_EXCEEDED,		"CAPACITY EXCEEDED" },
	{ NVME_SC_NAMESPACE_NOT_READY,		"NAMESPACE NOT READY" },
	{ NVME_SC_RESERVATION_CONFLICT,		"RESERVATION CONFLICT" },
	{ NVME_SC_FORMAT_IN_PROGRESS,		"FORMAT IN PROGRESS" },
	{ 0xFFFF,				"GENERIC" }
};

static const struct nvme_qpair_string command_specific_status[] = {
	{ NVME_SC_COMPLETION_QUEUE_INVALID,	"INVALID COMPLETION QUEUE" },
	{ NVME_SC_INVALID_QUEUE_IDENTIFIER,	"INVALID QUEUE IDENTIFIER" },
	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED,	"MAX QUEUE SIZE EXCEEDED" },
	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED,	"ABORT CMD LIMIT EXCEEDED" },
	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
	{ NVME_SC_INVALID_FIRMWARE_SLOT,	"INVALID FIRMWARE SLOT" },
	{ NVME_SC_INVALID_FIRMWARE_IMAGE,	"INVALID FIRMWARE IMAGE" },
	{ NVME_SC_INVALID_INTERRUPT_VECTOR,	"INVALID INTERRUPT VECTOR" },
	{ NVME_SC_INVALID_LOG_PAGE,		"INVALID LOG PAGE" },
	{ NVME_SC_INVALID_FORMAT,		"INVALID FORMAT" },
	{ NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" },
	{ NVME_SC_INVALID_QUEUE_DELETION,	"INVALID QUEUE DELETION" },
	{ NVME_SC_FEATURE_ID_NOT_SAVEABLE,	"FEATURE ID NOT SAVEABLE" },
	{ NVME_SC_FEATURE_NOT_CHANGEABLE,	"FEATURE NOT CHANGEABLE" },
	{ NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
	{ NVME_SC_FIRMWARE_REQ_NVM_RESET,	"FIRMWARE REQUIRES NVM RESET" },
	{ NVME_SC_FIRMWARE_REQ_RESET,		"FIRMWARE REQUIRES RESET" },
	{ NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" },
	{ NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
	{ NVME_SC_OVERLAPPING_RANGE,		"OVERLAPPING RANGE" },
	{ NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
	{ NVME_SC_NAMESPACE_ID_UNAVAILABLE,	"NAMESPACE ID UNAVAILABLE" },
	{ NVME_SC_NAMESPACE_ALREADY_ATTACHED,	"NAMESPACE ALREADY ATTACHED" },
	{ NVME_SC_NAMESPACE_IS_PRIVATE,		"NAMESPACE IS PRIVATE" },
	{ NVME_SC_NAMESPACE_NOT_ATTACHED,	"NAMESPACE NOT ATTACHED" },
	{ NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" },
	{ NVME_SC_CONTROLLER_LIST_INVALID,	"CONTROLLER LIST INVALID" },
	{ NVME_SC_CONFLICTING_ATTRIBUTES,	"CONFLICTING ATTRIBUTES" },
	{ NVME_SC_INVALID_PROTECTION_INFO,	"INVALID PROTECTION INFO" },
	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE,	"WRITE TO RO PAGE" },
	{ 0xFFFF,				"COMMAND SPECIFIC" }
};

static const struct nvme_qpair_string media_error_status[] = {
	{ NVME_SC_WRITE_FAULTS,			"WRITE FAULTS" },
	{ NVME_SC_UNRECOVERED_READ_ERROR,	"UNRECOVERED READ ERROR" },
	{ NVME_SC_GUARD_CHECK_ERROR,		"GUARD CHECK ERROR" },
	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR,	"APPLICATION TAG CHECK ERROR" },
	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR,	"REFERENCE TAG CHECK ERROR" },
	{ NVME_SC_COMPARE_FAILURE,		"COMPARE FAILURE" },
	{ NVME_SC_ACCESS_DENIED,		"ACCESS DENIED" },
	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
	{ 0xFFFF,				"MEDIA ERROR" }
};

static inline bool nvme_qpair_is_admin_queue(struct nvme_qpair *qpair)
{
	return qpair->id == 0;
}

static inline bool nvme_qpair_is_io_queue(struct nvme_qpair *qpair)
{
	return qpair->id != 0;
}

static const char *nvme_qpair_get_string(const struct nvme_qpair_string *strings,
					 uint16_t value)
{
	const struct nvme_qpair_string *entry;

	entry = strings;

	while (entry->value != 0xFFFF) {
		if (entry->value == value)
			return entry->str;
		entry++;
	}
	return entry->str;
}

static void nvme_qpair_admin_qpair_print_command(struct nvme_qpair *qpair,
						 struct nvme_cmd *cmd)
{
	nvme_info("%s (%02x) sqid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x\n",
		  nvme_qpair_get_string(admin_opcode, cmd->opc), cmd->opc,
		  qpair->id, cmd->cid,
		  cmd->nsid, cmd->cdw10, cmd->cdw11);
}

static void nvme_qpair_io_qpair_print_command(struct nvme_qpair *qpair,
					      struct nvme_cmd *cmd)
{
	nvme_assert(qpair != NULL, "print_command: qpair == NULL\n");
	nvme_assert(cmd != NULL, "print_command: cmd == NULL\n");

	switch ((int)cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
		nvme_info("%s sqid:%d cid:%d nsid:%d lba:%llu len:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  qpair->id, cmd->cid, cmd->nsid,
			  ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
			  (cmd->cdw12 & 0xFFFF) + 1);
		break;
	case NVME_OPC_FLUSH:
	case NVME_OPC_DATASET_MANAGEMENT:
		nvme_info("%s sqid:%d cid:%d nsid:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  qpair->id, cmd->cid, cmd->nsid);
		break;
	default:
		nvme_info("%s (%02x) sqid:%d cid:%d nsid:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  cmd->opc, qpair->id, cmd->cid, cmd->nsid);
		break;
	}
}

static void nvme_qpair_print_command(struct nvme_qpair *qpair,
				     struct nvme_cmd *cmd)
{
	nvme_assert(qpair != NULL, "qpair cannot be NULL\n");
	nvme_assert(cmd != NULL, "cmd cannot be NULL\n");

	if (nvme_qpair_is_admin_queue(qpair))
		return nvme_qpair_admin_qpair_print_command(qpair, cmd);

	return nvme_qpair_io_qpair_print_command(qpair, cmd);
}

static const char *get_status_string(uint16_t sct, uint16_t sc)
{
	const struct nvme_qpair_string *entry;

	switch (sct) {
	case NVME_SCT_GENERIC:
		entry = generic_status;
		break;
	case NVME_SCT_COMMAND_SPECIFIC:
		entry = command_specific_status;
		break;
	case NVME_SCT_MEDIA_ERROR:
		entry = media_error_status;
		break;
	case NVME_SCT_VENDOR_SPECIFIC:
		return "VENDOR SPECIFIC";
	default:
		return "RESERVED";
	}

	return nvme_qpair_get_string(entry, sc);
}

static void nvme_qpair_print_completion(struct nvme_qpair *qpair,
					struct nvme_cpl *cpl)
{
	nvme_info("Cpl: %s (%02x/%02x) sqid:%d cid:%d "
		  "cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
		  get_status_string(cpl->status.sct, cpl->status.sc),
		  cpl->status.sct,
		  cpl->status.sc,
		  cpl->sqid,
		  cpl->cid,
		  cpl->cdw0,
		  cpl->sqhd,
		  cpl->status.p,
		  cpl->status.m,
		  cpl->status.dnr);
}

static bool nvme_qpair_completion_retry(const struct nvme_cpl *cpl)
{
	/*
	 * TODO: spec is not clear how commands that are aborted due
	 * to TLER will be marked.  So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit.
	 */
	switch ((int)cpl->status.sct) {
	case NVME_SCT_GENERIC:
		switch ((int)cpl->status.sc) {
		case NVME_SC_NAMESPACE_NOT_READY:
		case NVME_SC_FORMAT_IN_PROGRESS:
			if (cpl->status.dnr)
				return false;
			return true;
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return false;
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return false;
	}
}

static void nvme_qpair_construct_tracker(struct nvme_tracker *tr,
					 uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->active = false;
}

static inline void nvme_qpair_copy_command(struct nvme_cmd *dst,
					   const struct nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
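	/*
	 * An NVMe submission queue entry is 64 bytes, so two 32-byte AVX
	 * stores or four 16-byte SSE2 stores cover the whole command when
	 * those instruction sets are available; otherwise a plain structure
	 * assignment is used.
	 */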
#if defined(__AVX__)
	__m256i *d256 = (__m256i *)dst;
	const __m256i *s256 = (const __m256i *)src;

	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
#elif defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}

static void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
				      struct nvme_tracker *tr)
{
	struct nvme_request *req = tr->req;

	/*
	 * Set the tracker active and copy its command
	 * to the submission queue.
	 */
	nvme_debug("qpair %d: Submit command, tail %d, cid %d / %d\n",
		   qpair->id,
		   (int)qpair->sq_tail,
		   (int)tr->cid,
		   (int)tr->req->cmd.cid);

	qpair->tr[tr->cid].active = true;
	nvme_qpair_copy_command(&qpair->cmd[qpair->sq_tail], &req->cmd);

	if (++qpair->sq_tail == qpair->entries)
		qpair->sq_tail = 0;

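	/*
	 * Make sure the command is visible in host memory before ringing
	 * the submission queue tail doorbell.
	 */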
	nvme_wmb();
	nvme_mmio_write_4(qpair->sq_tdbl, qpair->sq_tail);
}

static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair,
					struct nvme_tracker *tr,
					struct nvme_cpl *cpl,
					bool print_on_error)
{
	struct nvme_request *req = tr->req;
	bool retry, error;

	if (!req) {
		nvme_crit("tracker has no request\n");
		qpair->tr[cpl->cid].active = false;
		goto done;
	}

	error = nvme_cpl_is_error(cpl);
	retry = error && nvme_qpair_completion_retry(cpl) &&
		(req->retries < NVME_MAX_RETRY_COUNT);
	if (error && print_on_error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->tr[cpl->cid].active = false;

	if (cpl->cid != req->cmd.cid)
		nvme_crit("cpl and command CID mismatch (%d / %d)\n",
			  (int)cpl->cid, (int)req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
		return;
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, cpl);

	nvme_request_free_locked(req);

done:
	tr->req = NULL;

	LIST_REMOVE(tr, list);
	LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
}

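/*
 * Resubmit requests that were queued while no tracker was available or
 * while the qpair was disabled. The requests are first moved to a local
 * list so that the qpair lock can be dropped around each
 * nvme_qpair_submit_request() call; anything that could not be submitted
 * is spliced back onto the qpair's queue.
 */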
static void nvme_qpair_submit_queued_requests(struct nvme_qpair *qpair)
{
	STAILQ_HEAD(, nvme_request) req_queue;
	STAILQ_INIT(&req_queue);

	pthread_mutex_lock(&qpair->lock);

	STAILQ_CONCAT(&req_queue, &qpair->queued_req);

	/*
	 * If the controller is in the middle of a reset, don't
	 * try to submit queued requests - let the reset logic
	 * handle that instead.
	 */
	while (!qpair->ctrlr->resetting && LIST_FIRST(&qpair->free_tr)
			&& !STAILQ_EMPTY(&req_queue)) {
		struct nvme_request *req = STAILQ_FIRST(&req_queue);
		STAILQ_REMOVE_HEAD(&req_queue, stailq);

		pthread_mutex_unlock(&qpair->lock);
		nvme_qpair_submit_request(qpair, req);
		pthread_mutex_lock(&qpair->lock);
	}

	STAILQ_CONCAT(&qpair->queued_req, &req_queue);

	pthread_mutex_unlock(&qpair->lock);
}

static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
					       struct nvme_tracker *tr,
					       uint32_t sct,
					       uint32_t sc,
					       uint32_t dnr,
					       bool print_on_error)
{
	struct nvme_cpl	cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;

	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}

static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
					       struct nvme_request *req,
					       uint32_t sct, uint32_t sc,
					       bool print_on_error)
{
	struct nvme_cpl	cpl;
	bool error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status.sct = sct;
	cpl.status.sc = sc;

	error = nvme_cpl_is_error(&cpl);

	if (error && print_on_error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_request_free_locked(req);
}

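/*
 * Asynchronous event requests do not complete until an event occurs, so
 * any still outstanding must be completed manually when the admin queue
 * is disabled or destroyed.
 */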
static void nvme_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	tr = LIST_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		nvme_assert(tr->req != NULL,
			    "tr->req == NULL in abort_aers\n");
		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_qpair_manual_complete_tracker(qpair, tr,
					      NVME_SCT_GENERIC,
					      NVME_SC_ABORTED_SQ_DELETION,
					      0, false);
			tr = LIST_FIRST(&qpair->outstanding_tr);
			continue;
		}
		tr = LIST_NEXT(tr, list);
	}
}

static inline void _nvme_qpair_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	nvme_qpair_abort_aers(qpair);
}

static inline void _nvme_qpair_req_bad_phys(struct nvme_qpair *qpair,
					    struct nvme_tracker *tr)
{
	/*
	 * Bad vtophys translation, so abort this request
	 * and return immediately, without retry.
	 */
	nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
					   NVME_SC_INVALID_FIELD,
					   1, true);
}

/*
 * Build PRP list describing physically contiguous payload buffer.
 */
static int _nvme_qpair_build_contig_request(struct nvme_qpair *qpair,
					    struct nvme_request *req,
					    struct nvme_tracker *tr)
{
	uint64_t phys_addr;
	void *seg_addr;
	uint32_t nseg, cur_nseg, modulo, unaligned;
	void *md_payload;
	void *payload = req->payload.u.contig + req->payload_offset;

	phys_addr = nvme_mem_vtophys(payload);
	if (phys_addr == NVME_VTOPHYS_ERROR) {
		_nvme_qpair_req_bad_phys(qpair, tr);
		return -1;
	}
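	/*
	 * Count the number of PRP entries needed: one for each page touched
	 * by the buffer. For example, with 4 KB pages, an 8 KB buffer that
	 * starts 512 bytes into a page touches three pages and so needs
	 * three entries.
	 */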
	nseg = req->payload_size >> PAGE_SHIFT;
	modulo = req->payload_size & (PAGE_SIZE - 1);
	unaligned = phys_addr & (PAGE_SIZE - 1);
	if (modulo || unaligned)
		nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = nvme_mem_vtophys(md_payload);
		if (tr->req->cmd.mptr == NVME_VTOPHYS_ERROR) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}
	}

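	/*
	 * PRP1 always points at the first (possibly unaligned) page of the
	 * payload. If the transfer touches exactly two pages, PRP2 points
	 * directly at the second page; if it touches more, PRP2 points at
	 * the PRP list embedded in the tracker.
	 */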
	tr->req->cmd.psdt = NVME_PSDT_PRP;
	tr->req->cmd.dptr.prp.prp1 = phys_addr;
	if (nseg == 2) {
		seg_addr = payload + PAGE_SIZE - unaligned;
		tr->req->cmd.dptr.prp.prp2 = nvme_mem_vtophys(seg_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
		while (cur_nseg < nseg) {
			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
			phys_addr = nvme_mem_vtophys(seg_addr);
			if (phys_addr == NVME_VTOPHYS_ERROR) {
				_nvme_qpair_req_bad_phys(qpair, tr);
				return -1;
			}
			tr->u.prp[cur_nseg - 1] = phys_addr;
			cur_nseg++;
		}
	}

	return 0;
}

/*
 * Build SGL list describing scattered payload buffer.
 */
static int _nvme_qpair_build_hw_sgl_request(struct nvme_qpair *qpair,
					    struct nvme_request *req,
					    struct nvme_tracker *tr)
{
	struct nvme_sgl_descriptor *sgl;
	uint64_t phys_addr;
	uint32_t remaining_transfer_len, length, nseg = 0;
	int ret;

	/*
	 * Build scattered payloads.
	 */
	nvme_assert(req->payload_size != 0,
		    "cannot build SGL for zero-length transfer\n");
	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
		    "sgl payload type required\n");
	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
		    "sgl reset callback required\n");
	nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
		    "sgl callback required\n");
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
					req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = NVME_PSDT_SGL_MPTR_SGL;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {

		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}

		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
						     &phys_addr, &length);
		if (ret != 0) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return ret;
		}

		length = nvme_min(remaining_transfer_len, length);
		remaining_transfer_len -= length;

		sgl->unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;

	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single Scatter
		 * Gather List descriptor. Use the special case described
		 * by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all,
		 * so copy the first (and only) SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/*
		 * Only a single SGL segment is supported for now: SGL1 is a
		 * Last Segment descriptor pointing at the descriptor list
		 * held in the tracker.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length =
			nseg * sizeof(struct nvme_sgl_descriptor);
	}

	return 0;
}

/*
 * Build Physical Region Page list describing scattered payload buffer.
 */
static int _nvme_qpair_build_prps_sgl_request(struct nvme_qpair *qpair,
					      struct nvme_request *req,
					      struct nvme_tracker *tr)
{
	uint64_t phys_addr, prp2 = 0;
	uint32_t data_transferred, remaining_transfer_len, length;
	uint32_t nseg, cur_nseg, total_nseg = 0, last_nseg = 0;
	uint32_t modulo, unaligned, sge_count = 0;
	int ret;

	/*
	 * Build scattered payloads.
	 */
	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
		    "sgl payload type required\n");
	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
		    "sgl reset callback required\n");
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
					req->payload_offset);

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {

		nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
			    "sgl callback required\n");

		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
						    &phys_addr, &length);
		if (ret != 0) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}

		nvme_assert((phys_addr & 0x3) == 0, "address must be dword aligned\n");
		nvme_assert((length >= remaining_transfer_len) || ((phys_addr + length) % PAGE_SIZE) == 0,
			"All SGEs except last must end on a page boundary\n");
		nvme_assert((sge_count == 0) || (phys_addr % PAGE_SIZE) == 0,
			"All SGEs except first must start on a page boundary\n");

		data_transferred = nvme_min(remaining_transfer_len, length);

		nseg = data_transferred >> PAGE_SHIFT;
		modulo = data_transferred & (PAGE_SIZE - 1);
		unaligned = phys_addr & (PAGE_SIZE - 1);
		if (modulo || unaligned)
			nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);

		if (total_nseg == 0) {
			req->cmd.psdt = NVME_PSDT_PRP;
			req->cmd.dptr.prp.prp1 = phys_addr;
		}

		total_nseg += nseg;
		sge_count++;
		remaining_transfer_len -= data_transferred;

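		/*
		 * While only two pages have been seen, PRP2 can point
		 * directly at the second page; its value is saved in prp2.
		 * Once a third page shows up, PRP2 is repointed at the PRP
		 * list in the tracker and the saved prp2 value becomes the
		 * first list entry.
		 */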
		if (total_nseg == 2) {
			if (sge_count == 1)
				tr->req->cmd.dptr.prp.prp2 = phys_addr +
					PAGE_SIZE - unaligned;
			else if (sge_count == 2)
				tr->req->cmd.dptr.prp.prp2 = phys_addr;
			/* save prp2 value */
			prp2 = tr->req->cmd.dptr.prp.prp2;
		} else if (total_nseg > 2) {
			if (sge_count == 1)
				cur_nseg = 1;
			else
				cur_nseg = 0;

			tr->req->cmd.dptr.prp.prp2 =
				(uint64_t)tr->prp_sgl_bus_addr;

			while (cur_nseg < nseg) {
				if (prp2) {
					tr->u.prp[0] = prp2;
					tr->u.prp[last_nseg + 1] = phys_addr +
						cur_nseg * PAGE_SIZE - unaligned;
				} else {
					tr->u.prp[last_nseg] = phys_addr +
						cur_nseg * PAGE_SIZE - unaligned;
				}
				last_nseg++;
				cur_nseg++;
			}
		}
	}

	return 0;
}

static void _nvme_qpair_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	/*
	 * Manually abort each outstanding admin command. Do not retry
	 * admin commands found here: they are left over from a controller
	 * reset and the context in which they were issued most likely no
	 * longer applies.
	 */
	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tr_temp) {
		nvme_info("Aborting outstanding admin command\n");
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   1 /* do not retry */, true);
	}

	qpair->enabled = true;
}

static void _nvme_qpair_io_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *temp;
	struct nvme_request *req;

	qpair->enabled = true;

	qpair->ctrlr->enabled_io_qpairs++;

	/* Manually abort each queued I/O. */
	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_info("Aborting queued I/O command\n");
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   true);
	}

	/* Manually abort each outstanding I/O. */
	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, temp) {
		nvme_info("Aborting outstanding I/O command\n");
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   0, true);
	}
}

static inline void _nvme_qpair_admin_qpair_disable(struct nvme_qpair *qpair)
{
	qpair->enabled = false;
	nvme_qpair_abort_aers(qpair);
}

static inline void _nvme_qpair_io_qpair_disable(struct nvme_qpair *qpair)
{
	qpair->enabled = false;

	qpair->ctrlr->enabled_io_qpairs--;
}

/*
 * Reserve room for the submission queue
 * in the controller memory buffer.
 */
static int nvme_ctrlr_reserve_sq_in_cmb(struct nvme_ctrlr *ctrlr,
					uint16_t entries,
					uint64_t aligned, uint64_t *offset)
{
	uint64_t round_offset;
	const uint64_t length = entries * sizeof(struct nvme_cmd);

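	/*
	 * Round the current CMB offset up to the requested alignment
	 * ((x + (a - 1)) & ~(a - 1), with 'aligned' a power of two) and
	 * check that the queue still fits in the controller memory buffer.
	 */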
	round_offset = ctrlr->cmb_current_offset;
	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);

	if (round_offset + length > ctrlr->cmb_size)
		return -1;

	*offset = round_offset;
	ctrlr->cmb_current_offset = round_offset + length;

	return 0;
}

/*
 * Initialize a queue pair on the host side.
 */
int nvme_qpair_construct(struct nvme_ctrlr *ctrlr, struct nvme_qpair *qpair,
			 enum nvme_qprio qprio,
			 uint16_t entries, uint16_t trackers)
{
	volatile uint32_t *doorbell_base;
	struct nvme_tracker *tr;
	uint64_t offset;
	unsigned long phys_addr = 0;
	uint16_t i;
	int ret;

	nvme_assert(entries != 0, "Invalid number of entries\n");
	nvme_assert(trackers != 0, "Invalid trackers\n");

	pthread_mutex_init(&qpair->lock, NULL);

	qpair->entries = entries;
	qpair->trackers = trackers;
	qpair->qprio = qprio;
	qpair->sq_in_cmb = false;
	qpair->ctrlr = ctrlr;

	if (ctrlr->opts.use_cmb_sqs) {
		/*
		 * Reserve room for the submission queue in ctrlr
		 * memory buffer.
		 */
		ret = nvme_ctrlr_reserve_sq_in_cmb(ctrlr, entries,
						   PAGE_SIZE,
						   &offset);
		if (ret == 0) {

			qpair->cmd = ctrlr->cmb_bar_virt_addr + offset;
			qpair->cmd_bus_addr = ctrlr->cmb_bar_phys_addr + offset;
			qpair->sq_in_cmb = true;

			nvme_debug("Allocated qpair %d cmd in cmb at %p / 0x%llx\n",
				   qpair->id,
				   qpair->cmd, qpair->cmd_bus_addr);

		}
	}

	if (qpair->sq_in_cmb == false) {

		qpair->cmd =
			nvme_mem_alloc_node(sizeof(struct nvme_cmd) * entries,
				    PAGE_SIZE, NVME_NODE_ID_ANY,
				    (unsigned long *) &qpair->cmd_bus_addr);
		if (!qpair->cmd) {
			nvme_err("Allocate qpair commands failed\n");
			goto fail;
		}
		memset(qpair->cmd, 0, sizeof(struct nvme_cmd) * entries);

		nvme_debug("Allocated qpair %d cmd %p / 0x%llx\n",
			   qpair->id,
			   qpair->cmd, qpair->cmd_bus_addr);
	}

	qpair->cpl = nvme_mem_alloc_node(sizeof(struct nvme_cpl) * entries,
				 PAGE_SIZE, NVME_NODE_ID_ANY,
				 (unsigned long *) &qpair->cpl_bus_addr);
	if (!qpair->cpl) {
		nvme_err("Allocate qpair completions failed\n");
		goto fail;
	}
	memset(qpair->cpl, 0, sizeof(struct nvme_cpl) * entries);

	nvme_debug("Allocated qpair %d cpl at %p / 0x%llx\n",
		   qpair->id,
		   qpair->cpl,
		   qpair->cpl_bus_addr);

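	/*
	 * The submission and completion queue doorbells of queue pair i
	 * live at (2 * i) and (2 * i + 1) doorbell strides past the first
	 * doorbell register.
	 */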
	doorbell_base = &ctrlr->regs->doorbell[0].sq_tdbl;
	qpair->sq_tdbl = doorbell_base +
		(2 * qpair->id + 0) * ctrlr->doorbell_stride_u32;
	qpair->cq_hdbl = doorbell_base +
		(2 * qpair->id + 1) * ctrlr->doorbell_stride_u32;

	LIST_INIT(&qpair->free_tr);
	LIST_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->free_req);
	STAILQ_INIT(&qpair->queued_req);

	/* Request pool */
	if (nvme_request_pool_construct(qpair)) {
		nvme_err("Create request pool failed\n");
		goto fail;
	}

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already
	 * a power of 2. This ensures the PRP list embedded in the nvme_tracker
	 * object will not span a 4KB boundary, while allowing access to
	 * trackers in tr[] via normal array indexing.
	 */
	qpair->tr = nvme_mem_alloc_node(sizeof(struct nvme_tracker) * trackers,
					sizeof(struct nvme_tracker),
					NVME_NODE_ID_ANY, &phys_addr);
	if (!qpair->tr) {
		nvme_err("Allocate request trackers failed\n");
		goto fail;
	}
	memset(qpair->tr, 0, sizeof(struct nvme_tracker) * trackers);

	nvme_debug("Allocated qpair %d trackers at %p / 0x%lx\n",
		   qpair->id, qpair->tr, phys_addr);

	for (i = 0; i < trackers; i++) {
		tr = &qpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, phys_addr);
		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
		phys_addr += sizeof(struct nvme_tracker);
	}

	nvme_qpair_reset(qpair);

	return 0;

fail:
	nvme_qpair_destroy(qpair);

	return -1;
}

void nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	if (!qpair->ctrlr)
		return; /* Not initialized. */

	if (nvme_qpair_is_admin_queue(qpair))
		_nvme_qpair_admin_qpair_destroy(qpair);

	if (qpair->cmd && !qpair->sq_in_cmb) {
		nvme_free(qpair->cmd);
		qpair->cmd = NULL;
	}
	if (qpair->cpl) {
		nvme_free(qpair->cpl);
		qpair->cpl = NULL;
	}
	if (qpair->tr) {
		nvme_free(qpair->tr);
		qpair->tr = NULL;
	}
	nvme_request_pool_destroy(qpair);

	qpair->ctrlr = NULL;

	pthread_mutex_destroy(&qpair->lock);
}

static bool nvme_qpair_enabled(struct nvme_qpair *qpair)
{
	if (!qpair->enabled && !qpair->ctrlr->resetting)
		nvme_qpair_enable(qpair);

	return qpair->enabled;
}

int nvme_qpair_submit_request(struct nvme_qpair *qpair,
			      struct nvme_request *req)
{
	struct nvme_tracker *tr;
	struct nvme_request *child_req, *tmp;
	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
	bool child_req_failed = false;
	int ret = 0;

	if (ctrlr->failed) {
		nvme_request_free(req);
		return ENXIO;
	}

	nvme_qpair_enabled(qpair);

	if (req->child_reqs) {

		/*
		 * This is a split (parent) request. Submit all of the
		 * children but not the parent request itself, since the
		 * parent is the original unsplit request.
		 */
		TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
			if (!child_req_failed) {
				ret = nvme_qpair_submit_request(qpair, child_req);
				if (ret != 0)
					child_req_failed = true;
			} else {
				/* Free the remaining child requests since
				 * one child request failed. */
				nvme_request_remove_child(req, child_req);
				nvme_request_free(child_req);
			}
		}

		return ret;
	}

	pthread_mutex_lock(&qpair->lock);

	tr = LIST_FIRST(&qpair->free_tr);
	if (tr == NULL || !qpair->enabled || !STAILQ_EMPTY(&qpair->queued_req)) {
		/*
		 * No tracker is available, the qpair is disabled due
		 * to an in-progress controller-level reset, or
		 * there are already queued requests.
		 *
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion or when the controller reset is completed.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		pthread_mutex_unlock(&qpair->lock);

		if (tr)
			nvme_qpair_submit_queued_requests(qpair);
		return 0;
	}

	/* remove tr from free_tr */
	LIST_REMOVE(tr, list);
	LIST_INSERT_HEAD(&qpair->outstanding_tr, tr, list);
	tr->req = req;
	req->cmd.cid = tr->cid;

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields zeroed */
		ret = 0;
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
		ret = _nvme_qpair_build_contig_request(qpair, req, tr);
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & NVME_CTRLR_SGL_SUPPORTED)
			ret = _nvme_qpair_build_hw_sgl_request(qpair, req, tr);
		else
			ret = _nvme_qpair_build_prps_sgl_request(qpair, req, tr);
	} else {
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_INVALID_FIELD,
						   1 /* do not retry */, true);
		ret = -EINVAL;
	}

	if (ret == 0)
		nvme_qpair_submit_tracker(qpair, tr);

	pthread_mutex_unlock(&qpair->lock);

	return ret;
}

/*
 * Poll for completion of NVMe commands submitted to the
 * specified I/O queue pair.
 */
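/*
 * A typical caller drives this in a loop, for example (sketch only;
 * submission of I/O and the termination condition are application
 * specific, and pending_io is a hypothetical per-application counter
 * of outstanding commands):
 *
 *	do {
 *		nvme_qpair_poll(qpair, 0);
 *	} while (pending_io > 0);
 */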
unsigned int nvme_qpair_poll(struct nvme_qpair *qpair,
			     unsigned int max_completions)
{
	struct nvme_tracker *tr;
	struct nvme_cpl	*cpl;
	uint32_t num_completions = 0;

	if (!nvme_qpair_enabled(qpair))
		/*
		 * The qpair is not enabled, likely because a controller
		 * reset is in progress. Do not process completions now:
		 * any I/O submitted on this queue will be retried once
		 * the reset is complete.
		 */
		return 0;

	if ((max_completions == 0) ||
	    (max_completions > (qpair->entries - 1U)))
		/*
		 * max_completions == 0 means unlimited, but complete at most
		 * one queue depth batch of I/O at a time so that the completion
		 * queue doorbells don't wrap around.
		 */
		max_completions = qpair->entries - 1;

	pthread_mutex_lock(&qpair->lock);

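	/*
	 * A completion entry is new only when its phase bit matches the
	 * phase the qpair currently expects; stale entries from the
	 * previous pass through the ring still carry the old phase.
	 */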
	while (1) {

		cpl = &qpair->cpl[qpair->cq_head];
		if (cpl->status.p != qpair->phase)
			break;

		tr = &qpair->tr[cpl->cid];
		if (tr->active) {
			nvme_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			nvme_info("cpl does not map to outstanding cmd\n");
			nvme_qpair_print_completion(qpair, cpl);
			nvme_panic("received completion for unknown cmd\n");
		}

		if (++qpair->cq_head == qpair->entries) {
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		}

		if (++num_completions == max_completions)
			break;
	}

	if (num_completions > 0)
		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);

	pthread_mutex_unlock(&qpair->lock);

	if (!STAILQ_EMPTY(&qpair->queued_req))
		nvme_qpair_submit_queued_requests(qpair);

	return num_completions;
}

void nvme_qpair_reset(struct nvme_qpair *qpair)
{
	pthread_mutex_lock(&qpair->lock);

	qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * On the first pass through the completion queue, the hardware
	 * sets the phase bit of new completions to 1, so start out
	 * expecting a phase of 1. The expected phase is toggled each
	 * time the completion queue wraps around.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0, qpair->entries * sizeof(struct nvme_cmd));
	memset(qpair->cpl, 0, qpair->entries * sizeof(struct nvme_cpl));

	pthread_mutex_unlock(&qpair->lock);
}

void nvme_qpair_enable(struct nvme_qpair *qpair)
{
	pthread_mutex_lock(&qpair->lock);

	if (nvme_qpair_is_io_queue(qpair))
		_nvme_qpair_io_qpair_enable(qpair);
	else
		_nvme_qpair_admin_qpair_enable(qpair);

	pthread_mutex_unlock(&qpair->lock);
}

void nvme_qpair_disable(struct nvme_qpair *qpair)
{
	pthread_mutex_lock(&qpair->lock);

	if (nvme_qpair_is_io_queue(qpair))
		_nvme_qpair_io_qpair_disable(qpair);
	else
		_nvme_qpair_admin_qpair_disable(qpair);

	pthread_mutex_unlock(&qpair->lock);
}

void nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_request *req;

	pthread_mutex_lock(&qpair->lock);

	while (!STAILQ_EMPTY(&qpair->queued_req)) {

		nvme_notice("Failing queued I/O command\n");
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   true);

	}

	/* Manually abort each outstanding I/O. */
	while (!LIST_EMPTY(&qpair->outstanding_tr)) {

		/*
		 * Do not remove the tracker. The abort_tracker path
		 * will do that for us.
		 */
		nvme_notice("Failing outstanding I/O command\n");
		tr = LIST_FIRST(&qpair->outstanding_tr);
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   1, true);

	}

	pthread_mutex_unlock(&qpair->lock);
}