nvme_ctrlr.c revision 240616
/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/nvme/nvme_ctrlr.c 240616 2012-09-17 19:23:01Z jimharris $");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

static void
nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_completion	*cpl = arg;
	struct mtx		*mtx;

	/*
	 * Copy status into the argument passed by the caller, so that
	 *  the caller can check the status to determine if the
	 *  request passed or failed.
	 */
	memcpy(cpl, status, sizeof(*cpl));
	mtx = mtx_pool_find(mtxpool_sleep, cpl);
	mtx_lock(mtx);
	wakeup(cpl);
	mtx_unlock(mtx);
}
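
/*
 * A minimal sketch (not part of the driver) of the caller-side pattern
 *  that nvme_ctrlr_cb pairs with -- see nvme_ctrlr_identify() below.
 *  mtx_pool_find() returns the same pool mutex for a given address, so
 *  the sleeping caller and this callback rendezvous on the address of
 *  the caller's completion structure:
 *
 *	struct nvme_completion	cpl;
 *	struct mtx		*mtx = mtx_pool_find(mtxpool_sleep, &cpl);
 *
 *	mtx_lock(mtx);
 *	(submit an admin command with nvme_ctrlr_cb and &cpl as the
 *	 callback and its argument)
 *	msleep(&cpl, mtx, PRIBIO, "nvme_wait", hz*5);
 *	mtx_unlock(mtx);
 *	(inspect cpl.sf_sc and cpl.sf_sct for the command status)
 */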

static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

	/* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		ctrlr->resource_id = PCIR_BAR(2);
	else
		ctrlr->resource_id = PCIR_BAR(0);

	ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
	    &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

	if (ctrlr->resource == NULL) {
		device_printf(ctrlr->dev, "unable to allocate pci resource\n");
		return (ENOMEM);
	}

	ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
	ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
	ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

	return (0);
}

#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

	ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
	ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
	    SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
	    RF_ACTIVE);

	if (ctrlr->chatham_resource == NULL) {
		device_printf(ctrlr->dev, "unable to alloc pci resource\n");
		return (ENOMEM);
	}

	ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
	ctrlr->chatham_bus_handle =
	    rman_get_bushandle(ctrlr->chatham_resource);

	return (0);
}

static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
	uint64_t reg1, reg2, reg3;
	uint64_t temp1, temp2;
	uint32_t temp3;
	uint32_t use_flash_timings = 0;

	DELAY(10000);

	temp3 = chatham_read_4(ctrlr, 0x8080);

	device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

	ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
	ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

	device_printf(ctrlr->dev, "Chatham size: %lld\n",
	    (long long)ctrlr->chatham_size);

	reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

	TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
	if (use_flash_timings) {
		device_printf(ctrlr->dev, "Chatham: using flash timings\n");
		temp1 = 0x00001b58000007d0LL;
		temp2 = 0x000000cb00000131LL;
	} else {
		device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
		temp1 = temp2 = 0x0LL;
	}

	chatham_write_8(ctrlr, 0x8000, reg1);
	chatham_write_8(ctrlr, 0x8008, reg2);
	chatham_write_8(ctrlr, 0x8010, reg3);

	chatham_write_8(ctrlr, 0x8020, temp1);
	temp3 = chatham_read_4(ctrlr, 0x8020);

	chatham_write_8(ctrlr, 0x8028, temp2);
	temp3 = chatham_read_4(ctrlr, 0x8028);

	chatham_write_8(ctrlr, 0x8030, temp1);
	chatham_write_8(ctrlr, 0x8038, temp2);
	chatham_write_8(ctrlr, 0x8040, temp1);
	chatham_write_8(ctrlr, 0x8048, temp2);
	chatham_write_8(ctrlr, 0x8050, temp1);
	chatham_write_8(ctrlr, 0x8058, temp2);

	DELAY(10000);
}

static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
	struct nvme_controller_data *cdata;

	cdata = &ctrlr->cdata;

	cdata->vid = 0x8086;
	cdata->ssvid = 0x2011;

	/*
	 * Chatham2 puts garbage data in these fields when we
	 *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
	 *  the fields before copying in the replacement strings.
	 */
	memset(cdata->sn, 0, sizeof(cdata->sn));
	memcpy(cdata->sn, "2012", strlen("2012"));
	memset(cdata->mn, 0, sizeof(cdata->mn));
	memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
	memset(cdata->fr, 0, sizeof(cdata->fr));
	memcpy(cdata->fr, "0", strlen("0"));
	cdata->rab = 8;
	cdata->aerl = 3;
	cdata->lpa.ns_smart = 1;
	cdata->sqes.min = 6;	/* 64-byte (2^6) submission queue entries */
	cdata->sqes.max = 6;
	cdata->cqes.min = 4;	/* 16-byte (2^4) completion queue entries */
	cdata->cqes.max = 4;
	cdata->nn = 1;

	/* Chatham2 doesn't support the DSM command. */
	cdata->oncs.dsm = 0;

	cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */

static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	uint32_t		num_entries;

	qpair = &ctrlr->adminq;

	num_entries = NVME_ADMIN_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
	/*
	 * If admin_entries was overridden to an invalid value, revert to
	 *  our default value.
	 */
	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
		printf("nvme: invalid hw.nvme.admin_entries=%d specified\n",
		    num_entries);
		num_entries = NVME_ADMIN_ENTRIES;
	}

	/*
	 * The admin queue's max xfer size is treated differently than the
	 *  max I/O xfer size.  16KB is sufficient here - maybe even less?
	 */
	nvme_qpair_construct(qpair, 0, 0, num_entries, 16*1024, ctrlr);
}

static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	union cap_lo_register	cap_lo;
	int			i, num_entries;

	num_entries = NVME_IO_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

	num_entries = max(num_entries, NVME_MIN_IO_ENTRIES);

	/*
	 * The NVMe spec sets a hard limit of 64K entries per queue, but
	 *  devices may specify a smaller limit, so we need to check
	 *  the MQES field in the capabilities register.
	 */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	num_entries = min(num_entries, cap_lo.bits.mqes+1);

	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
	TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
	/*
	 * Check that the tunable doesn't specify a size greater than what
	 *  our driver supports, and is a multiple of PAGE_SIZE.
	 */
	if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
	    ctrlr->max_xfer_size % PAGE_SIZE)
		ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
	    M_NVME, M_ZERO | M_NOWAIT);

	if (ctrlr->ioq == NULL)
		return (ENOMEM);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		/*
		 * The admin queue has ID=0.  I/O queues start at ID=1 -
		 *  hence the 'i+1' here.
		 *
		 * For I/O queues, use the controller-wide max_xfer_size
		 *  calculated in nvme_attach().
		 */
		nvme_qpair_construct(qpair,
				     i+1, /* qpair ID */
				     ctrlr->msix_enabled ? i+1 : 0, /* vector */
				     num_entries,
				     ctrlr->max_xfer_size,
				     ctrlr);

		if (ctrlr->per_cpu_io_queues)
			bus_bind_intr(ctrlr->dev, qpair->res, i);
	}

	return (0);
}
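
/*
 * For reference, the loader tunables consumed in this file.  The values
 *  below are illustrative examples, not recommendations:
 *
 *	hw.nvme.admin_entries=128
 *	hw.nvme.io_entries=256
 *	hw.nvme.max_xfer_size=131072
 *	hw.nvme.per_cpu_io_queues=1
 *	hw.nvme.force_intx=0
 *	hw.nvme.int_coal_time=0
 *	hw.nvme.int_coal_threshold=0
 *	hw.nvme.use_flash_timings=0	(Chatham only)
 */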

static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
	int ms_waited;
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (!cc.bits.en) {
		device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
		    __func__);
		return (ENXIO);
	}

	ms_waited = 0;

	while (!csts.bits.rdy) {
		DELAY(1000);
		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
			device_printf(ctrlr->dev, "controller did not become "
			    "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
			return (ENXIO);
		}
		csts.raw = nvme_mmio_read_4(ctrlr, csts);
	}

	return (0);
}

static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1 && csts.bits.rdy == 0)
		nvme_ctrlr_wait_for_ready(ctrlr);

	cc.bits.en = 0;
	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);
}

static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	union aqa_register	aqa;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1) {
		if (csts.bits.rdy == 1)
			return (0);
		else
			return (nvme_ctrlr_wait_for_ready(ctrlr));
	}

	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
	DELAY(5000);
	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
	DELAY(5000);

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = ctrlr->adminq.num_entries-1;
	aqa.bits.asqs = ctrlr->adminq.num_entries-1;
	nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
	DELAY(5000);

	cc.bits.en = 1;
	cc.bits.css = 0;
	cc.bits.ams = 0;
	cc.bits.shn = 0;
	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

	/*
	 * MPS encodes the memory page size as 2^(12+MPS).  For 4KB
	 *  pages this evaluates to 0, as the spec requires.
	 */
	cc.bits.mps = (PAGE_SIZE >> 13);

	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);

	return (nvme_ctrlr_wait_for_ready(ctrlr));
}
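
/*
 * Worked example of the admin queue programming above, assuming a
 *  (hypothetical) 64-entry admin queue and 4KB pages: AQA.acqs and
 *  AQA.asqs are both written as 63, since the sizes are 0-based;
 *  CC.iosqes = 6 and CC.iocqes = 4 select 64-byte (2^6) submission
 *  and 16-byte (2^4) completion queue entries; and CC.mps = 0
 *  selects a 2^(12+0) = 4KB memory page size.
 */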

int
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{

	nvme_ctrlr_disable(ctrlr);
	return (nvme_ctrlr_enable(ctrlr));
}

static void
nvme_async_event_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_controller *ctrlr = arg;

	printf("Asynchronous event occurred.\n");

	/* TODO: decode async event type based on status */
	/* TODO: check status for any error bits */

	/*
	 * Repost an asynchronous event request so that it can be
	 *  used again by the controller.
	 */
	nvme_ctrlr_cmd_asynchronous_event_request(ctrlr, nvme_async_event_cb,
	    ctrlr);
}

static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_identify_controller failed!\n");
		return (ENXIO);
	}

#ifdef CHATHAM2
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		nvme_chatham_populate_cdata(ctrlr);
#endif

	return (0);
}

static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			cq_allocated, sq_allocated, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_set_num_queues failed!\n");
		return (ENXIO);
	}

	/*
	 * Data in cdw0 is 0-based.
	 * The lower 16 bits indicate the number of submission queues
	 *  allocated; the upper 16 bits indicate the number of
	 *  completion queues allocated.
	 */
	sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
	cq_allocated = (cpl.cdw0 >> 16) + 1;

	/*
	 * Check that the controller was able to allocate the number of
	 *  queues we requested.  If not, revert to one I/O queue.
	 */
	if (sq_allocated < ctrlr->num_io_queues ||
	    cq_allocated < ctrlr->num_io_queues) {
		ctrlr->num_io_queues = 1;
		ctrlr->per_cpu_io_queues = 0;

		/*
		 * TODO: destroy extra queues that were created
		 *  previously but are now found to be not needed.
		 */
	}

	return (0);
}
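
/*
 * Example of the cdw0 decoding above: a (hypothetical) completion with
 *  cdw0 == 0x00030003 means the controller allocated 0x0003 + 1 = 4
 *  submission queues and 0x0003 + 1 = 4 completion queues, since both
 *  fields are 0-based.
 */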

static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_qpair	*qpair;
	struct nvme_completion	cpl;
	int			i, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_cq failed!\n");
			return (ENXIO);
		}

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_sq failed!\n");
			return (ENXIO);
		}
	}

	return (0);
}

static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
	struct nvme_namespace	*ns;
	int			i, status;

	for (i = 0; i < ctrlr->cdata.nn; i++) {
		ns = &ctrlr->ns[i];
		status = nvme_ns_construct(ns, i+1, ctrlr);
		if (status != 0)
			return (status);
	}

	return (0);
}

static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
	union nvme_critical_warning_state	state;
	uint8_t					num_async_events;

	state.raw = 0xFF;
	state.bits.reserved = 0;
	nvme_ctrlr_cmd_set_asynchronous_event_config(ctrlr, state, NULL, NULL);

	/* aerl is a zero-based value, so we need to add 1 here. */
	num_async_events = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

	/*
	 * Disable this code for now, since Chatham doesn't support
	 *  AERs so I have no good way to test them.
	 */
#if 0
	for (int i = 0; i < num_async_events; i++)
		nvme_ctrlr_cmd_asynchronous_event_request(ctrlr,
		    nvme_async_event_cb, ctrlr);
#endif
}

static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

	ctrlr->int_coal_time = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
	    &ctrlr->int_coal_time);

	ctrlr->int_coal_threshold = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
	    &ctrlr->int_coal_threshold);

	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
	    ctrlr->int_coal_threshold, NULL, NULL);
}

void
nvme_ctrlr_start(void *ctrlr_arg)
{
	struct nvme_controller *ctrlr = ctrlr_arg;

	if (nvme_ctrlr_identify(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
		goto err;

	nvme_ctrlr_configure_aer(ctrlr);
	nvme_ctrlr_configure_int_coalescing(ctrlr);

	ctrlr->is_started = TRUE;

err:

	/*
	 * Initialize sysctls, even if the controller failed to start, to
	 *  assist with debugging the admin queue pair.
	 */
	nvme_sysctl_initialize_ctrlr(ctrlr);
	config_intrhook_disestablish(&ctrlr->config_hook);
}

static void
nvme_ctrlr_intx_task(void *arg, int pending)
{
	struct nvme_controller *ctrlr = arg;

	/* Process any pending completions on the admin and I/O queues. */
	nvme_qpair_process_completions(&ctrlr->adminq);

	if (ctrlr->ioq[0].cpl)
		nvme_qpair_process_completions(&ctrlr->ioq[0]);

	/* Unmask the interrupt that was masked in the INTx handler. */
	nvme_mmio_write_4(ctrlr, intmc, 1);
}

static void
nvme_ctrlr_intx_handler(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	/*
	 * Mask the interrupt, then defer completion processing to
	 *  taskqueue context.
	 */
	nvme_mmio_write_4(ctrlr, intms, 1);
	taskqueue_enqueue_fast(ctrlr->taskqueue, &ctrlr->task);
}

static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

	ctrlr->num_io_queues = 1;
	ctrlr->per_cpu_io_queues = 0;
	ctrlr->rid = 0;
	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

	if (ctrlr->res == NULL) {
		device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
		return (ENOMEM);
	}

	bus_setup_intr(ctrlr->dev, ctrlr->res,
	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
	    ctrlr, &ctrlr->tag);

	if (ctrlr->tag == NULL) {
		device_printf(ctrlr->dev,
		    "unable to setup legacy interrupt handler\n");
		return (ENOMEM);
	}

	TASK_INIT(&ctrlr->task, 0, nvme_ctrlr_intx_task, ctrlr);
	ctrlr->taskqueue = taskqueue_create_fast("nvme_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_NET,
	    "%s intx taskq", device_get_nameunit(ctrlr->dev));

	return (0);
}

static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvme_controller	*ctrlr;
	struct nvme_completion	cpl;
	struct mtx		*mtx;

	ctrlr = cdev->si_drv1;

	switch (cmd) {
	case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
		/*
		 * Don't refresh data on Chatham, since Chatham returns
		 *  garbage on IDENTIFY anyway.
		 */
		if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
			memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
			break;
		}
#endif
		/* Refresh data before returning to user. */
		mtx = mtx_pool_find(mtxpool_sleep, &cpl);
		mtx_lock(mtx);
		nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
		    nvme_ctrlr_cb, &cpl);
		msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
		mtx_unlock(mtx);
		if (cpl.sf_sc || cpl.sf_sct)
			return (ENXIO);
		memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
		break;
	default:
		return (ENOTTY);
	}

	return (0);
}
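
/*
 * A hypothetical userland sketch of the ioctl above, using the nvme%d
 *  device node created in nvme_ctrlr_construct().  This assumes
 *  NVME_IDENTIFY_CONTROLLER carries a struct nvme_controller_data,
 *  consistent with the memcpy() of ctrlr->cdata above:
 *
 *	struct nvme_controller_data	cdata;
 *	int				fd;
 *
 *	fd = open("/dev/nvme0", O_RDWR);
 *	if (fd >= 0 && ioctl(fd, NVME_IDENTIFY_CONTROLLER, &cdata) == 0)
 *		(cdata now holds the refreshed identify data)
 */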

static struct cdevsw nvme_ctrlr_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	0,
	.d_ioctl =	nvme_ctrlr_ioctl
};

int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
	union cap_lo_register	cap_lo;
	union cap_hi_register	cap_hi;
	int			num_vectors, per_cpu_io_queues, status = 0;

	ctrlr->dev = dev;
	ctrlr->is_started = FALSE;

	status = nvme_ctrlr_allocate_bar(ctrlr);

	if (status != 0)
		return (status);

#ifdef CHATHAM2
	if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
		status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
		if (status != 0)
			return (status);
		nvme_ctrlr_setup_chatham(ctrlr);
	}
#endif

	/*
	 * Software emulators may set the doorbell stride to something
	 *  other than zero, but this driver is not set up to handle that.
	 */
	cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
	if (cap_hi.bits.dstrd != 0)
		return (ENXIO);

	/* Get ready timeout value from controller, in units of 500ms. */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

	per_cpu_io_queues = 1;
	TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
	ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

	if (ctrlr->per_cpu_io_queues)
		ctrlr->num_io_queues = mp_ncpus;
	else
		ctrlr->num_io_queues = 1;

	ctrlr->force_intx = 0;
	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

	ctrlr->msix_enabled = 1;

	if (ctrlr->force_intx) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	/* One vector per IO queue, plus one vector for admin queue. */
	num_vectors = ctrlr->num_io_queues + 1;

	if (pci_msix_count(dev) < num_vectors) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	if (pci_alloc_msix(dev, &num_vectors) != 0)
		ctrlr->msix_enabled = 0;

intx:

	if (!ctrlr->msix_enabled)
		nvme_ctrlr_configure_intx(ctrlr);

	nvme_ctrlr_construct_admin_qpair(ctrlr);

	status = nvme_ctrlr_construct_io_qpairs(ctrlr);

	if (status != 0)
		return (status);

	ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
	    "nvme%d", device_get_unit(dev));

	if (ctrlr->cdev == NULL)
		return (ENXIO);

	ctrlr->cdev->si_drv1 = (void *)ctrlr;

	return (0);
}