// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <threads.h>

#include <ddk/binding.h>
#include <ddk/debug.h>
#include <ddk/device.h>
#include <ddk/driver.h>
#include <ddk/protocol/block.h>
#include <ddk/protocol/pci.h>
#include <ddk/io-buffer.h>

#include <hw/reg.h>
#include <hw/pci.h>

#include <lib/sync/completion.h>

#include <zircon/device/block.h>
#include <zircon/syscalls.h>
#include <zircon/types.h>
#include <zircon/listnode.h>

#include "nvme-hw.h"

// If enabled, gather stats on concurrent io ops,
// pending txns, etc.  Stats are retrieved by
// IOCTL_BLOCK_GET_STATS
#define WITH_STATS 1

#define TXN_FLAG_FAILED 1

typedef struct {
    block_op_t op;
    list_node_t node;
    uint16_t pending_utxns;
    uint8_t opcode;
    uint8_t flags;
} nvme_txn_t;

typedef struct {
    zx_paddr_t phys;    // io buffer phys base (1 page)
    void* virt;         // io buffer virt base
    zx_handle_t pmt;    // pinned memory
    nvme_txn_t* txn;    // related txn
    uint16_t id;
    uint16_t reserved0;
    uint32_t reserved1;
} nvme_utxn_t;

#define UTXN_COUNT 63

// There's no system constant for this.  Ensure it matches reality.
#define PAGE_SHIFT (12ULL)
static_assert(PAGE_SIZE == (1ULL << PAGE_SHIFT), "");

#define PAGE_MASK (PAGE_SIZE - 1ULL)

// Limit maximum transfer size to 1MB which fits comfortably
// within our single scatter gather page per utxn setup
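// (With 4KB pages, prp[1] can point at a list holding 511 further page
// addresses, so one utxn could describe roughly 2MB; a 1MB transfer needs
// at most 257 pages even when misaligned.)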
#define MAX_XFER (1024*1024)

// Maximum submission and completion queue item counts, for
// queues that are a single page in size.
#define SQMAX (PAGE_SIZE / sizeof(nvme_cmd_t))
#define CQMAX (PAGE_SIZE / sizeof(nvme_cpl_t))

// global driver state bits
#define FLAG_IRQ_THREAD_STARTED  0x0001
#define FLAG_IO_THREAD_STARTED   0x0002
#define FLAG_SHUTDOWN            0x0004

#define FLAG_HAS_VWC             0x0100

typedef struct {
    void* io;
    zx_handle_t ioh;
    zx_handle_t irqh;
    zx_handle_t bti;
    uint32_t flags;
    mtx_t lock;

    // io queue doorbell registers
    void* io_sq_tail_db;
    void* io_cq_head_db;

    nvme_cpl_t* io_cq;
    nvme_cmd_t* io_sq;
    uint32_t io_nsid;
    uint16_t io_cq_head;
    uint16_t io_cq_toggle;
    uint16_t io_sq_tail;
    uint16_t io_sq_head;

    uint64_t utxn_avail;   // bitmask of available utxns

    // The pending list is txns that have been received
    // via nvme_queue() and are waiting for io to start.
    // The exception is the head of the pending list which may
    // be partially started, waiting for more utxns to become
    // available.
    // The active list consists of txns where all utxns have
    // been created and we're waiting for them to complete or
    // error out.
    list_node_t pending_txns;      // inbound txns to process
    list_node_t active_txns;       // txns in flight

    // The io signal completion is signaled from nvme_queue()
    // or from the irq thread, notifying the io thread that
    // it has work to do.
    sync_completion_t io_signal;

    uint32_t max_xfer;
    block_info_t info;

    // admin queue doorbell registers
    void* io_admin_sq_tail_db;
    void* io_admin_cq_head_db;

    // admin queues and state
    nvme_cpl_t* admin_cq;
    nvme_cmd_t* admin_sq;
    uint16_t admin_cq_head;
    uint16_t admin_cq_toggle;
    uint16_t admin_sq_tail;
    uint16_t admin_sq_head;

    // context for admin transactions
    // presently we serialize these under the admin_lock
    mtx_t admin_lock;
    sync_completion_t admin_signal;
    nvme_cpl_t admin_result;

    pci_protocol_t pci;
    zx_device_t* zxdev;

    size_t iosz;

    // source of physical pages for queues and admin commands
    io_buffer_t iob;

    thrd_t irqthread;
    thrd_t iothread;

#if WITH_STATS
    size_t stat_concur;
    size_t stat_pending;
    size_t stat_max_concur;
    size_t stat_max_pending;
    size_t stat_total_ops;
    size_t stat_total_blocks;
#endif

    // pool of utxns
    nvme_utxn_t utxn[UTXN_COUNT];
} nvme_device_t;

#if WITH_STATS
#define STAT_INC(name) do { nvme->stat_##name++; } while (0)
#define STAT_DEC(name) do { nvme->stat_##name--; } while (0)
#define STAT_DEC_IF(name, c) do { if (c) nvme->stat_##name--; } while (0)
#define STAT_ADD(name, num) do { nvme->stat_##name += num; } while (0)
#define STAT_INC_MAX(name) do { \
    if (++nvme->stat_##name > nvme->stat_max_##name) { \
        nvme->stat_max_##name = nvme->stat_##name; \
    }} while (0)
#else
#define STAT_INC(name) do { } while (0)
#define STAT_DEC(name) do { } while (0)
#define STAT_DEC_IF(name, c) do { } while (0)
#define STAT_ADD(name, num) do { } while (0)
#define STAT_INC_MAX(name) do { } while (0)
#endif


// We break IO transactions down into one or more "micro transactions" (utxn)
// based on the transfer limits of the controller, etc.  Each utxn has an
// id associated with it, which is used as the command id for the command
// queued to the NVME device.  This id is the same as its index into the
// pool of utxns and the bitmask of free utxns, to simplify management.
//
// We maintain a pool of 63 of these, which is the number of commands
// that can be submitted to NVME via a single page submit queue.
//
// The utxns are not protected by locks.  Instead, after initialization,
// they may only be touched by the io thread, which is responsible for
// queueing commands and dequeuing completion messages.

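// Bit n of utxn_avail being set means utxn[n] is free; all 63 low bits are
// set at init, and utxn_get()/utxn_put() clear and set them as micro
// transactions are claimed and released by the io thread.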
static nvme_utxn_t* utxn_get(nvme_device_t* nvme) {
    uint64_t n = __builtin_ffsll(nvme->utxn_avail);
    if (n == 0) {
        return NULL;
    }
    n--;
    nvme->utxn_avail &= ~(1ULL << n);
    STAT_INC_MAX(concur);
    return nvme->utxn + n;
}

static void utxn_put(nvme_device_t* nvme, nvme_utxn_t* utxn) {
    uint64_t n = utxn->id;
    STAT_DEC(concur);
    nvme->utxn_avail |= (1ULL << n);
}

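// Completion queue entries carry a phase tag in bit 0 of the status field.
// The controller inverts that bit on each pass through the circular queue,
// so an entry is "new" only while its phase bit matches our toggle; the
// toggle is flipped whenever the head index wraps back to zero.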
static zx_status_t nvme_admin_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
    if ((readw(&nvme->admin_cq[nvme->admin_cq_head].status) & 1) != nvme->admin_cq_toggle) {
        return ZX_ERR_SHOULD_WAIT;
    }
    *cpl = nvme->admin_cq[nvme->admin_cq_head];

    // advance the head pointer, wrapping and inverting toggle at max
    uint16_t next = (nvme->admin_cq_head + 1) & (CQMAX - 1);
    if ((nvme->admin_cq_head = next) == 0) {
        nvme->admin_cq_toggle ^= 1;
    }

    // note the new sq head reported by hw
    nvme->admin_sq_head = cpl->sq_head;

    // ring the doorbell
    writel(next, nvme->io_admin_cq_head_db);
    return ZX_OK;
}

static zx_status_t nvme_admin_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
    uint16_t next = (nvme->admin_sq_tail + 1) & (SQMAX - 1);

    // if tail+1 == head: queue is full
    if (next == nvme->admin_sq_head) {
        return ZX_ERR_SHOULD_WAIT;
    }

    nvme->admin_sq[nvme->admin_sq_tail] = *cmd;
    nvme->admin_sq_tail = next;

    // ring the doorbell
    writel(next, nvme->io_admin_sq_tail_db);
    return ZX_OK;
}

static zx_status_t nvme_io_cq_get(nvme_device_t* nvme, nvme_cpl_t* cpl) {
    if ((readw(&nvme->io_cq[nvme->io_cq_head].status) & 1) != nvme->io_cq_toggle) {
        return ZX_ERR_SHOULD_WAIT;
    }
    *cpl = nvme->io_cq[nvme->io_cq_head];

    // advance the head pointer, wrapping and inverting toggle at max
    uint16_t next = (nvme->io_cq_head + 1) & (CQMAX - 1);
    if ((nvme->io_cq_head = next) == 0) {
        nvme->io_cq_toggle ^= 1;
    }

    // note the new sq head reported by hw
    nvme->io_sq_head = cpl->sq_head;
    return ZX_OK;
}

static void nvme_io_cq_ack(nvme_device_t* nvme) {
    // ring the doorbell
    writel(nvme->io_cq_head, nvme->io_cq_head_db);
}

static zx_status_t nvme_io_sq_put(nvme_device_t* nvme, nvme_cmd_t* cmd) {
    uint16_t next = (nvme->io_sq_tail + 1) & (SQMAX - 1);

    // if tail+1 == head: queue is full
    if (next == nvme->io_sq_head) {
        return ZX_ERR_SHOULD_WAIT;
    }

    nvme->io_sq[nvme->io_sq_tail] = *cmd;
    nvme->io_sq_tail = next;

    // ring the doorbell
    writel(next, nvme->io_sq_tail_db);
    return ZX_OK;
}

static int irq_thread(void* arg) {
    nvme_device_t* nvme = arg;
    for (;;) {
        zx_status_t r;
        if ((r = zx_interrupt_wait(nvme->irqh, NULL)) != ZX_OK) {
            zxlogf(ERROR, "nvme: irq wait failed: %d\n", r);
            break;
        }

        nvme_cpl_t cpl;
        if (nvme_admin_cq_get(nvme, &cpl) == ZX_OK) {
            nvme->admin_result = cpl;
            sync_completion_signal(&nvme->admin_signal);
        }

        sync_completion_signal(&nvme->io_signal);
    }
    return 0;
}

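// Issue a single admin command and (optionally) return its completion.
// Admin commands are serialized under admin_lock and each waits up to
// one second for the irq thread to report the completion.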
static zx_status_t nvme_admin_txn(nvme_device_t* nvme, nvme_cmd_t* cmd, nvme_cpl_t* cpl) {
    zx_status_t r;
    mtx_lock(&nvme->admin_lock);
    sync_completion_reset(&nvme->admin_signal);
    if ((r = nvme_admin_sq_put(nvme, cmd)) != ZX_OK) {
        goto done;
    }
    if ((r = sync_completion_wait(&nvme->admin_signal, ZX_SEC(1))) != ZX_OK) {
        zxlogf(ERROR, "nvme: admin txn: timed out\n");
        goto done;
    }

    unsigned code = NVME_CPL_STATUS_CODE(nvme->admin_result.status);
    if (code != 0) {
        zxlogf(ERROR, "nvme: admin txn: nvm error %03x\n", code);
        r = ZX_ERR_IO;
    }
    if (cpl != NULL) {
        *cpl = nvme->admin_result;
    }
done:
    mtx_unlock(&nvme->admin_lock);
    return r;
}

static inline void txn_complete(nvme_txn_t* txn, zx_status_t status) {
    txn->op.completion_cb(&txn->op, status);
}

// Attempt to generate utxns and queue nvme commands for a txn.
// Returns true if the txn could not be fully processed due to a temporary
// lack of resources (the caller should retain it and retry later), or
// false if it either succeeded or errored out.
static bool io_process_txn(nvme_device_t* nvme, nvme_txn_t* txn) {
    zx_handle_t vmo = txn->op.rw.vmo;
    nvme_utxn_t* utxn;
    zx_paddr_t* pages;
    zx_status_t r;

    for (;;) {
        // If there are no available utxns, we can't proceed
        // and we tell the caller to retain the txn (true)
        if ((utxn = utxn_get(nvme)) == NULL) {
            return true;
        }

        uint32_t blocks = txn->op.rw.length;
        if (blocks > nvme->max_xfer) {
            blocks = nvme->max_xfer;
        }

        // Total transfer size in bytes
        size_t bytes = ((size_t) blocks) * ((size_t) nvme->info.block_size);

        // Page offset of first page of transfer
        size_t pageoffset = txn->op.rw.offset_vmo & (~PAGE_MASK);

        // Byte offset into first page of transfer
        size_t byteoffset = txn->op.rw.offset_vmo & PAGE_MASK;

        // Total pages mapped / touched
        size_t pagecount = (byteoffset + bytes + PAGE_MASK) >> PAGE_SHIFT;

        // read disk (OP_READ) -> memory (PERM_WRITE) or
        // write memory (PERM_READ) -> disk (OP_WRITE)
        uint32_t opt = (txn->opcode == NVME_OP_READ) ? ZX_BTI_PERM_WRITE : ZX_BTI_PERM_READ;

        pages = utxn->virt;

        if ((r = zx_bti_pin(nvme->bti, opt, vmo, pageoffset, pagecount << PAGE_SHIFT,
                            pages, pagecount, &utxn->pmt)) != ZX_OK) {
            zxlogf(ERROR, "nvme: could not pin pages: %d\n", r);
            break;
        }

        nvme_cmd_t cmd;
        memset(&cmd, 0, sizeof(cmd));
        cmd.cmd = NVME_CMD_CID(utxn->id) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(txn->opcode);
        cmd.nsid = 1;
        cmd.u.rw.start_lba = txn->op.rw.offset_dev;
        cmd.u.rw.block_count = blocks - 1;
        // The NVME command has room for two data pointers inline.
        // The first is always the pointer to the first page where data is.
        // The second is the second page if pagecount is 2.
        // The second is the address of an array of page 2..n if pagecount > 2
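        // (zx_bti_pin wrote the pinned page addresses into this utxn's own
        // page at utxn->virt/utxn->phys, so the list of pages 2..n is simply
        // utxn->phys + sizeof(uint64_t) -- the same array skipping its
        // first entry.)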
        cmd.dptr.prp[0] = pages[0] | byteoffset;
        if (pagecount == 2) {
            cmd.dptr.prp[1] = pages[1];
        } else if (pagecount > 2) {
            cmd.dptr.prp[1] = utxn->phys + sizeof(uint64_t);
        }

        zxlogf(TRACE, "nvme: txn=%p utxn id=%u pages=%zu op=%s\n", txn, utxn->id, pagecount,
               txn->opcode == NVME_OP_WRITE ? "WR" : "RD");
        zxlogf(SPEW, "nvme: prp[0]=%016zx prp[1]=%016zx\n", cmd.dptr.prp[0], cmd.dptr.prp[1]);
        zxlogf(SPEW, "nvme: pages[] = { %016zx, %016zx, %016zx, %016zx, ... }\n",
               pages[0], pages[1], pages[2], pages[3]);

        if ((r = nvme_io_sq_put(nvme, &cmd)) != ZX_OK) {
            zxlogf(ERROR, "nvme: could not submit cmd (txn=%p id=%u)\n", txn, utxn->id);
            break;
        }

        utxn->txn = txn;

        // keep track of where we are
        txn->op.rw.offset_dev += blocks;
        txn->op.rw.offset_vmo += bytes;
        txn->op.rw.length -= blocks;
        txn->pending_utxns++;

        // If there's no more remaining, we're done, and we
        // move this txn to the active list and tell the
        // caller not to retain the txn (false)
        if (txn->op.rw.length == 0) {
            mtx_lock(&nvme->lock);
            list_add_tail(&nvme->active_txns, &txn->node);
            mtx_unlock(&nvme->lock);
            return false;
        }
    }

    // failure
    if ((r = zx_pmt_unpin(utxn->pmt)) != ZX_OK) {
        zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
    }
    utxn_put(nvme, utxn);

    mtx_lock(&nvme->lock);
    txn->flags |= TXN_FLAG_FAILED;
    if (txn->pending_utxns) {
        // if there are earlier uncompleted IOs we become active now
        // and will finish erroring out when they complete
        list_add_tail(&nvme->active_txns, &txn->node);
        txn = NULL;
    }
    mtx_unlock(&nvme->lock);

    if (txn != NULL) {
        txn_complete(txn, ZX_ERR_INTERNAL);
    }

    // Either way we tell the caller not to retain the txn (false)
    return false;
}

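// Drain the pending list in FIFO order.  If a txn cannot be fully processed
// because we ran out of utxns, it goes back on the head of the pending list
// and we stop until completions free up more utxns.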
static void io_process_txns(nvme_device_t* nvme) {
    nvme_txn_t* txn;

    for (;;) {
        mtx_lock(&nvme->lock);
        txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node);
        STAT_DEC_IF(pending, txn != NULL);
        mtx_unlock(&nvme->lock);

        if (txn == NULL) {
            return;
        }

        if (io_process_txn(nvme, txn)) {
            // put txn back at front of queue for further processing later
            mtx_lock(&nvme->lock);
            list_add_head(&nvme->pending_txns, &txn->node);
            STAT_INC_MAX(pending);
            mtx_unlock(&nvme->lock);
            return;
        }
    }
}

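// Reap completions from the io completion queue: map each completion's
// command id back to its utxn, unpin its pages, and complete the parent
// txn once all of its utxns have finished (or one has failed).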
static void io_process_cpls(nvme_device_t* nvme) {
    bool ring_doorbell = false;
    nvme_cpl_t cpl;

    while (nvme_io_cq_get(nvme, &cpl) == ZX_OK) {
        ring_doorbell = true;

        if (cpl.cmd_id >= UTXN_COUNT) {
            zxlogf(ERROR, "nvme: unexpected cmd id %u\n", cpl.cmd_id);
            continue;
        }
        nvme_utxn_t* utxn = nvme->utxn + cpl.cmd_id;
        nvme_txn_t* txn = utxn->txn;

        if (txn == NULL) {
            zxlogf(ERROR, "nvme: inactive utxn #%u completed?!\n", cpl.cmd_id);
            continue;
        }

        uint32_t code = NVME_CPL_STATUS_CODE(cpl.status);
        if (code != 0) {
            zxlogf(ERROR, "nvme: utxn #%u txn %p failed: status=%03x\n",
                   cpl.cmd_id, txn, code);
            txn->flags |= TXN_FLAG_FAILED;
            // discard any remaining blocks -- no reason to keep creating
            // further utxns once one has failed
            txn->op.rw.length = 0;
        } else {
            zxlogf(SPEW, "nvme: utxn #%u txn %p OKAY\n", cpl.cmd_id, txn);
        }

        zx_status_t r;
        if ((r = zx_pmt_unpin(utxn->pmt)) != ZX_OK) {
            zxlogf(ERROR, "nvme: cannot unpin io buffer: %d\n", r);
        }

        // release the microtransaction
        utxn->txn = NULL;
        utxn_put(nvme, utxn);

        txn->pending_utxns--;
        if ((txn->pending_utxns == 0) && (txn->op.rw.length == 0)) {
            // remove from either pending or active list
            mtx_lock(&nvme->lock);
            list_delete(&txn->node);
            mtx_unlock(&nvme->lock);
            zxlogf(TRACE, "nvme: txn %p %s\n", txn, txn->flags & TXN_FLAG_FAILED ? "error" : "okay");
            txn_complete(txn, txn->flags & TXN_FLAG_FAILED ? ZX_ERR_IO : ZX_OK);
        }
    }

    if (ring_doorbell) {
        nvme_io_cq_ack(nvme);
    }
}

static int io_thread(void* arg) {
    nvme_device_t* nvme = arg;
    for (;;) {
        if (sync_completion_wait(&nvme->io_signal, ZX_TIME_INFINITE)) {
            break;
        }
        if (nvme->flags & FLAG_SHUTDOWN) {
            //TODO: cancel out pending IO
            zxlogf(INFO, "nvme: io thread exiting\n");
            break;
        }

        sync_completion_reset(&nvme->io_signal);

        // process completion messages
        io_process_cpls(nvme);

        // process work queue
        io_process_txns(nvme);

    }
    return 0;
}

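// Block protocol queue entry point.  Runs on the caller's thread: translates
// the block op, validates the range, puts the txn on the pending list, and
// wakes the io thread.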
static void nvme_queue(void* ctx, block_op_t* op) {
    nvme_device_t* nvme = ctx;
    nvme_txn_t* txn = containerof(op, nvme_txn_t, op);

    switch (txn->op.command & BLOCK_OP_MASK) {
    case BLOCK_OP_READ:
        txn->opcode = NVME_OP_READ;
        break;
    case BLOCK_OP_WRITE:
        txn->opcode = NVME_OP_WRITE;
        break;
    case BLOCK_OP_FLUSH:
        // TODO
        txn_complete(txn, ZX_OK);
        return;
    default:
        txn_complete(txn, ZX_ERR_NOT_SUPPORTED);
        return;
    }

    if (txn->op.rw.length == 0) {
        txn_complete(txn, ZX_ERR_INVALID_ARGS);
        return;
    }
    // Transaction must fit within device
    if ((txn->op.rw.offset_dev >= nvme->info.block_count) ||
        (nvme->info.block_count - txn->op.rw.offset_dev < txn->op.rw.length)) {
        txn_complete(txn, ZX_ERR_OUT_OF_RANGE);
        return;
    }

    // convert vmo offset to a byte offset
    txn->op.rw.offset_vmo *= nvme->info.block_size;

    txn->pending_utxns = 0;
    txn->flags = 0;

    zxlogf(SPEW, "nvme: io: %s: %ublks @ blk#%zu\n",
           txn->opcode == NVME_OP_WRITE ? "wr" : "rd",
           txn->op.rw.length + 1U, txn->op.rw.offset_dev);

    STAT_INC(total_ops);
    STAT_ADD(total_blocks, txn->op.rw.length);

    mtx_lock(&nvme->lock);
    list_add_tail(&nvme->pending_txns, &txn->node);
    STAT_INC_MAX(pending);
    mtx_unlock(&nvme->lock);

    sync_completion_signal(&nvme->io_signal);
}

static void nvme_query(void* ctx, block_info_t* info_out, size_t* block_op_size_out) {
    nvme_device_t* nvme = ctx;
    *info_out = nvme->info;
    *block_op_size_out = sizeof(nvme_txn_t);
}

static zx_status_t nvme_ioctl(void* ctx, uint32_t op, const void* cmd, size_t cmdlen, void* reply,
                              size_t max, size_t* out_actual) {
    nvme_device_t* nvme = ctx;
    switch (op) {
    case IOCTL_BLOCK_GET_INFO: {
        if (max < sizeof(block_info_t)) {
            return ZX_ERR_BUFFER_TOO_SMALL;
        }
        size_t sz;
        nvme_query(nvme, reply, &sz);
        *out_actual = sizeof(block_info_t);
        return ZX_OK;
    }
    case IOCTL_BLOCK_GET_STATS: {
#if WITH_STATS
        if (cmdlen != sizeof(bool)) {
            return ZX_ERR_INVALID_ARGS;
        }
        block_stats_t* out = reply;
        if (max < sizeof(*out)) {
            return ZX_ERR_BUFFER_TOO_SMALL;
        }
        mtx_lock(&nvme->lock);
        out->max_concur = nvme->stat_max_concur;
        out->max_pending = nvme->stat_max_pending;
        out->total_ops = nvme->stat_total_ops;
        out->total_blocks = nvme->stat_total_blocks;
        bool clear = *(bool *)cmd;
        if (clear) {
            nvme->stat_max_concur = 0;
            nvme->stat_max_pending = 0;
            nvme->stat_total_ops = 0;
            nvme->stat_total_blocks = 0;
        }
        mtx_unlock(&nvme->lock);
        *out_actual = sizeof(*out);
        return ZX_OK;
#else
        return ZX_ERR_NOT_SUPPORTED;
#endif
    }
    case IOCTL_DEVICE_SYNC: {
        return ZX_OK;
    }
    default:
        return ZX_ERR_NOT_SUPPORTED;
    }
}

static zx_off_t nvme_get_size(void* ctx) {
    nvme_device_t* nvme = ctx;
    return nvme->info.block_count * nvme->info.block_size;
}

static zx_status_t nvme_suspend(void* ctx, uint32_t flags) {
    return ZX_OK;
}

static zx_status_t nvme_resume(void* ctx, uint32_t flags) {
    return ZX_OK;
}

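// Teardown: set the shutdown flag, close the mmio/irq/bti handles so the
// irq thread's wait fails, join both threads, then fail any txns that are
// still pending or in flight before freeing the device.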
static void nvme_release(void* ctx) {
    nvme_device_t* nvme = ctx;
    int r;

    zxlogf(INFO, "nvme: release\n");
    nvme->flags |= FLAG_SHUTDOWN;
    if (nvme->ioh != ZX_HANDLE_INVALID) {
        pci_enable_bus_master(&nvme->pci, false);
        zx_handle_close(nvme->bti);
        zx_handle_close(nvme->ioh);
        // TODO: risks a handle use-after-close, will be resolved by IRQ api
        // changes coming soon
        zx_handle_close(nvme->irqh);
    }
    if (nvme->flags & FLAG_IRQ_THREAD_STARTED) {
        thrd_join(nvme->irqthread, &r);
    }
    if (nvme->flags & FLAG_IO_THREAD_STARTED) {
        sync_completion_signal(&nvme->io_signal);
        thrd_join(nvme->iothread, &r);
    }

    // error out any pending txns
    mtx_lock(&nvme->lock);
    nvme_txn_t* txn;
    while ((txn = list_remove_head_type(&nvme->active_txns, nvme_txn_t, node)) != NULL) {
        txn_complete(txn, ZX_ERR_PEER_CLOSED);
    }
    while ((txn = list_remove_head_type(&nvme->pending_txns, nvme_txn_t, node)) != NULL) {
        txn_complete(txn, ZX_ERR_PEER_CLOSED);
    }
    mtx_unlock(&nvme->lock);

    io_buffer_release(&nvme->iob);
    free(nvme);
}

static zx_protocol_device_t device_ops = {
    .version = DEVICE_OPS_VERSION,

    .ioctl = nvme_ioctl,
    .get_size = nvme_get_size,

    .suspend = nvme_suspend,
    .resume = nvme_resume,
    .release = nvme_release,
};

static void infostring(const char* prefix, uint8_t* str, size_t len) {
    char tmp[len + 1];
    size_t i;
    for (i = 0; i < len; i++) {
        uint8_t c = str[i];
        if (c == 0) {
            break;
        }
        if ((c < ' ') || (c > 127)) {
            c = ' ';
        }
        tmp[i] = c;
    }
    tmp[i] = 0;
    while (i > 0) {
        i--;
        if (tmp[i] == ' ') {
            tmp[i] = 0;
        } else {
            break;
        }
    }
    zxlogf(INFO, "nvme: %s'%s'\n", prefix, tmp);
}

// Convenience accessors for BAR0 registers
#define rd32(r) readl(nvme->io + NVME_REG_##r)
#define rd64(r) readll(nvme->io + NVME_REG_##r)
#define wr32(v,r) writel(v, nvme->io + NVME_REG_##r)
#define wr64(v,r) writell(v, nvme->io + NVME_REG_##r)

// dedicated pages from the page pool
#define IDX_ADMIN_SQ   0
#define IDX_ADMIN_CQ   1
#define IDX_IO_SQ      2
#define IDX_IO_CQ      3
#define IDX_SCRATCH    4
#define IDX_UTXN_POOL  5 // this must always be last

#define IO_PAGE_COUNT  (IDX_UTXN_POOL + UTXN_COUNT)
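// (5 fixed pages -- admin SQ/CQ, IO SQ/CQ, scratch -- plus one PRP page
// per utxn: 5 + 63 = 68 pages, i.e. 272KB with 4KB pages.)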

static inline uint64_t U64(uint8_t* x) {
    return *((uint64_t*) (void*) x);
}
static inline uint32_t U32(uint8_t* x) {
    return *((uint32_t*) (void*) x);
}
static inline uint16_t U16(uint8_t* x) {
    return *((uint16_t*) (void*) x);
}

#define WAIT_MS 5000

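// Bring-up sequence for the controller: report capabilities, reset it if it
// was left running, program the admin queues, enable, identify the
// controller, create one IO submission/completion queue pair, identify
// namespace 1, and derive the transfer limits before making the device
// visible.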
static zx_status_t nvme_init(nvme_device_t* nvme) {
    uint32_t n = rd32(VS);
    uint64_t cap = rd64(CAP);

    zxlogf(INFO, "nvme: version %d.%d.%d\n", n >> 16, (n >> 8) & 0xFF, n & 0xFF);
    zxlogf(INFO, "nvme: page size: (MPSMIN): %u (MPSMAX): %u\n",
           (unsigned) (1 << NVME_CAP_MPSMIN(cap)),
           (unsigned) (1 << NVME_CAP_MPSMAX(cap)));
    zxlogf(INFO, "nvme: doorbell stride: %u\n", (unsigned) (1 << NVME_CAP_DSTRD(cap)));
    zxlogf(INFO, "nvme: timeout: %u ms\n", (unsigned) (1 << NVME_CAP_TO(cap)));
    zxlogf(INFO, "nvme: boot partition support (BPS): %c\n", NVME_CAP_BPS(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: supports NVM command set (CSS:NVM): %c\n", NVME_CAP_CSS_NVM(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: subsystem reset supported (NSSRS): %c\n", NVME_CAP_NSSRS(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: weighted-round-robin (AMS:WRR): %c\n", NVME_CAP_AMS_WRR(cap) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: vendor-specific arbitration (AMS:VS): %c\n", NVME_CAP_AMS_VS(cap) ? 'Y' : 'N');
791    zxlogf(INFO, "nvme: contiquous queues required (CQR): %c\n", NVME_CAP_CQR(cap) ? 'Y' : 'N');
792    zxlogf(INFO, "nvme: maximum queue entries supported (MQES): %u\n", ((unsigned) NVME_CAP_MQES(cap)) + 1);
793
794    if ((1 << NVME_CAP_MPSMIN(cap)) > PAGE_SIZE) {
795        zxlogf(ERROR, "nvme: minimum page size larger than platform page size\n");
796        return ZX_ERR_NOT_SUPPORTED;
797    }
798    // allocate pages for various queues and the utxn scatter lists
799    // TODO: these should all be RO to hardware apart from the scratch io page(s)
800    if (io_buffer_init(&nvme->iob, nvme->bti, PAGE_SIZE * IO_PAGE_COUNT, IO_BUFFER_RW) ||
801        io_buffer_physmap(&nvme->iob)) {
802        zxlogf(ERROR, "nvme: could not allocate io buffers\n");
803        return ZX_ERR_NO_MEMORY;
804    }
805
806    // initialize the microtransaction pool
807    nvme->utxn_avail = 0x7FFFFFFFFFFFFFFFULL;
808    for (unsigned n = 0; n < UTXN_COUNT; n++) {
809        nvme->utxn[n].id = n;
810        nvme->utxn[n].phys = nvme->iob.phys_list[IDX_UTXN_POOL + n];
811        nvme->utxn[n].virt = nvme->iob.virt + (IDX_UTXN_POOL + n) * PAGE_SIZE;
812    }
813
814    if (rd32(CSTS) & NVME_CSTS_RDY) {
815        zxlogf(INFO, "nvme: controller is active. resetting...\n");
816        wr32(rd32(CC) & ~NVME_CC_EN, CC); // disable
817    }
818
819    // ensure previous shutdown (by us or bootloader) has completed
820    unsigned ms_remain = WAIT_MS;
821    while (rd32(CSTS) & NVME_CSTS_RDY) {
822        if (--ms_remain == 0) {
823            zxlogf(ERROR, "nvme: timed out waiting for CSTS ~RDY\n");
824            return ZX_ERR_INTERNAL;
825        }
826        zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
827    }
828
829    zxlogf(INFO, "nvme: controller inactive. (after %u ms)\n", WAIT_MS - ms_remain);
830
831    // configure admin submission and completion queues
832    wr64(nvme->iob.phys_list[IDX_ADMIN_SQ], ASQ);
833    wr64(nvme->iob.phys_list[IDX_ADMIN_CQ], ACQ);
834    wr32(NVME_AQA_ASQS(SQMAX - 1) | NVME_AQA_ACQS(CQMAX - 1), AQA);
835
836    zxlogf(INFO, "nvme: enabling\n");
837    wr32(NVME_CC_EN | NVME_CC_AMS_RR | NVME_CC_MPS(0) |
838         NVME_CC_IOCQES(NVME_CPL_SHIFT) |
839         NVME_CC_IOSQES(NVME_CMD_SHIFT), CC);
840
841    ms_remain = WAIT_MS;
842    while (!(rd32(CSTS) & NVME_CSTS_RDY)) {
843        if (--ms_remain == 0) {
844            zxlogf(ERROR, "nvme: timed out waiting for CSTS RDY\n");
845            return ZX_ERR_INTERNAL;
846        }
847        zx_nanosleep(zx_deadline_after(ZX_MSEC(1)));
848    }
849    zxlogf(INFO, "nvme: controller ready. (after %u ms)\n", WAIT_MS - ms_remain);
850
851    // registers and buffers for admin queues
852    nvme->io_admin_sq_tail_db = nvme->io + NVME_REG_SQnTDBL(0, cap);
853    nvme->io_admin_cq_head_db = nvme->io + NVME_REG_CQnHDBL(0, cap);
854
855    nvme->admin_sq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_SQ;
856    nvme->admin_sq_head = 0;
857    nvme->admin_sq_tail = 0;
858
859    nvme->admin_cq = nvme->iob.virt + PAGE_SIZE * IDX_ADMIN_CQ;
860    nvme->admin_cq_head = 0;
861    nvme->admin_cq_toggle = 1;
862
863    // registers and buffers for IO queues
864    nvme->io_sq_tail_db = nvme->io + NVME_REG_SQnTDBL(1, cap);
865    nvme->io_cq_head_db = nvme->io + NVME_REG_CQnHDBL(1, cap);
866
867    nvme->io_sq = nvme->iob.virt + PAGE_SIZE * IDX_IO_SQ;
868    nvme->io_sq_head = 0;
869    nvme->io_sq_tail = 0;
870
871    nvme->io_cq = nvme->iob.virt + PAGE_SIZE * IDX_IO_CQ;
872    nvme->io_cq_head = 0;
873    nvme->io_cq_toggle = 1;
874
875    // scratch page for admin ops
876    void* scratch = nvme->iob.virt + PAGE_SIZE * IDX_SCRATCH;
877
878    if (thrd_create_with_name(&nvme->irqthread, irq_thread, nvme, "nvme-irq-thread")) {
879        zxlogf(ERROR, "nvme; cannot create irq thread\n");
        return ZX_ERR_INTERNAL;
    }
    nvme->flags |= FLAG_IRQ_THREAD_STARTED;

    if (thrd_create_with_name(&nvme->iothread, io_thread, nvme, "nvme-io-thread")) {
885        zxlogf(ERROR, "nvme; cannot create io thread\n");
        return ZX_ERR_INTERNAL;
    }
    nvme->flags |= FLAG_IO_THREAD_STARTED;

    nvme_cmd_t cmd;

    // identify device
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
    cmd.nsid = 0;
    cmd.reserved = 0;
    cmd.mptr = 0;
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];
    cmd.dptr.prp[1] = 0;
    cmd.u.raw[0] = 1; // CNS 01

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: device identify op failed\n");
        return ZX_ERR_INTERNAL;
    }

    nvme_identify_t* ci = scratch;
    infostring("model:         ", ci->MN, sizeof(ci->MN));
    infostring("serial number: ", ci->SN, sizeof(ci->SN));
    infostring("firmware:      ", ci->FR, sizeof(ci->FR));

    if ((ci->SQES & 0xF) != NVME_CMD_SHIFT) {
        zxlogf(ERROR, "nvme: SQES minimum is not %ub\n", NVME_CMD_SIZE);
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((ci->CQES & 0xF) != NVME_CPL_SHIFT) {
        zxlogf(ERROR, "nvme: CQES minimum is not %ub\n", NVME_CPL_SIZE);
        return ZX_ERR_NOT_SUPPORTED;
    }
    zxlogf(INFO, "nvme: max outstanding commands: %u\n", ci->MAXCMD);

    uint32_t nscount = ci->NN;
    zxlogf(INFO, "nvme: max namespaces: %u\n", nscount);
    zxlogf(INFO, "nvme: scatter gather lists (SGL): %c %08x\n",
           (ci->SGLS & 3) ? 'Y' : 'N', ci->SGLS);

    // Maximum data transfer size (MDTS) is in units of 2^n * PAGESIZE; MDTS == 0 means no limit
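    // e.g. MDTS == 5 with 4KB pages allows up to 2^5 * 4KB = 128KB per command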
    nvme->max_xfer = 0xFFFFFFFF;
    if ((ci->MDTS != 0) && (ci->MDTS < (31 - PAGE_SHIFT))) {
        nvme->max_xfer = (1 << ci->MDTS) * PAGE_SIZE;
    }

    zxlogf(INFO, "nvme: max data transfer: %u bytes\n", nvme->max_xfer);
    zxlogf(INFO, "nvme: sanitize caps: %u\n", ci->SANICAP & 3);

    zxlogf(INFO, "nvme: abort command limit (ACL): %u\n", ci->ACL + 1);
    zxlogf(INFO, "nvme: asynch event req limit (AERL): %u\n", ci->AERL + 1);
    zxlogf(INFO, "nvme: firmware: slots: %u reset: %c slot1ro: %c\n", (ci->FRMW >> 1) & 3,
           (ci->FRMW & (1 << 4)) ? 'N' : 'Y', (ci->FRMW & 1) ? 'Y' : 'N');
    zxlogf(INFO, "nvme: host buffer: min/preferred: %u/%u pages\n", ci->HMMIN, ci->HMPRE);
    zxlogf(INFO, "nvme: capacity: total/unalloc: %zu/%zu\n", ci->TNVMCAP_LO, ci->UNVMCAP_LO);

    if (ci->VWC & 1) {
        nvme->flags |= FLAG_HAS_VWC;
    }
    uint32_t awun = ci->AWUN + 1;
    uint32_t awupf = ci->AWUPF + 1;
    zxlogf(INFO, "nvme: volatile write cache (VWC): %s\n", nvme->flags & FLAG_HAS_VWC ? "Y" : "N");
    zxlogf(INFO, "nvme: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", awun, awupf);

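    // FEATURE(OACS, FORMAT_NVM) pastes its arguments together, expanding to
    // a check of (ci->OACS & OACS_FORMAT_NVM) and logging the feature if set.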
#define FEATURE(a,b) if (ci->a & a##_##b) zxlogf(INFO, "nvme: feature: %s\n", #b)
    FEATURE(OACS, DOORBELL_BUFFER_CONFIG);
    FEATURE(OACS, VIRTUALIZATION_MANAGEMENT);
    FEATURE(OACS, NVME_MI_SEND_RECV);
    FEATURE(OACS, DIRECTIVE_SEND_RECV);
    FEATURE(OACS, DEVICE_SELF_TEST);
    FEATURE(OACS, NAMESPACE_MANAGEMENT);
    FEATURE(OACS, FIRMWARE_DOWNLOAD_COMMIT);
    FEATURE(OACS, FORMAT_NVM);
    FEATURE(OACS, SECURITY_SEND_RECV);
    FEATURE(ONCS, TIMESTAMP);
    FEATURE(ONCS, RESERVATIONS);
    FEATURE(ONCS, SAVE_SELECT_NONZERO);
    FEATURE(ONCS, WRITE_UNCORRECTABLE);
    FEATURE(ONCS, COMPARE);

    // set feature (number of queues) to 1 iosq and 1 iocq
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_SET_FEATURE);
    cmd.u.raw[0] = NVME_FEATURE_NUMBER_OF_QUEUES;
    cmd.u.raw[1] = 0;

    nvme_cpl_t cpl;
    if (nvme_admin_txn(nvme, &cmd, &cpl) != ZX_OK) {
        zxlogf(ERROR, "nvme: set feature (number queues) op failed\n");
        return ZX_ERR_INTERNAL;
    }
    zxlogf(INFO, "cpl.cmd %08x\n", cpl.cmd);

    // create the IO completion queue
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOCQ);
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_CQ];
    cmd.u.raw[0] = ((CQMAX - 1) << 16) | 1; // queue size, queue id
    cmd.u.raw[1] = (0 << 16) | 2 | 1; // irq vector, irq enable, phys contig

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: completion queue creation op failed\n");
        return ZX_ERR_INTERNAL;
    }

    // create the IO submit queue
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_CREATE_IOSQ);
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_IO_SQ];
    cmd.u.raw[0] = ((SQMAX - 1) << 16) | 1; // queue size, queue id
    cmd.u.raw[1] = (1 << 16) | 0 | 1; // cqid, qprio, phys contig

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: submit queue creation op failed\n");
        return ZX_ERR_INTERNAL;
    }

    // identify namespace 1
    memset(&cmd, 0, sizeof(cmd));
    cmd.cmd = NVME_CMD_CID(0) | NVME_CMD_PRP | NVME_CMD_NORMAL | NVME_CMD_OPC(NVME_ADMIN_OP_IDENTIFY);
    cmd.nsid = 1;
    cmd.dptr.prp[0] = nvme->iob.phys_list[IDX_SCRATCH];

    if (nvme_admin_txn(nvme, &cmd, NULL) != ZX_OK) {
        zxlogf(ERROR, "nvme: namespace identify op failed\n");
        return ZX_ERR_INTERNAL;
    }

    nvme_identify_ns_t* ni = scratch;

    uint32_t nawun = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUN + 1U) : awun;
    uint32_t nawupf = (ni->NSFEAT & NSFEAT_LOCAL_ATOMIC_SIZES) ? (ni->NAWUPF + 1U) : awupf;
    zxlogf(INFO, "nvme: ns: atomic write unit (AWUN)/(AWUPF): %u/%u blks\n", nawun, nawupf);
    zxlogf(INFO, "nvme: ns: NABSN/NABO/NABSPF/NOIOB: %u/%u/%u/%u\n",
           ni->NABSN, ni->NABO, ni->NABSPF, ni->NOIOB);

    // table of block formats
    for (unsigned i = 0; i < 16; i++) {
        if (ni->LBAF[i]) {
            zxlogf(INFO, "nvme: ns: LBA FMT %02d: RP=%u LBADS=2^%ub MS=%ub\n",
                    i, NVME_LBAFMT_RP(ni->LBAF[i]), NVME_LBAFMT_LBADS(ni->LBAF[i]),
                    NVME_LBAFMT_MS(ni->LBAF[i]));
        }
    }

    zxlogf(INFO, "nvme: ns: LBA FMT #%u active\n", ni->FLBAS & 0xF);
    zxlogf(INFO, "nvme: ns: data protection: caps/set: 0x%02x/%u\n",
           ni->DPC & 0x3F, ni->DPS & 3);

    uint32_t fmt = ni->LBAF[ni->FLBAS & 0xF];

    zxlogf(INFO, "nvme: ns: size/cap/util: %zu/%zu/%zu blks\n", ni->NSSZ, ni->NCAP, ni->NUSE);

    nvme->info.block_count = ni->NSSZ;
    nvme->info.block_size = 1 << NVME_LBAFMT_LBADS(fmt);
    nvme->info.max_transfer_size = BLOCK_MAX_TRANSFER_UNBOUNDED;

    if (NVME_LBAFMT_MS(fmt)) {
        zxlogf(ERROR, "nvme: cannot handle LBA format with metadata\n");
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((nvme->info.block_size < 512) || (nvme->info.block_size > 32768)) {
        zxlogf(ERROR, "nvme: cannot handle LBA size of %u\n", nvme->info.block_size);
        return ZX_ERR_NOT_SUPPORTED;
    }

    // NVME r/w commands operate in block units, maximum of 64K:
    size_t max_bytes_per_cmd = ((size_t) nvme->info.block_size) * ((size_t) 65536);

    if (nvme->max_xfer > max_bytes_per_cmd) {
        nvme->max_xfer = max_bytes_per_cmd;
    }

    // The device may allow transfers larger than we are prepared
    // to handle.  Clip to our limit.
    if (nvme->max_xfer > MAX_XFER) {
        nvme->max_xfer = MAX_XFER;
    }

    // convert to block units
    nvme->max_xfer /= nvme->info.block_size;
    zxlogf(INFO, "nvme: max transfer per r/w op: %u blocks (%u bytes)\n",
           nvme->max_xfer, nvme->max_xfer * nvme->info.block_size);

    device_make_visible(nvme->zxdev);
    return ZX_OK;
}

block_protocol_ops_t block_ops = {
    .query = nvme_query,
    .queue = nvme_queue,
};

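// Bind: map BAR0, pick an IRQ mode (MSI-X, then MSI, then legacy), enable
// bus mastering, grab the BTI, and add the device invisibly; nvme_init()
// makes it visible once the controller is brought up.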
static zx_status_t nvme_bind(void* ctx, zx_device_t* dev) {
    nvme_device_t* nvme;
    if ((nvme = calloc(1, sizeof(nvme_device_t))) == NULL) {
        return ZX_ERR_NO_MEMORY;
    }
    list_initialize(&nvme->pending_txns);
    list_initialize(&nvme->active_txns);
    mtx_init(&nvme->lock, mtx_plain);
    mtx_init(&nvme->admin_lock, mtx_plain);

    if (device_get_protocol(dev, ZX_PROTOCOL_PCI, &nvme->pci)) {
        goto fail;
    }

    if (pci_map_bar(&nvme->pci, 0u, ZX_CACHE_POLICY_UNCACHED_DEVICE,
                    &nvme->io, &nvme->iosz, &nvme->ioh)) {
        zxlogf(ERROR, "nvme: cannot map registers\n");
        goto fail;
    }

    uint32_t modes[3] = {
        ZX_PCIE_IRQ_MODE_MSI_X, ZX_PCIE_IRQ_MODE_MSI, ZX_PCIE_IRQ_MODE_LEGACY,
    };
    uint32_t nirq = 0;
    for (unsigned n = 0; n < countof(modes); n++) {
        if ((pci_query_irq_mode(&nvme->pci, modes[n], &nirq) == ZX_OK) &&
            (pci_set_irq_mode(&nvme->pci, modes[n], 1) == ZX_OK)) {
            zxlogf(INFO, "nvme: irq mode %u, irq count %u (#%u)\n", modes[n], nirq, n);
            goto irq_configured;
        }
    }
    zxlogf(ERROR, "nvme: could not configure irqs\n");
    goto fail;

irq_configured:
    if (pci_map_interrupt(&nvme->pci, 0, &nvme->irqh) != ZX_OK) {
        zxlogf(ERROR, "nvme: could not map irq\n");
        goto fail;
    }
    if (pci_enable_bus_master(&nvme->pci, true)) {
        zxlogf(ERROR, "nvme: cannot enable bus mastering\n");
        goto fail;
    }
    if (pci_get_bti(&nvme->pci, 0, &nvme->bti) != ZX_OK) {
        zxlogf(ERROR, "nvme: cannot obtain bti handle\n");
        goto fail;
    }

    device_add_args_t args = {
        .version = DEVICE_ADD_ARGS_VERSION,
        .name = "nvme",
        .ctx = nvme,
        .ops = &device_ops,
        .flags = DEVICE_ADD_INVISIBLE,
        .proto_id = ZX_PROTOCOL_BLOCK_IMPL,
        .proto_ops = &block_ops,
    };

    if (device_add(dev, &args, &nvme->zxdev)) {
        goto fail;
    }

    if (nvme_init(nvme) != ZX_OK) {
        zxlogf(ERROR, "nvme: init failed\n");
        device_remove(nvme->zxdev);
        return ZX_ERR_INTERNAL;
    }

    return ZX_OK;

fail:
    nvme_release(nvme);
    return ZX_ERR_NOT_SUPPORTED;
}

static zx_driver_ops_t driver_ops = {
    .version = DRIVER_OPS_VERSION,
    .bind = nvme_bind,
};

ZIRCON_DRIVER_BEGIN(nvme, driver_ops, "zircon", "0.1", 4)
    BI_ABORT_IF(NE, BIND_PROTOCOL, ZX_PROTOCOL_PCI),
    BI_ABORT_IF(NE, BIND_PCI_CLASS, 1), // Mass Storage
    BI_ABORT_IF(NE, BIND_PCI_SUBCLASS, 8), // NVM
    BI_MATCH_IF(EQ, BIND_PCI_INTERFACE, 2), // NVMHCI
ZIRCON_DRIVER_END(nvme)