/*
 * Copyright 2019-2022, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */


#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <smp.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

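// Upper bound on the qpairs we will allocate; we additionally never use more
// than one qpair per CPU (see the allocation logic in nvme_disk_init_device()).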
#define NVME_MAX_QPAIRS					(16)


static device_manager_info* sDeviceManager;

typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;

	struct nvme_ns*			ns;
	uint64					capacity;
	uint32					block_size;
	uint32					max_io_blocks;
	status_t				media_status;

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;

	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;
	int32					polling;
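		// > 0: interrupts are not working, we are in polling mode;
		// 0: interrupts assumed working but none observed yet;
		// -1: an interrupt has actually arrived.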

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;


static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
	geometry->bytes_per_physical_sector = info->block_size;

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
		info, capacity, blockSize);

	info->capacity = capacity;
	info->block_size = blockSize;
}


//	#pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	ASSERT(info->ctrlr == NULL);

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}
	TRACE_ALWAYS("namespace 0\n");

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n",
		nsstat.sector_size, info->ns->stripe_size);
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint32 irq = info->info.u.h0.interrupt_line;
	if (irq == 0xFF)
		irq = 0;

	if (pci->get_msix_count(pcidev)) {
		uint32 msixVector = 0;
		if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK
			&& pci->enable_msix(pcidev) == B_OK) {
			TRACE_ALWAYS("using MSI-X\n");
			irq = msixVector;
		}
	} else if (pci->get_msi_count(pcidev) >= 1) {
		uint32 msiVector = 0;
		if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK
			&& pci->enable_msi(pcidev) == B_OK) {
			TRACE_ALWAYS("using message signaled interrupts\n");
			irq = msiVector;
		}
	}

	if (irq == 0) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		info->polling = 1;
	} else {
		info->polling = 0;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
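		// The feature value encodes the aggregation time in 100-microsecond
		// increments (bits 15:8) and the aggregation threshold in completions
		// (bits 7:0).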
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	// allocate qpairs
	uint32 try_qpairs = cstat.io_qpairs;
	try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS);
	if (try_qpairs >= (uint32)smp_get_num_cpus()) {
		try_qpairs = smp_get_num_cpus();
	} else {
		// Find the highest number of qpairs that evenly divides the number of CPUs.
		while ((smp_get_num_cpus() % try_qpairs) != 0)
			try_qpairs--;
	}
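	// (For example: with 8 CPUs but only 6 qpairs offered, this settles on
	// 4 qpairs, so every qpair serves exactly two CPUs.)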
	info->qpair_count = 0;
	for (uint32 i = 0; i < try_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}
	if (info->qpair_count != try_qpairs) {
		TRACE_ALWAYS("warning: did not get expected number of qpairs\n");
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;
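		// i.e. enough for two bounced transfers in flight per qpair.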

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be aligned
		// only on 32-bits, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	*_cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	info->polling = -1;
	return 0;
}


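// Hand out qpairs based on the current CPU: each CPU consistently maps to
// the same qpair, keeping contention on any single qpair low.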
static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[smp_get_current_cpu() % info->qpair_count];
}


static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

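	// We register with the condition variable before each poll, so that a
	// completion interrupt arriving between the poll and the wait below
	// still wakes us instead of being lost.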
	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (info->polling > 0) {
			entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000,
				(1 << timeouts) * 1000));
			timeouts++;
		} else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled, or maybe cannot
			// generate interrupts at all.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}

			info->polling++;
			if (info->polling > 0) {
				TRACE_ALWAYS("switching to polling mode, performance will be affected!\n");
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


struct nvme_io_request {
	status_t status;

	bool write;

	off_t lba_start;
	size_t lba_count;

	physical_entry* iovecs;
	int32 iovec_count;

	int32 iovec_i;
	uint32 iovec_offset;
};


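// libnvme walks our scatter-gather list through these two callbacks:
// ior_reset_sgl() repositions the iterator at a byte offset into the request,
// and ior_next_sge() then hands back one physical segment at a time.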
static void
ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


static int
ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	if (index < 0 || index >= request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	TRACE("IOV %" B_PRId32 " (+ %" B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

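	// Bounced writes may be read-modify-write cycles on partial blocks, so
	// take the rounded-write lock exclusively; direct writes acquire it
	// shared (see nvme_disk_io()) and are thus held off while we work.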
	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			operation.SetStatus(status,
				status == B_OK ? operation.Length() : 0);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		request->OperationFinished(&operation);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if ((request->Offset() + (off_t)request->Length()) > ns_end)
		return ERANGE;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

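		// Length / B_PAGE_SIZE + 2 entries suffice for one contiguous virtual
		// run; buffers with many vecs may need more, which the overflow path
		// below handles by reallocating.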
		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				vtophys_length *= 2;
				physical_entry* newVtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				if (newVtophys == NULL) {
					// The old array is still owned by the deleter and will
					// be freed on the error return below.
					status = B_NO_MEMORY;
				} else {
					vtophysDeleter.Detach();
					nvme_request.iovecs = vtophys = newVtophys;
					vtophysDeleter.SetTo(vtophys);

					// Try again, with the larger buffer this time.
					i--;
					continue;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vecs.
	if (nvme_request.iovec_count > 1) {
		// There are middle vecs, so the first and last vecs have different restrictions: they
		// need only be a multiple of the block size, and must end and start on a page boundary,
		// respectively, though the start address must always be 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	} else {
		// There is only one vec. Check that it is a multiple of the block size,
		// and that its address is 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

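	// Submit the transfer in chunks the controller can accept: at most
	// NVME_MAX_SGL_DESCRIPTORS / 2 segments, and no more than max_io_blocks
	// blocks, per request.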
	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			uint32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	CALLED();
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


static status_t
nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData)
{
	CALLED();
	trimData->trimmed_size = 0;

	const off_t deviceSize = info->capacity * info->block_size; // in bytes
	if (deviceSize < 0)
		return B_BAD_VALUE;

	STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64));
	ASSERT(deviceSize >= 0);

	// Do not trim past device end.
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64& size = trimData->ranges[i].size;

		if (offset >= (uint64)deviceSize)
			return B_BAD_VALUE;
		size = std::min(size, (uint64)deviceSize - offset);
	}

	// We need contiguous memory for the DSM ranges.
	nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node(
		trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL);
	if (dsmRanges == NULL)
		return B_NO_MEMORY;
	CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges);

	// Ranges that shrink to nothing after rounding are skipped, so we may
	// end up submitting fewer DSM ranges than range_count.
	uint32 dsmRangeCount = 0;
	uint64 trimmingSize = 0;
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64 length = trimData->ranges[i].size;

		// Round the offset up and the length down to the block size.
		// (Some space at the beginning and end may thus not be trimmed.)
		offset = ROUNDUP(offset, info->block_size);
		const uint64 rounding = offset - trimData->ranges[i].offset;
		if (length <= rounding)
			continue;
		length -= rounding;
		length = ROUNDDOWN(length, info->block_size);

		if (length == 0)
			continue;
		if ((length / info->block_size) > UINT32_MAX)
			length = uint64(UINT32_MAX) * info->block_size;
			// TODO: Break into smaller trim ranges!

		TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset);

		dsmRanges[dsmRangeCount].attributes = 0;
		dsmRanges[dsmRangeCount].length = length / info->block_size;
		dsmRanges[dsmRangeCount].starting_lba = offset / info->block_size;
		dsmRangeCount++;

		trimmingSize += length;
	}
	if (dsmRangeCount == 0)
		return B_OK;

	status_t status = EINPROGRESS;
	qpair_info* qpair = get_qpair(info);
	if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, dsmRangeCount,
			(nvme_cmd_cb)io_finished_callback, &status) != 0)
		return B_IO_ERROR;

	await_status(info, qpair->qpair, status);
	if (status != B_OK)
		return status;

	trimData->trimmed_size = trimmingSize;
	return B_OK;
}


static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			return user_memcpy(buffer, &info->media_status, sizeof(status_t));
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL || length > sizeof(device_geometry))
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, length);
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);

		case B_TRIM_DEVICE:
			ASSERT(IS_KERNEL_ADDRESS(buffer));
			return nvme_disk_trim(info, (fs_trim_data*)buffer);
	}

	return B_DEV_INVALID_IOCTL;
}


//	#pragma mark - driver module API


static float
nvme_disk_supports_device(device_node *parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	delete info;
}


static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
		id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


//	#pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};