// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe ZNS-ZBD command implementation.
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/nvme.h>
#include <linux/blkdev.h>
#include "nvmet.h"

/*
 * We set the Memory Page Size Minimum (MPSMIN) for the target controller to
 * 0, to which nvme_enable_ctrl() adds 12, resulting in a page_shift value of
 * 12, i.e. a minimum memory page size of 2^12 = 4k. Use the same shift of 12
 * when calculating the ZASL.
 */
#define NVMET_MPSMIN_SHIFT	12

static inline u8 nvmet_zasl(unsigned int zone_append_sects)
{
	/*
	 * Zone Append Size Limit (zasl) is expressed as a power of 2 value
	 * with the minimum memory page size (i.e. 2^12 = 4k) as the unit.
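	 * For example, a device limit of 1024 512-byte sectors (512k) gives
	 * ilog2(1024 >> 3) = 7, which the host decodes as 2^7 * 4k = 512k.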
	 */
	return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9));
}

static int validate_conv_zones_cb(struct blk_zone *z,
				  unsigned int i, void *data)
{
	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return -EOPNOTSUPP;
	return 0;
}

bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
{
	u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev));
	struct gendisk *bd_disk = ns->bdev->bd_disk;
	int ret;

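	/*
	 * The zone append size limit is a subsystem (controller) wide value.
	 * Refuse a namespace whose limit is smaller than the value already
	 * recorded for the subsystem, otherwise adopt this namespace's limit.
	 */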
	if (ns->subsys->zasl) {
		if (ns->subsys->zasl > zasl)
			return false;
	}
	ns->subsys->zasl = zasl;

	/*
	 * Generic zoned block devices may have a smaller last zone which is
	 * not supported by ZNS. Exclude zoned drives that have such a
	 * smaller last zone.
	 */
	if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
		return false;
	/*
	 * ZNS does not define a conventional zone type. If the underlying
	 * device has a bitmap set indicating the existence of conventional
	 * zones, reject the device. Otherwise, use report zones to detect if
	 * the device has conventional zones.
	 */
	if (ns->bdev->bd_disk->conv_zones_bitmap)
		return false;

	ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
				  validate_conv_zones_cb, NULL);
	if (ret < 0)
		return false;

	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));

	return true;
}

void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req)
{
	u8 zasl = req->sq->ctrl->subsys->zasl;
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	struct nvme_id_ctrl_zns *id;
	u16 status;

	id = kzalloc(sizeof(*id), GFP_KERNEL);
	if (!id) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

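	/* Never advertise a ZASL larger than the transport's MDTS. */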
	if (ctrl->ops->get_mdts)
		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
	else
		id->zasl = zasl;

	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));

	kfree(id);
out:
	nvmet_req_complete(req, status);
}

void nvmet_execute_identify_ns_zns(struct nvmet_req *req)
{
	struct nvme_id_ns_zns *id_zns = NULL;
	u64 zsze;
	u16 status;
	u32 mar, mor;

	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
		req->error_loc = offsetof(struct nvme_identify, nsid);
		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
		goto out;
	}

	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
	if (!id_zns) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	status = nvmet_req_find_ns(req);
	if (status)
		goto done;

	if (nvmet_ns_revalidate(req->ns)) {
		mutex_lock(&req->ns->subsys->lock);
		nvmet_ns_changed(req->ns->subsys, req->ns->nsid);
		mutex_unlock(&req->ns->subsys->lock);
	}

	if (!bdev_is_zoned(req->ns->bdev)) {
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		req->error_loc = offsetof(struct nvme_identify, nsid);
		goto out;
	}

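	/* ZSZE is reported in logical blocks, not in 512-byte sectors. */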
	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
					req->ns->blksize_shift;
	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);

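	/*
	 * The block layer reports 0 when there is no open/active zone limit,
	 * which ZNS expresses as 0xffffffff; otherwise convert the limit to
	 * the 0's based MOR/MAR values.
	 */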
	mor = bdev_max_open_zones(req->ns->bdev);
	if (!mor)
		mor = U32_MAX;
	else
		mor--;
	id_zns->mor = cpu_to_le32(mor);

	mar = bdev_max_active_zones(req->ns->bdev);
	if (!mar)
		mar = U32_MAX;
	else
		mar--;
	id_zns->mar = cpu_to_le32(mar);

done:
	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
out:
	kfree(id_zns);
	nvmet_req_complete(req, status);
}

static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
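	/* NUMD is a 0's based dword count, hence the +1 and the << 2. */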
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba);
		return NVME_SC_LBA_RANGE | NVME_SC_DNR;
	}

	if (out_bufsize < sizeof(struct nvme_zone_report)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	switch (req->cmd->zmr.pr) {
	case 0:
	case 1:
		break;
	default:
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	switch (req->cmd->zmr.zrasf) {
	case NVME_ZRASF_ZONE_REPORT_ALL:
	case NVME_ZRASF_ZONE_STATE_EMPTY:
	case NVME_ZRASF_ZONE_STATE_IMP_OPEN:
	case NVME_ZRASF_ZONE_STATE_EXP_OPEN:
	case NVME_ZRASF_ZONE_STATE_CLOSED:
	case NVME_ZRASF_ZONE_STATE_FULL:
	case NVME_ZRASF_ZONE_STATE_READONLY:
	case NVME_ZRASF_ZONE_STATE_OFFLINE:
		break;
	default:
		req->error_loc =
			offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	return NVME_SC_SUCCESS;
}

struct nvmet_report_zone_data {
	struct nvmet_req *req;
	u64 out_buf_offset;
	u64 out_nr_zones;
	u64 nr_zones;
	u8 zrasf;
};

static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d)
{
	static const unsigned int nvme_zrasf_to_blk_zcond[] = {
		[NVME_ZRASF_ZONE_STATE_EMPTY]	 = BLK_ZONE_COND_EMPTY,
		[NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN,
		[NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN,
		[NVME_ZRASF_ZONE_STATE_CLOSED]	 = BLK_ZONE_COND_CLOSED,
		[NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY,
		[NVME_ZRASF_ZONE_STATE_FULL]	 = BLK_ZONE_COND_FULL,
		[NVME_ZRASF_ZONE_STATE_OFFLINE]	 = BLK_ZONE_COND_OFFLINE,
	};
	struct nvmet_report_zone_data *rz = d;

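	/* Skip zones whose condition does not match the requested filter. */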
	if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL &&
	    z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf])
		return 0;

	if (rz->nr_zones < rz->out_nr_zones) {
		struct nvme_zone_descriptor zdesc = { };
		u16 status;

		zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity);
		zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start);
		zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp);
		zdesc.za = z->reset ? 1 << 2 : 0;
		zdesc.zs = z->cond << 4;
		zdesc.zt = z->type;

		status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc,
					   sizeof(zdesc));
		if (status)
			return -EINVAL;

		rz->out_buf_offset += sizeof(zdesc);
	}

	rz->nr_zones++;

	return 0;
}

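/*
 * Number of zones from the zone containing the requested SLBA up to the end
 * of the device.
 */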
static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);

	return bdev_nr_zones(req->ns->bdev) - bdev_zone_no(req->ns->bdev, sect);
}

static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize)
{
	if (bufsize <= sizeof(struct nvme_zone_report))
		return 0;

	return (bufsize - sizeof(struct nvme_zone_report)) /
		sizeof(struct nvme_zone_descriptor);
}

static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
	unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req);
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
	__le64 nr_zones;
	u16 status;
	int ret;
	struct nvmet_report_zone_data rz_data = {
		.out_nr_zones = get_nr_zones_from_buf(req, out_bufsize),
		/* leave room for the report zone header */
		.out_buf_offset = sizeof(struct nvme_zone_report),
		.zrasf = req->cmd->zmr.zrasf,
		.nr_zones = 0,
		.req = req,
	};

	status = nvmet_bdev_validate_zone_mgmt_recv(req);
	if (status)
		goto out;

	if (!req_slba_nr_zones) {
		status = NVME_SC_SUCCESS;
		goto out;
	}

	ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones,
				 nvmet_bdev_report_zone_cb, &rz_data);
	if (ret < 0) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	/*
	 * When the partial bit is set, nr_zones must indicate the number of
	 * zone descriptors actually transferred.
	 */
	if (req->cmd->zmr.pr)
		rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones);

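	/*
	 * Write the nr_zones field of the report header at the start of the
	 * output buffer.
	 */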
	nr_zones = cpu_to_le64(rz_data.nr_zones);
	status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones));

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}

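/*
 * Map a Zone Send Action to the corresponding block layer zone management
 * operation. REQ_OP_LAST indicates an unsupported action.
 */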
static inline enum req_op zsa_req_op(u8 zsa)
{
	switch (zsa) {
	case NVME_ZONE_OPEN:
		return REQ_OP_ZONE_OPEN;
	case NVME_ZONE_CLOSE:
		return REQ_OP_ZONE_CLOSE;
	case NVME_ZONE_FINISH:
		return REQ_OP_ZONE_FINISH;
	case NVME_ZONE_RESET:
		return REQ_OP_ZONE_RESET;
	default:
		return REQ_OP_LAST;
	}
}

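/* Translate a block layer zone management result into an NVMe status code. */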
static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret)
{
	switch (ret) {
	case 0:
		return NVME_SC_SUCCESS;
	case -EINVAL:
	case -EIO:
		return NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR;
	default:
		return NVME_SC_INTERNAL;
	}
}

struct nvmet_zone_mgmt_send_all_data {
	unsigned long *zbitmap;
	struct nvmet_req *req;
};

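/*
 * Mark a zone in the bitmap if its current condition allows the requested
 * open, close or finish transition; other zones are skipped.
 */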
static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d)
{
	struct nvmet_zone_mgmt_send_all_data *data = d;

	switch (zsa_req_op(data->req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_OPEN:
		switch (z->cond) {
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_CLOSE:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_FINISH:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	default:
		return -EINVAL;
	}

	set_bit(i, data->zbitmap);

	return 0;
}

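/*
 * Open, Close and Finish with the Select All bit set have no direct block
 * layer equivalent: report all zones, record the eligible ones in a bitmap
 * and chain one zone management bio per eligible zone.
 */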
static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
{
	struct block_device *bdev = req->ns->bdev;
	unsigned int nr_zones = bdev_nr_zones(bdev);
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;
	struct nvmet_zone_mgmt_send_all_data d = {
		.req = req,
	};

	d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)),
				 GFP_NOIO, bdev->bd_disk->node_id);
	if (!d.zbitmap) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scan and build bitmap of the eligible zones */
	ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d);
	if (ret != nr_zones) {
		if (ret > 0)
			ret = -EIO;
		goto out;
	} else {
		/* We scanned all the zones */
		ret = 0;
	}

	while (sector < bdev_nr_sectors(bdev)) {
		if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) {
			bio = blk_next_bio(bio, bdev, 0,
				zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC,
				GFP_KERNEL);
			bio->bi_iter.bi_sector = sector;
			/* This may take a while, so be nice to others */
			cond_resched();
		}
		sector += bdev_zone_sectors(bdev);
	}

	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out:
	kfree(d.zbitmap);

	return blkdev_zone_mgmt_errno_to_nvme_status(ret);
}

static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
{
	int ret;

	switch (zsa_req_op(req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_RESET:
		ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
				       get_capacity(req->ns->bdev->bd_disk));
		if (ret < 0)
			return blkdev_zone_mgmt_errno_to_nvme_status(ret);
		break;
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		return nvmet_bdev_zone_mgmt_emulate_all(req);
	default:
		/* this is needed to quiet a compiler warning */
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	return NVME_SC_SUCCESS;
}

static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
	enum req_op op = zsa_req_op(req->cmd->zms.zsa);
	struct block_device *bdev = req->ns->bdev;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	u16 status = NVME_SC_SUCCESS;
	int ret;

	if (op == REQ_OP_LAST) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR;
		goto out;
	}

	/* when the select all bit is set, the slba field is ignored */
	if (req->cmd->zms.select_all) {
		status = nvmet_bdev_execute_zmgmt_send_all(req);
		goto out;
	}

	if (sect >= get_capacity(bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
		goto out;
	}

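	/* The starting LBA must be the first logical block of a zone. */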
	if (sect & (zone_sectors - 1)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		goto out;
	}

	ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors);
	if (ret < 0)
		status = blkdev_zone_mgmt_errno_to_nvme_status(ret);

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}

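/*
 * On success, return the first LBA that was written (the zone append result)
 * to the host in the completion queue entry.
 */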
static void nvmet_bdev_zone_append_bio_done(struct bio *bio)
{
	struct nvmet_req *req = bio->bi_private;

	if (bio->bi_status == BLK_STS_OK) {
		req->cqe->result.u64 =
			nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector);
	}

	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
	nvmet_req_bio_put(req, bio);
}

void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
	const blk_opf_t opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
	u16 status = NVME_SC_SUCCESS;
	unsigned int total_len = 0;
	struct scatterlist *sg;
	struct bio *bio;
	int sg_cnt;

	/* Request is completed on len mismatch in nvmet_check_transfer_len() */
	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
		return;

	if (!req->sg_cnt) {
		nvmet_req_complete(req, 0);
		return;
	}

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
		goto out;
	}

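	/* A zone append must target the zone start LBA (ZSLBA). */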
	if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		goto out;
	}

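	/* Use the request's inline bio and bvecs when the transfer fits. */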
	if (nvmet_use_inline_bvec(req)) {
		bio = &req->z.inline_bio;
		bio_init(bio, req->ns->bdev, req->inline_bvec,
			 ARRAY_SIZE(req->inline_bvec), opf);
	} else {
		bio = bio_alloc(req->ns->bdev, req->sg_cnt, opf, GFP_KERNEL);
	}

	bio->bi_end_io = nvmet_bdev_zone_append_bio_done;
	bio->bi_iter.bi_sector = sect;
	bio->bi_private = req;
	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
		bio->bi_opf |= REQ_FUA;

	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
		struct page *p = sg_page(sg);
		unsigned int l = sg->length;
		unsigned int o = sg->offset;
		unsigned int ret;

		ret = bio_add_zone_append_page(bio, p, l, o);
		if (ret != sg->length) {
			status = NVME_SC_INTERNAL;
			goto out_put_bio;
		}
		total_len += sg->length;
	}

	if (total_len != nvmet_rw_data_len(req)) {
		status = NVME_SC_INTERNAL | NVME_SC_DNR;
		goto out_put_bio;
	}

	submit_bio(bio);
	return;

out_put_bio:
	nvmet_req_bio_put(req, bio);
out:
	nvmet_req_complete(req, status);
}

u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req)
{
	struct nvme_command *cmd = req->cmd;

	switch (cmd->common.opcode) {
	case nvme_cmd_zone_append:
		req->execute = nvmet_bdev_execute_zone_append;
		return 0;
	case nvme_cmd_zone_mgmt_recv:
		req->execute = nvmet_bdev_execute_zone_mgmt_recv;
		return 0;
	case nvme_cmd_zone_mgmt_send:
		req->execute = nvmet_bdev_execute_zone_mgmt_send;
		return 0;
	default:
		return nvmet_bdev_parse_io_cmd(req);
	}
}