/* ctl_backend_block.c revision 288787 */
1/*-
2 * Copyright (c) 2003 Silicon Graphics International Corp.
3 * Copyright (c) 2009-2011 Spectra Logic Corporation
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Edward Tomasz Napierala
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions, and the following disclaimer,
15 *    without modification.
16 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
17 *    substantially similar to the "NO WARRANTY" disclaimer below
18 *    ("Disclaimer") and any redistribution must be conditioned upon
19 *    including a substantially similar Disclaimer requirement for further
20 *    binary redistribution.
21 *
22 * NO WARRANTY
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
32 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGES.
34 *
35 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
36 */
37/*
38 * CAM Target Layer driver backend for block devices.
39 *
40 * Author: Ken Merry <ken@FreeBSD.org>
41 */
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: stable/10/sys/cam/ctl/ctl_backend_block.c 288787 2015-10-05 10:51:24Z mav $");
44
45#include <opt_kdtrace.h>
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kernel.h>
50#include <sys/types.h>
51#include <sys/kthread.h>
52#include <sys/bio.h>
53#include <sys/fcntl.h>
54#include <sys/limits.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/condvar.h>
58#include <sys/malloc.h>
59#include <sys/conf.h>
60#include <sys/ioccom.h>
61#include <sys/queue.h>
62#include <sys/sbuf.h>
63#include <sys/endian.h>
64#include <sys/uio.h>
65#include <sys/buf.h>
66#include <sys/taskqueue.h>
67#include <sys/vnode.h>
68#include <sys/namei.h>
69#include <sys/mount.h>
70#include <sys/disk.h>
71#include <sys/fcntl.h>
72#include <sys/filedesc.h>
73#include <sys/filio.h>
74#include <sys/proc.h>
75#include <sys/pcpu.h>
76#include <sys/module.h>
77#include <sys/sdt.h>
78#include <sys/devicestat.h>
79#include <sys/sysctl.h>
80
81#include <geom/geom.h>
82
83#include <cam/cam.h>
84#include <cam/scsi/scsi_all.h>
85#include <cam/scsi/scsi_da.h>
86#include <cam/ctl/ctl_io.h>
87#include <cam/ctl/ctl.h>
88#include <cam/ctl/ctl_backend.h>
89#include <cam/ctl/ctl_ioctl.h>
90#include <cam/ctl/ctl_ha.h>
91#include <cam/ctl/ctl_scsi_all.h>
92#include <cam/ctl/ctl_private.h>
93#include <cam/ctl/ctl_error.h>
94
/*
 * The idea here is that we'll allocate enough S/G space to hold a 1MB
 * I/O.  If we get an I/O larger than that, we'll split it.
 */
#define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
#define	CTLBLK_MAX_IO_SIZE	(CTLBLK_HALF_IO_SIZE * 2)
/* Each S/G segment is at most one MAXPHYS-sized transfer. */
#define	CTLBLK_MAX_SEG		MAXPHYS
#define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
#define	CTLBLK_MAX_SEGS		(CTLBLK_HALF_SEGS * 2)

/* Debug printf, compiled in only when CTLBLK_DEBUG is defined. */
#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/* Accessors for the backend-private and LBA/length areas of a ctl_io. */
#define PRIV(io)	\
    ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define ARGS(io)	\
    ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])

/* DTrace provider for this backend's static probes. */
SDT_PROVIDER_DEFINE(cbb);
118
/*
 * Configuration-state flags for a backend block LUN.  The values are
 * bits and may be combined.
 */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
} ctl_be_block_lun_flags;
124
/* Kind of backing store behind a LUN: none yet, a device node, or a file. */
typedef enum {
	CTL_BE_BLOCK_NONE,
	CTL_BE_BLOCK_DEV,
	CTL_BE_BLOCK_FILE
} ctl_be_block_type;
130
/* Backend data specific to file-backed LUNs. */
struct ctl_be_block_filedata {
	struct ucred *cred;	/* credentials passed to VOP_READ/VOP_WRITE */
};
134
/* Per-backing-type private data; only the file case needs any today. */
union ctl_be_block_bedata {
	struct ctl_be_block_filedata file;
};
138
struct ctl_be_block_io;
struct ctl_be_block_lun;

/* I/O dispatch handler: execute one beio against the given LUN. */
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);
/* Attribute query handler: return the named attribute's value for a LUN. */
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
				  const char *attrname);
146
/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 */
struct ctl_be_block_lun {
	struct ctl_lun_create_params params;	/* creation parameters */
	char lunname[32];
	char *dev_path;				/* path of backing file/device */
	ctl_be_block_type dev_type;		/* file vs. device backing */
	struct vnode *vn;			/* vnode of the backing store */
	union ctl_be_block_bedata backend;	/* per-backing-type private data */
	/* Method table, filled in according to the backing type. */
	cbb_dispatch_t dispatch;		/* read/write dispatch */
	cbb_dispatch_t lun_flush;		/* SYNCHRONIZE CACHE */
	cbb_dispatch_t unmap;			/* UNMAP/TRIM */
	cbb_dispatch_t get_lba_status;		/* GET LBA STATUS */
	cbb_getattr_t getattr;			/* attribute query */
	uma_zone_t lun_zone;			/* zone for S/G segment buffers */
	uint64_t size_blocks;			/* LUN size in blocks */
	uint64_t size_bytes;			/* LUN size in bytes */
	struct ctl_be_block_softc *softc;	/* back-pointer to module softc */
	struct devstat *disk_stats;		/* devstat(9) statistics */
	ctl_be_block_lun_flags flags;		/* configuration state */
	STAILQ_ENTRY(ctl_be_block_lun) links;	/* softc lun_list linkage */
	struct ctl_be_lun cbe_lun;		/* embedded CTL-visible LUN */
	struct taskqueue *io_taskqueue;		/* queue driving io_task */
	struct task io_task;			/* worker task (ctl_be_block_worker) */
	int num_threads;			/* worker threads for this LUN */
	/* Per-LUN I/O queues, protected by queue_lock. */
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
	struct mtx_padalign io_lock;	/* guards devstat and bio counters */
	struct mtx_padalign queue_lock;	/* guards the queues above */
};
181
/*
 * Overall softc structure for the block backend module.
 */
struct ctl_be_block_softc {
	/* presumably guards num_luns/lun_list; confirm in create/rm paths */
	struct mtx			 lock;
	int				 num_luns;	/* configured LUN count */
	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;	/* all LUNs of this backend */
};

/* Single module-wide instance of the backend softc. */
static struct ctl_be_block_softc backend_block_softc;
192
/*
 * Per-I/O information.
 */
struct ctl_be_block_io {
	union ctl_io			*io;		/* associated CTL I/O */
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS]; /* S/G list */
	struct iovec			xiovecs[CTLBLK_MAX_SEGS]; /* iovecs mirroring sg_segs */
	int				bio_cmd;	/* BIO_READ/WRITE/FLUSH/DELETE */
	int				num_segs;	/* valid entries in sg_segs */
	int				num_bios_sent;	/* bios issued to the device */
	int				num_bios_done;	/* bios completed so far */
	int				send_complete;	/* all bios have been issued */
	int				num_errors;	/* bios that returned an error */
	struct bintime			ds_t0;		/* devstat transaction start */
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;		/* total length in bytes */
	uint64_t			io_offset;	/* byte offset into backing store */
	int				io_arg;		/* flush: nonzero selects MNT_NOWAIT */
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
216
extern struct ctl_softc *control_softc;

/* Default worker threads per backing store; tunable and sysctl-settable. */
static int cbb_num_threads = 14;
TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
	    "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
           &cbb_num_threads, 0, "Number of threads per backing file");
225
226static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
227static void ctl_free_beio(struct ctl_be_block_io *beio);
228static void ctl_complete_beio(struct ctl_be_block_io *beio);
229static int ctl_be_block_move_done(union ctl_io *io);
230static void ctl_be_block_biodone(struct bio *bio);
231static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
232				    struct ctl_be_block_io *beio);
233static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
234				       struct ctl_be_block_io *beio);
235static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
236				  struct ctl_be_block_io *beio);
237static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
238					 const char *attrname);
239static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
240				   struct ctl_be_block_io *beio);
241static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
242				   struct ctl_be_block_io *beio);
243static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
244				      struct ctl_be_block_io *beio);
245static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
246					 const char *attrname);
247static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
248				    union ctl_io *io);
249static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
250				    union ctl_io *io);
251static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
252				  union ctl_io *io);
253static void ctl_be_block_worker(void *context, int pending);
254static int ctl_be_block_submit(union ctl_io *io);
255static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
256				   int flag, struct thread *td);
257static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
258				  struct ctl_lun_req *req);
259static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
260				 struct ctl_lun_req *req);
261static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
262static int ctl_be_block_open(struct ctl_be_block_softc *softc,
263			     struct ctl_be_block_lun *be_lun,
264			     struct ctl_lun_req *req);
265static int ctl_be_block_create(struct ctl_be_block_softc *softc,
266			       struct ctl_lun_req *req);
267static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
268			   struct ctl_lun_req *req);
269static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
270			   struct ctl_lun_req *req);
271static void ctl_be_block_lun_shutdown(void *be_lun);
272static void ctl_be_block_lun_config_status(void *be_lun,
273					   ctl_lun_config_status status);
274static int ctl_be_block_config_write(union ctl_io *io);
275static int ctl_be_block_config_read(union ctl_io *io);
276static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
277static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
278int ctl_be_block_init(void);
279
/* Method table registering this backend with the CTL midlayer. */
static struct ctl_backend_driver ctl_be_block_driver =
{
	.name = "block",
	.flags = CTL_BE_FLAG_HAS_CONFIG,
	.init = ctl_be_block_init,
	.data_submit = ctl_be_block_submit,
	.data_move_done = ctl_be_block_move_done,
	.config_read = ctl_be_block_config_read,
	.config_write = ctl_be_block_config_write,
	.ioctl = ctl_be_block_ioctl,
	.lun_info = ctl_be_block_lun_info,
	.lun_attr = ctl_be_block_lun_attr
};

MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);

/* UMA zone from which struct ctl_be_block_io is allocated. */
static uma_zone_t beio_zone;
298
299static struct ctl_be_block_io *
300ctl_alloc_beio(struct ctl_be_block_softc *softc)
301{
302	struct ctl_be_block_io *beio;
303
304	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
305	beio->softc = softc;
306	return (beio);
307}
308
/*
 * Release a beio: free all S/G segment buffers back to the LUN's buffer
 * zone (both parallel halves for COMPARE, which carries two S/G lists),
 * then free the beio itself.  A NULL segment pointer indicates a prior
 * free, which is counted and reported as a likely double free.
 */
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
	int duplicate_free;
	int i;

	duplicate_free = 0;

	for (i = 0; i < beio->num_segs; i++) {
		if (beio->sg_segs[i].addr == NULL)
			duplicate_free++;

		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
		beio->sg_segs[i].addr = NULL;

		/* For compare we had two equal S/G lists. */
		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
			uma_zfree(beio->lun->lun_zone,
			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
		}
	}

	if (duplicate_free > 0) {
		printf("%s: %d duplicate frees out of %d segments\n", __func__,
		       duplicate_free, beio->num_segs);
	}

	uma_zfree(beio_zone, beio);
}
339
340static void
341ctl_complete_beio(struct ctl_be_block_io *beio)
342{
343	union ctl_io *io = beio->io;
344
345	if (beio->beio_cont != NULL) {
346		beio->beio_cont(beio);
347	} else {
348		ctl_free_beio(beio);
349		ctl_data_submit_done(io);
350	}
351}
352
/*
 * Compare two byte buffers of the given size.
 *
 * Returns the index of the first differing byte, or 'size' if the
 * buffers are identical.  Used by ctl_be_block_compare() to report the
 * exact offset of a COMPARE miscompare.  Neither buffer is modified,
 * hence the const qualifiers (callers passing non-const pointers are
 * unaffected).
 */
static size_t
cmp(const uint8_t *a, const uint8_t *b, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++) {
		if (a[i] != b[i])
			break;
	}
	return (i);
}
364
/*
 * Finish a COMPARE command by comparing the two parallel S/G data
 * halves (entries i and i + CTLBLK_HALF_SEGS).  On a mismatch, set
 * MISCOMPARE sense with the byte offset of the first difference in the
 * INFORMATION field; otherwise report success.
 */
static void
ctl_be_block_compare(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	uint64_t off, res;
	int i;
	uint8_t info[8];

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	off = 0;
	for (i = 0; i < beio->num_segs; i++) {
		res = cmp(beio->sg_segs[i].addr,
		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
		    beio->sg_segs[i].len);
		off += res;
		if (res < beio->sg_segs[i].len)
			break;
	}
	if (i < beio->num_segs) {
		/* Miscompare: 'off' is the offset of the first difference. */
		scsi_u64to8b(off, info);
		ctl_set_sense(&io->scsiio, /*current_error*/ 1,
		    /*sense_key*/ SSD_KEY_MISCOMPARE,
		    /*asc*/ 0x1D, /*ascq*/ 0x00,
		    /*type*/ SSD_ELEM_INFO,
		    /*size*/ sizeof(info), /*data*/ &info,
		    /*type*/ SSD_ELEM_NONE);
	} else
		ctl_set_success(&io->scsiio);
}
394
/*
 * Called by CTL when a DMA (datamove) to/from the frontend finishes.
 * Sets final status for reads and for transfers that failed at the
 * port, completes finished I/Os, and requeues successful writes to the
 * worker task queue for backend execution (this routine may run in
 * interrupt context, where we cannot block).
 */
static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lba_len_flags *lbalen;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	/* Account the just-finished DMA interval. */
	getbintime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
	io->io_hdr.num_dmas++;
#endif
	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
		;
	} else if ((io->io_hdr.port_status == 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		lbalen = ARGS(beio->io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_set_success(&io->scsiio);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	} else if ((io->io_hdr.port_status != 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/*
		 * For hardware error sense keys, the sense key
		 * specific value is defined to be a retry count,
		 * but we use it to pass back an internal FETD
		 * error code.  XXX KDM  Hopefully the FETD is only
		 * using 16 bits for an error code, since that's
		 * all the space we have in the sks field.
		 */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/
					 io->io_hdr.port_status);
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->queue_lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}
481
/*
 * Completion handler for bios issued to a device-backed LUN.  Counts
 * errors and completions under io_lock; when the last outstanding bio
 * of the beio finishes, ends the devstat transaction and either fails
 * the I/O (mapping errno to SCSI sense), completes it, or starts the
 * datamove for reads.
 */
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	if (error != 0)
		beio->num_errors++;

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	if (beio->num_errors > 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there is a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)
	 || (beio->bio_cmd == BIO_DELETE)
	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
574
/*
 * Execute a cache flush against a file-backed LUN by fsync()ing the
 * backing vnode.  A nonzero io_arg requests an asynchronous flush
 * (MNT_NOWAIT).  A shared vnode lock is used when the filesystem
 * permits shared writes.
 */
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	if (MNT_SHARED_WRITES(mountpoint)
	 || ((mountpoint == NULL)
	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_lock(be_lun->vn, lock_flags | LK_RETRY);

	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
	    curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there is a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}
624
/* DTrace probes bracketing file-backed read and write dispatch. */
SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, file_done,"uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t");
629
/*
 * Execute a read or write against a file-backed LUN via VOP_READ or
 * VOP_WRITE on the backing vnode.  DPO maps to IO_DIRECT; FUA on
 * writes maps to IO_SYNC.  Short reads (EOF) zero-fill the remainder
 * of the buffer.  On success, writes/verifies complete here while
 * reads continue into ctl_datamove().
 */
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	/* Build a kernel-space uio over the beio's S/G segments. */
	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should clean the rest of the buffer.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		if (MNT_SHARED_WRITES(mountpoint)
		 || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;

		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
        }

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
796
/*
 * Implement GET LBA STATUS for a file-backed LUN using the
 * FIOSEEKHOLE/FIOSEEKDATA ioctls to determine whether the range
 * starting at the requested LBA is mapped or deallocated, and for how
 * long.  If neither probe advances past the start, the range is
 * reported as mapped up to the end of the LUN.
 */
static void
ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, status;

	DPRINTF("entered\n");

	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
	    0, curthread->td_ucred, curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
		    0, curthread->td_ucred, curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	VOP_UNLOCK(be_lun->vn, 0);

	/* Fill in the single LBA status descriptor (length capped at 32 bits). */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
835
836static uint64_t
837ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
838{
839	struct vattr		vattr;
840	struct statfs		statfs;
841	uint64_t		val;
842	int			error;
843
844	val = UINT64_MAX;
845	if (be_lun->vn == NULL)
846		return (val);
847	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
848	if (strcmp(attrname, "blocksused") == 0) {
849		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
850		if (error == 0)
851			val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
852	}
853	if (strcmp(attrname, "blocksavail") == 0 &&
854	    (be_lun->vn->v_iflag & VI_DOOMED) == 0) {
855		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
856		if (error == 0)
857			val = statfs.f_bavail * statfs.f_bsize /
858			    be_lun->cbe_lun.blocksize;
859	}
860	VOP_UNLOCK(be_lun->vn, 0);
861	return (val);
862}
863
/*
 * Execute a read or write against a ZVOL-backed LUN by calling the
 * character device's d_read/d_write entry points directly with a
 * kernel uio.  DPO maps to IO_DIRECT; FUA on writes maps to IO_SYNC.
 * If the device has gone away, the I/O fails with ENXIO.
 */
static void
ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct uio xuio;
	struct iovec *xiovec;
	int error, flags, i, ref;

	DPRINTF("entered\n");

	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	/* Build a kernel-space uio over the beio's S/G segments. */
	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/* Hold a thread reference on the cdev across the call. */
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		if (beio->bio_cmd == BIO_READ)
			error = csw->d_read(dev, &xuio, flags);
		else
			error = csw->d_write(dev, &xuio, flags);
		dev_relthread(dev, ref);
	} else
		error = ENXIO;

	if (beio->bio_cmd == BIO_READ)
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
	else
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
967
/*
 * Implement GET LBA STATUS for a ZVOL-backed LUN via the cdev's
 * d_ioctl entry point using FIOSEEKHOLE/FIOSEEKDATA, analogous to
 * ctl_be_block_gls_file().  If the device is gone or neither probe
 * advances past the start, report mapped up to the end of the LUN.
 */
static void
ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, ref, status;

	DPRINTF("entered\n");

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL) {
		status = 0;	/* unknown up to the end */
		off = be_lun->size_bytes;
		goto done;
	}
	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
	    curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
		    curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	dev_relthread(dev, ref);

done:
	/* Fill in the single LBA status descriptor (length capped at 32 bits). */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
1014
/*
 * Issue a single BIO_FLUSH to the backing character device to implement
 * SYNCHRONIZE CACHE.  Completion is reported asynchronously through
 * ctl_be_block_biodone(); if the device has disappeared, the bio is
 * failed immediately with ENXIO through the same path.
 */
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	io = beio->io;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	/* Start devstat accounting before handing the bio to the device. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		bio->bio_dev = dev;
		csw->d_strategy(bio);
		dev_relthread(dev, ref);
	} else {
		/* Device went away; complete the bio with an error. */
		bio->bio_error = ENXIO;
		ctl_be_block_biodone(bio);
	}
}
1062
/*
 * Send BIO_DELETE requests covering [off, off + len) on the backing cdev,
 * splitting the range into chunks no larger than the biggest
 * blocksize-aligned length that fits in bio_length (a long).  When 'last'
 * is set, send_complete is raised together with the final chunk so that
 * ctl_be_block_biodone() knows no more bios are coming for this beio.
 */
static void
ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio,
		       uint64_t off, uint64_t len, int last)
{
	struct bio *bio;
	uint64_t maxlen;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	/* Largest chunk that is a multiple of the logical block size. */
	maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
	while (len > 0) {
		bio = g_alloc_bio();
		bio->bio_cmd	    = BIO_DELETE;
		bio->bio_dev	    = dev;
		bio->bio_offset	    = off;
		bio->bio_length	    = MIN(len, maxlen);
		bio->bio_data	    = 0;
		bio->bio_done	    = ctl_be_block_biodone;
		bio->bio_caller1    = beio;
		bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;

		off += bio->bio_length;
		len -= bio->bio_length;

		/*
		 * Account the bio under the LUN I/O lock so completion in
		 * another context sees a consistent count/send_complete pair.
		 */
		mtx_lock(&be_lun->io_lock);
		beio->num_bios_sent++;
		if (last && len == 0)
			beio->send_complete = 1;
		mtx_unlock(&be_lun->io_lock);

		if (csw) {
			csw->d_strategy(bio);
		} else {
			/* Device is gone; fail each chunk with ENXIO. */
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1106
/*
 * UNMAP / WRITE SAME(unmap) backend for device-backed LUNs.  An
 * io_offset of -1 is the sentinel meaning "walk the UNMAP parameter
 * list" (set up by ctl_be_block_cw_dispatch_unmap()); otherwise a single
 * pre-computed byte range is deleted.
 */
static void
ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct ctl_ptr_len_flags *ptrlen;
	struct scsi_unmap_desc *buf, *end;
	uint64_t len;

	io = beio->io;

	DPRINTF("entered\n");

	/* Start devstat accounting before issuing any BIO_DELETEs. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->io_offset == -1) {
		/* Iterate over the UNMAP block descriptors. */
		beio->io_len = 0;
		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
		end = buf + ptrlen->len / sizeof(*buf);
		for (; buf < end; buf++) {
			len = (uint64_t)scsi_4btoul(buf->length) *
			    be_lun->cbe_lun.blocksize;
			beio->io_len += len;
			/* 'last' is set only for the final descriptor. */
			ctl_be_block_unmap_dev_range(be_lun, beio,
			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
			    len, (end - buf < 2) ? TRUE : FALSE);
		}
	} else
		ctl_be_block_unmap_dev_range(be_lun, beio,
		    beio->io_offset, beio->io_len, TRUE);
}
1142
/*
 * Read/write dispatch for device-backed LUNs.  Translates the beio's S/G
 * list into a chain of bios, each no larger than the device's maximum
 * I/O size, queues them locally, then fires them all at once.  The
 * num_bios_sent/send_complete pair lets ctl_be_block_biodone() detect
 * when the last bio of the set has finished.
 */
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	off_t cur_offset;
	int i, max_iosize, ref;

	DPRINTF("entered\n");
	csw = devvn_refthread(be_lun->vn, &dev, &ref);

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	if (csw) {
		max_iosize = dev->si_iosize_max;
		if (max_iosize < PAGE_SIZE)
			max_iosize = DFLTPHYS;
	} else
		max_iosize = DFLTPHYS;

	/* Build one or more bios for each S/G segment. */
	cur_offset = beio->io_offset;
	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_dev = dev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
			beio->num_bios_sent++;
		}
	}
	/*
	 * All bios are queued; mark the set complete and start devstat
	 * accounting before any of them can finish.
	 */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	beio->send_complete = 1;
	mtx_unlock(&be_lun->io_lock);

	/*
	 * Fire off all allocated requests!
	 */
	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, bio, bio_queue);
		if (csw)
			csw->d_strategy(bio);
		else {
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1221
1222static uint64_t
1223ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1224{
1225	struct diocgattr_arg	arg;
1226	struct cdevsw *csw;
1227	struct cdev *dev;
1228	int error, ref;
1229
1230	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1231	if (csw == NULL)
1232		return (UINT64_MAX);
1233	strlcpy(arg.name, attrname, sizeof(arg.name));
1234	arg.len = sizeof(arg.value.off);
1235	if (csw->d_ioctl) {
1236		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
1237		    curthread);
1238	} else
1239		error = ENODEV;
1240	dev_relthread(dev, ref);
1241	if (error != 0)
1242		return (UINT64_MAX);
1243	return (arg.value.off);
1244}
1245
1246static void
1247ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
1248			    union ctl_io *io)
1249{
1250	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1251	struct ctl_be_block_io *beio;
1252	struct ctl_lba_len_flags *lbalen;
1253
1254	DPRINTF("entered\n");
1255	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1256	lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1257
1258	beio->io_len = lbalen->len * cbe_lun->blocksize;
1259	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1260	beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
1261	beio->bio_cmd = BIO_FLUSH;
1262	beio->ds_trans_type = DEVSTAT_NO_DATA;
1263	DPRINTF("SYNC\n");
1264	be_lun->lun_flush(be_lun, beio);
1265}
1266
1267static void
1268ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1269{
1270	union ctl_io *io;
1271
1272	io = beio->io;
1273	ctl_free_beio(beio);
1274	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1275	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1276	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1277		ctl_config_write_done(io);
1278		return;
1279	}
1280
1281	ctl_be_block_config_write(io);
1282}
1283
/*
 * WRITE SAME dispatch.  Validates the flags, routes UNMAP/ANCHOR
 * variants to the backend's unmap method, and otherwise expands the
 * single-block pattern into an S/G list of replicated blocks and writes
 * it out.  Segment lengths are trimmed to physical-block boundaries
 * where possible so the backend sees well-aligned writes.  Ranges too
 * large for one pass are continued via ctl_be_block_cw_done_ws().
 */
static void
ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_lba_len_flags *lbalen;
	uint64_t len_left, lba;
	uint32_t pb, pbo, adj;
	int i, seglen;
	uint8_t *buf, *end;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	lbalen = ARGS(beio->io);

	/* Reject unsupported flags, and UNMAP/ANCHOR without unmap support. */
	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 1,
				      /*command*/ 1,
				      /*field*/ 1,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	/* UNMAP/ANCHOR variants deallocate instead of writing the pattern. */
	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
		beio->bio_cmd = BIO_DELETE;
		beio->ds_trans_type = DEVSTAT_FREE;

		be_lun->unmap(be_lun, beio);
		return;
	}

	beio->bio_cmd = BIO_WRITE;
	beio->ds_trans_type = DEVSTAT_WRITE;

	DPRINTF("WRITE SAME at LBA %jx len %u\n",
	       (uintmax_t)lbalen->lba, lbalen->len);

	/* Physical block size and offset in bytes, from exponent/offset. */
	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
	if (be_lun->cbe_lun.pblockoff > 0)
		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
	else
		pbo = 0;
	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {

		/*
		 * Setup the S/G entry for this chunk.
		 */
		seglen = MIN(CTLBLK_MAX_SEG, len_left);
		/* Trim the segment to end on a physical-block boundary. */
		if (pb > cbe_lun->blocksize) {
			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
			    seglen - pbo) % pb;
			if (seglen > adj)
				seglen -= adj;
			else
				seglen -= seglen % cbe_lun->blocksize;
		} else
			seglen -= seglen % cbe_lun->blocksize;
		beio->sg_segs[i].len = seglen;
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= seglen;

		/*
		 * Replicate the caller's one-block pattern across the
		 * segment; with LBDATA each block gets its LBA stamped
		 * into the first four bytes.
		 */
		buf = beio->sg_segs[i].addr;
		end = buf + seglen;
		for (; buf < end; buf += cbe_lun->blocksize) {
			memcpy(buf, io->scsiio.kern_data_ptr, cbe_lun->blocksize);
			if (lbalen->flags & SWS_LBDATA)
				scsi_ulto4b(lbalen->lba + lba, buf);
			lba++;
		}
	}

	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
	beio->io_len = lba * cbe_lun->blocksize;

	/* We can not do all in one run. Correct and schedule rerun. */
	if (len_left > 0) {
		lbalen->lba += lba;
		lbalen->len -= lba;
		beio->beio_cont = ctl_be_block_cw_done_ws;
	}

	be_lun->dispatch(be_lun, beio);
}
1382
/*
 * UNMAP dispatch.  Validates the flags and unmap capability, then hands
 * the beio to the backend's unmap method with io_offset set to the -1
 * sentinel, which tells ctl_be_block_unmap_dev() to walk the UNMAP
 * parameter list rather than a single range.
 */
static void
ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_ptr_len_flags *ptrlen;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];

	/* Only the ANCHOR flag is supported, and unmap must be available. */
	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 0,
				      /*command*/ 1,
				      /*field*/ 0,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	beio->io_len = 0;
	beio->io_offset = -1;	/* sentinel: use the descriptor list */
	beio->bio_cmd = BIO_DELETE;
	beio->ds_trans_type = DEVSTAT_FREE;
	DPRINTF("UNMAP\n");
	be_lun->unmap(be_lun, beio);
}
1414
1415static void
1416ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1417{
1418	union ctl_io *io;
1419
1420	io = beio->io;
1421	ctl_free_beio(beio);
1422	ctl_config_read_done(io);
1423}
1424
/*
 * Dispatch a config-read command.  Allocates a beio, wires up the
 * config-read completion continuation, and routes by CDB opcode;
 * currently only GET LBA STATUS (SERVICE ACTION IN) is handled.
 */
static void
ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cr_done;
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
		beio->bio_cmd = -1;	/* not a real BIO command */
		beio->ds_trans_type = DEVSTAT_NO_DATA;
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		beio->io_len = 0;
		/* Backends without LBA status support complete immediately. */
		if (be_lun->get_lba_status)
			be_lun->get_lba_status(be_lun, beio);
		else
			ctl_be_block_cr_done(beio);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1457
1458static void
1459ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1460{
1461	union ctl_io *io;
1462
1463	io = beio->io;
1464	ctl_free_beio(beio);
1465	ctl_config_write_done(io);
1466}
1467
/*
 * Dispatch a config-write command.  Allocates a beio, maps the SCSI tag
 * type to a devstat tag type, and routes by CDB opcode to the
 * SYNCHRONIZE CACHE, WRITE SAME, or UNMAP handler.
 */
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cw_done;
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		ctl_be_block_cw_dispatch_sync(be_lun, io);
		break;
	case WRITE_SAME_10:
	case WRITE_SAME_16:
		ctl_be_block_cw_dispatch_ws(be_lun, io);
		break;
	case UNMAP:
		ctl_be_block_cw_dispatch_unmap(be_lun, io);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1515
/* DTrace probes marking read/write dispatch start and S/G allocation done. */
SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t");
1520
/*
 * Continuation for I/O split across multiple beios: free the finished
 * beio and, unless the command was aborted or has already failed, reset
 * its status and requeue it on the input queue so the worker issues the
 * next chunk.
 */
static void
ctl_be_block_next(struct ctl_be_block_io *beio)
{
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;

	io = beio->io;
	be_lun = beio->lun;
	ctl_free_beio(beio);
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
		ctl_data_submit_done(io);
		return;
	}

	/* Clear the interim status so the next pass starts clean. */
	io->io_hdr.status &= ~CTL_STATUS_MASK;
	io->io_hdr.status |= CTL_STATUS_NONE;

	mtx_lock(&be_lun->queue_lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}
1551
/*
 * Main read/write dispatch.  Allocates a beio, sizes this pass (capped
 * for COMPARE, which needs two half-size buffer sets), builds the S/G
 * list, then either starts the backend read or kicks off DMA from the
 * initiator for a write.  If the full request did not fit in one pass,
 * ctl_be_block_next() is installed as the continuation.
 */
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len_flags *lbalen;
	struct ctl_ptr_len_flags *bptrlen;
	uint64_t len_left, lbas;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	lbalen = ARGS(io);
	if (lbalen->flags & CTL_LLF_WRITE) {
		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
	}

	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	/* bptrlen->len accumulates LBAs already handled in earlier passes. */
	bptrlen = PRIV(io);
	bptrlen->ptr = (void *)beio;

	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	if (lbalen->flags & CTL_LLF_WRITE) {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	} else {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	}

	DPRINTF("%s at LBA %jx len %u @%ju\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
	/* COMPARE needs a second buffer set, so halve the per-pass size. */
	if (lbalen->flags & CTL_LLF_COMPARE)
		lbas = CTLBLK_HALF_IO_SIZE;
	else
		lbas = CTLBLK_MAX_IO_SIZE;
	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
	beio->io_len = lbas * cbe_lun->blocksize;
	bptrlen->len += lbas;

	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
		    i, CTLBLK_MAX_SEGS));

		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		/* Set up second segment for compare operation. */
		if (lbalen->flags & CTL_LLF_COMPARE) {
			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
			    beio->sg_segs[i].len;
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
		}

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}
	if (bptrlen->len < lbalen->len)
		beio->beio_cont = ctl_be_block_next;
	io->scsiio.be_move_done = ctl_be_block_move_done;
	/* For compare we have separate S/G lists for read and datamove. */
	if (lbalen->flags & CTL_LLF_COMPARE)
		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
	else
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
	io->scsiio.kern_data_len = beio->io_len;
	io->scsiio.kern_data_resid = 0;
	io->scsiio.kern_sg_entries = beio->num_segs;
	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
1669
/*
 * Taskqueue worker for a LUN.  Drains the four per-LUN queues in
 * priority order (datamove, config write, config read, input), dropping
 * the queue lock before dispatching each item and re-taking it on the
 * next loop iteration.  Returns once all queues are empty.
 */
static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	struct ctl_be_block_io *beio;

	DPRINTF("entered\n");
	/*
	 * Fetch and process I/Os from all queues.  If we detect LUN
	 * CTL_LUN_FLAG_OFFLINE status here -- it is result of a race,
	 * so make response maximally opaque to not confuse initiator.
	 */
	for (;;) {
		mtx_lock(&be_lun->queue_lock);
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			DPRINTF("datamove queue\n");
			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_complete_beio(beio);
				return;
			}
			be_lun->dispatch(be_lun, beio);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");
			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_config_write_done(io);
				return;
			}
			ctl_be_block_cw_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
		if (io != NULL) {
			DPRINTF("config read queue\n");
			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_config_read_done(io);
				return;
			}
			ctl_be_block_cr_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");
			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_data_submit_done(io);
				return;
			}
			ctl_be_block_dispatch(be_lun, io);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		mtx_unlock(&be_lun->queue_lock);
		break;
	}
}
1752
1753/*
1754 * Entry point from CTL to the backend for I/O.  We queue everything to a
1755 * work thread, so this just puts the I/O on a queue and wakes up the
1756 * thread.
1757 */
static int
ctl_be_block_submit(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;

	DPRINTF("entered\n");

	/* Recover the backend LUN from the per-I/O private area. */
	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	/*
	 * Make sure we only get SCSI I/O.
	 */
	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
		"%#x) encountered", io->io_hdr.io_type));

	/* No LBAs handled yet; ctl_be_block_dispatch() accumulates here. */
	PRIV(io)->len = 0;

	mtx_lock(&be_lun->queue_lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (CTL_RETVAL_COMPLETE);
}
1790
1791static int
1792ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
1793			int flag, struct thread *td)
1794{
1795	struct ctl_be_block_softc *softc;
1796	int error;
1797
1798	softc = &backend_block_softc;
1799
1800	error = 0;
1801
1802	switch (cmd) {
1803	case CTL_LUN_REQ: {
1804		struct ctl_lun_req *lun_req;
1805
1806		lun_req = (struct ctl_lun_req *)addr;
1807
1808		switch (lun_req->reqtype) {
1809		case CTL_LUNREQ_CREATE:
1810			error = ctl_be_block_create(softc, lun_req);
1811			break;
1812		case CTL_LUNREQ_RM:
1813			error = ctl_be_block_rm(softc, lun_req);
1814			break;
1815		case CTL_LUNREQ_MODIFY:
1816			error = ctl_be_block_modify(softc, lun_req);
1817			break;
1818		default:
1819			lun_req->status = CTL_LUN_ERROR;
1820			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
1821				 "invalid LUN request type %d",
1822				 lun_req->reqtype);
1823			break;
1824		}
1825		break;
1826	}
1827	default:
1828		error = ENOTTY;
1829		break;
1830	}
1831
1832	return (error);
1833}
1834
/*
 * Finish opening a file-backed LUN: install the file-backend method
 * table, verify exclusive access to the vnode, size the LUN from either
 * the request or the file, and derive logical/physical/unmap block
 * geometry from the file attributes and LUN options.
 */
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	char			     *value;
	struct vattr		      vattr;
	off_t			      ps, pss, po, pos, us, uss, uo, uos;
	int			      error;

	error = 0;
	cbe_lun = &be_lun->cbe_lun;
	file_data = &be_lun->backend.file;
	params = &be_lun->params;

	/* File backends cannot UNMAP. */
	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;
	be_lun->get_lba_status = ctl_be_block_gls_file;
	be_lun->getattr = ctl_be_block_getattr_file;
	be_lun->unmap = NULL;
	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	/*
	 * Verify that we have the ability to upgrade to exclusive
	 * access on this file so we can trap errors at open instead
	 * of reporting them during first access.
	 */
	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
		if (be_lun->vn->v_iflag & VI_DOOMED) {
			error = EBADF;
			snprintf(req->error_str, sizeof(req->error_str),
				 "error locking file %s", be_lun->dev_path);
			return (error);
		}
	}

	/* LUN size: explicit request wins, otherwise use the file size. */
	file_data->cred = crhold(curthread->td_ucred);
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;

	/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If file's vattr.va_blocksize
	 * (preferred I/O block size) is bigger and multiple to chosen
	 * logical block size -- report it as physical block size.
	 */
	if (params->blocksize_bytes != 0)
		cbe_lun->blocksize = params->blocksize_bytes;
	else
		cbe_lun->blocksize = 512;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	us = ps = vattr.va_blocksize;
	uo = po = 0;

	/*
	 * Physical block geometry: option overrides, accepted only when
	 * it is a power-of-two multiple of the logical block size and the
	 * offset is block-aligned.
	 */
	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	/* UNMAP granularity geometry, validated the same way. */
	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < cbe_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
	}

	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
	return (error);
}
1947
1948static int
1949ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1950{
1951	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1952	struct ctl_lun_create_params *params;
1953	struct cdevsw		     *csw;
1954	struct cdev		     *dev;
1955	char			     *value;
1956	int			      error, atomic, maxio, ref, unmap, tmp;
1957	off_t			      ps, pss, po, pos, us, uss, uo, uos, otmp;
1958
1959	params = &be_lun->params;
1960
1961	be_lun->dev_type = CTL_BE_BLOCK_DEV;
1962	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1963	if (csw == NULL)
1964		return (ENXIO);
1965	if (strcmp(csw->d_name, "zvol") == 0) {
1966		be_lun->dispatch = ctl_be_block_dispatch_zvol;
1967		be_lun->get_lba_status = ctl_be_block_gls_zvol;
1968		atomic = maxio = CTLBLK_MAX_IO_SIZE;
1969	} else {
1970		be_lun->dispatch = ctl_be_block_dispatch_dev;
1971		be_lun->get_lba_status = NULL;
1972		atomic = 0;
1973		maxio = dev->si_iosize_max;
1974		if (maxio <= 0)
1975			maxio = DFLTPHYS;
1976		if (maxio > CTLBLK_MAX_IO_SIZE)
1977			maxio = CTLBLK_MAX_IO_SIZE;
1978	}
1979	be_lun->lun_flush = ctl_be_block_flush_dev;
1980	be_lun->getattr = ctl_be_block_getattr_dev;
1981	be_lun->unmap = ctl_be_block_unmap_dev;
1982
1983	if (!csw->d_ioctl) {
1984		dev_relthread(dev, ref);
1985		snprintf(req->error_str, sizeof(req->error_str),
1986			 "no d_ioctl for device %s!", be_lun->dev_path);
1987		return (ENODEV);
1988	}
1989
1990	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
1991			       curthread);
1992	if (error) {
1993		dev_relthread(dev, ref);
1994		snprintf(req->error_str, sizeof(req->error_str),
1995			 "error %d returned for DIOCGSECTORSIZE ioctl "
1996			 "on %s!", error, be_lun->dev_path);
1997		return (error);
1998	}
1999
2000	/*
2001	 * If the user has asked for a blocksize that is greater than the
2002	 * backing device's blocksize, we can do it only if the blocksize
2003	 * the user is asking for is an even multiple of the underlying
2004	 * device's blocksize.
2005	 */
2006	if ((params->blocksize_bytes != 0) &&
2007	    (params->blocksize_bytes >= tmp)) {
2008		if (params->blocksize_bytes % tmp == 0) {
2009			cbe_lun->blocksize = params->blocksize_bytes;
2010		} else {
2011			dev_relthread(dev, ref);
2012			snprintf(req->error_str, sizeof(req->error_str),
2013				 "requested blocksize %u is not an even "
2014				 "multiple of backing device blocksize %u",
2015				 params->blocksize_bytes, tmp);
2016			return (EINVAL);
2017		}
2018	} else if (params->blocksize_bytes != 0) {
2019		dev_relthread(dev, ref);
2020		snprintf(req->error_str, sizeof(req->error_str),
2021			 "requested blocksize %u < backing device "
2022			 "blocksize %u", params->blocksize_bytes, tmp);
2023		return (EINVAL);
2024	} else
2025		cbe_lun->blocksize = tmp;
2026
2027	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
2028			     curthread);
2029	if (error) {
2030		dev_relthread(dev, ref);
2031		snprintf(req->error_str, sizeof(req->error_str),
2032			 "error %d returned for DIOCGMEDIASIZE "
2033			 " ioctl on %s!", error,
2034			 be_lun->dev_path);
2035		return (error);
2036	}
2037
2038	if (params->lun_size_bytes != 0) {
2039		if (params->lun_size_bytes > otmp) {
2040			dev_relthread(dev, ref);
2041			snprintf(req->error_str, sizeof(req->error_str),
2042				 "requested LUN size %ju > backing device "
2043				 "size %ju",
2044				 (uintmax_t)params->lun_size_bytes,
2045				 (uintmax_t)otmp);
2046			return (EINVAL);
2047		}
2048
2049		be_lun->size_bytes = params->lun_size_bytes;
2050	} else
2051		be_lun->size_bytes = otmp;
2052	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2053	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2054	    0 : (be_lun->size_blocks - 1);
2055
2056	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
2057	    curthread);
2058	if (error)
2059		ps = po = 0;
2060	else {
2061		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
2062		    FREAD, curthread);
2063		if (error)
2064			po = 0;
2065	}
2066	us = ps;
2067	uo = po;
2068
2069	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
2070	if (value != NULL)
2071		ctl_expand_number(value, &ps);
2072	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
2073	if (value != NULL)
2074		ctl_expand_number(value, &po);
2075	pss = ps / cbe_lun->blocksize;
2076	pos = po / cbe_lun->blocksize;
2077	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
2078	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
2079		cbe_lun->pblockexp = fls(pss) - 1;
2080		cbe_lun->pblockoff = (pss - pos) % pss;
2081	}
2082
2083	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
2084	if (value != NULL)
2085		ctl_expand_number(value, &us);
2086	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
2087	if (value != NULL)
2088		ctl_expand_number(value, &uo);
2089	uss = us / cbe_lun->blocksize;
2090	uos = uo / cbe_lun->blocksize;
2091	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
2092	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
2093		cbe_lun->ublockexp = fls(uss) - 1;
2094		cbe_lun->ublockoff = (uss - uos) % uss;
2095	}
2096
2097	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
2098	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;
2099
2100	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
2101		unmap = 1;
2102	} else {
2103		struct diocgattr_arg	arg;
2104
2105		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
2106		arg.len = sizeof(arg.value.i);
2107		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
2108		    curthread);
2109		unmap = (error == 0) ? arg.value.i : 0;
2110	}
2111	value = ctl_get_opt(&cbe_lun->options, "unmap");
2112	if (value != NULL)
2113		unmap = (strcmp(value, "on") == 0);
2114	if (unmap)
2115		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
2116	else
2117		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
2118
2119	dev_relthread(dev, ref);
2120	return (0);
2121}
2122
2123static int
2124ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2125{
2126	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2127	int flags;
2128
2129	if (be_lun->vn) {
2130		flags = FREAD;
2131		if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
2132			flags |= FWRITE;
2133		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2134		be_lun->vn = NULL;
2135
2136		switch (be_lun->dev_type) {
2137		case CTL_BE_BLOCK_DEV:
2138			break;
2139		case CTL_BE_BLOCK_FILE:
2140			if (be_lun->backend.file.cred != NULL) {
2141				crfree(be_lun->backend.file.cred);
2142				be_lun->backend.file.cred = NULL;
2143			}
2144			break;
2145		case CTL_BE_BLOCK_NONE:
2146			break;
2147		default:
2148			panic("Unexpected backend type.");
2149			break;
2150		}
2151		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2152	}
2153	return (0);
2154}
2155
/*
 * Open the backing store named by the "file" option and configure the
 * LUN from it.  The backing store may be a disk device or a regular
 * file; anything else is rejected.  Returns 0 on success, or a nonzero
 * errno/1 with req->error_str filled in.
 */
static int
ctl_be_block_open(struct ctl_be_block_softc *softc,
		  struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct nameidata nd;
	char		*value;
	int		 error, flags;

	error = 0;
	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "Root filesystem is not mounted");
		return (1);
	}
	/*
	 * This runs from a kernel thread whose fd table may lack
	 * current/root/jail directories; point any missing ones at the
	 * root vnode so the namei() lookup below can resolve paths.
	 */
	if (!curthread->td_proc->p_fd->fd_cdir) {
		curthread->td_proc->p_fd->fd_cdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_rdir) {
		curthread->td_proc->p_fd->fd_rdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_jdir) {
		curthread->td_proc->p_fd->fd_jdir = rootvnode;
		VREF(rootvnode);
	}

	/* The "file" option (path to the backing store) is mandatory. */
	value = ctl_get_opt(&cbe_lun->options, "file");
	if (value == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "no file argument specified");
		return (1);
	}
	free(be_lun->dev_path, M_CTLBLK);
	be_lun->dev_path = strdup(value, M_CTLBLK);

	/* Open read/write unless the "readonly" option is "on". */
	flags = FREAD;
	value = ctl_get_opt(&cbe_lun->options, "readonly");
	if (value == NULL || strcmp(value, "on") != 0)
		flags |= FWRITE;

again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
		/* Retry read-only; clearing FWRITE ends this retry. */
		flags &= ~FWRITE;
		goto again;
	}
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_name;

			/* Retry once with a /dev/ prefix; the new path
			 * starts with '/', so this cannot loop again. */
			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
				be_lun->dev_path);
			free(be_lun->dev_path, M_CTLBLK);
			be_lun->dev_path = dev_name;
			goto again;
		}
		snprintf(req->error_str, sizeof(req->error_str),
		    "error opening %s: %d", be_lun->dev_path, error);
		return (error);
	}
	/* Record the access mode vn_open() actually granted. */
	if (flags & FWRITE)
		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
	else
		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;

	NDFREE(&nd, NDF_ONLY_PNBUF);
	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn, 0);	/* vn_open() returned it locked */

	if (error != 0)
		ctl_be_block_close(be_lun);
	/*
	 * Default serialization: reads-after-writes unless dispatching
	 * straight to a device; the "serseq" option overrides either way.
	 */
	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	value = ctl_get_opt(&cbe_lun->options, "serseq");
	if (value != NULL && strcmp(value, "on") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
	else if (value != NULL && strcmp(value, "read") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	else if (value != NULL && strcmp(value, "off") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	return (0);
}
2259
2260static int
2261ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2262{
2263	struct ctl_be_lun *cbe_lun;
2264	struct ctl_be_block_lun *be_lun;
2265	struct ctl_lun_create_params *params;
2266	char num_thread_str[16];
2267	char tmpstr[32];
2268	char *value;
2269	int retval, num_threads;
2270	int tmp_num_threads;
2271
2272	params = &req->reqdata.create;
2273	retval = 0;
2274	req->status = CTL_LUN_OK;
2275
2276	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2277	cbe_lun = &be_lun->cbe_lun;
2278	cbe_lun->be_lun = be_lun;
2279	be_lun->params = req->reqdata.create;
2280	be_lun->softc = softc;
2281	STAILQ_INIT(&be_lun->input_queue);
2282	STAILQ_INIT(&be_lun->config_read_queue);
2283	STAILQ_INIT(&be_lun->config_write_queue);
2284	STAILQ_INIT(&be_lun->datamove_queue);
2285	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
2286	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
2287	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
2288	ctl_init_opts(&cbe_lun->options,
2289	    req->num_be_args, req->kern_be_args);
2290	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
2291	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2292	if (be_lun->lun_zone == NULL) {
2293		snprintf(req->error_str, sizeof(req->error_str),
2294			 "error allocating UMA zone");
2295		goto bailout_error;
2296	}
2297
2298	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2299		cbe_lun->lun_type = params->device_type;
2300	else
2301		cbe_lun->lun_type = T_DIRECT;
2302	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
2303	cbe_lun->flags = 0;
2304	value = ctl_get_opt(&cbe_lun->options, "ha_role");
2305	if (value != NULL) {
2306		if (strcmp(value, "primary") == 0)
2307			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2308	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2309		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2310
2311	if (cbe_lun->lun_type == T_DIRECT) {
2312		be_lun->size_bytes = params->lun_size_bytes;
2313		if (params->blocksize_bytes != 0)
2314			cbe_lun->blocksize = params->blocksize_bytes;
2315		else
2316			cbe_lun->blocksize = 512;
2317		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2318		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2319		    0 : (be_lun->size_blocks - 1);
2320
2321		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2322		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2323			retval = ctl_be_block_open(softc, be_lun, req);
2324			if (retval != 0) {
2325				retval = 0;
2326				req->status = CTL_LUN_WARNING;
2327			}
2328		}
2329		num_threads = cbb_num_threads;
2330	} else {
2331		num_threads = 1;
2332	}
2333
2334	/*
2335	 * XXX This searching loop might be refactored to be combined with
2336	 * the loop above,
2337	 */
2338	value = ctl_get_opt(&cbe_lun->options, "num_threads");
2339	if (value != NULL) {
2340		tmp_num_threads = strtol(value, NULL, 0);
2341
2342		/*
2343		 * We don't let the user specify less than one
2344		 * thread, but hope he's clueful enough not to
2345		 * specify 1000 threads.
2346		 */
2347		if (tmp_num_threads < 1) {
2348			snprintf(req->error_str, sizeof(req->error_str),
2349				 "invalid number of threads %s",
2350				 num_thread_str);
2351			goto bailout_error;
2352		}
2353		num_threads = tmp_num_threads;
2354	}
2355
2356	if (be_lun->vn == NULL)
2357		cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
2358	/* Tell the user the blocksize we ended up using */
2359	params->lun_size_bytes = be_lun->size_bytes;
2360	params->blocksize_bytes = cbe_lun->blocksize;
2361	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2362		cbe_lun->req_lun_id = params->req_lun_id;
2363		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
2364	} else
2365		cbe_lun->req_lun_id = 0;
2366
2367	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
2368	cbe_lun->lun_config_status = ctl_be_block_lun_config_status;
2369	cbe_lun->be = &ctl_be_block_driver;
2370
2371	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2372		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
2373			 softc->num_luns);
2374		strncpy((char *)cbe_lun->serial_num, tmpstr,
2375			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
2376
2377		/* Tell the user what we used for a serial number */
2378		strncpy((char *)params->serial_num, tmpstr,
2379			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2380	} else {
2381		strncpy((char *)cbe_lun->serial_num, params->serial_num,
2382			MIN(sizeof(cbe_lun->serial_num),
2383			sizeof(params->serial_num)));
2384	}
2385	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2386		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
2387		strncpy((char *)cbe_lun->device_id, tmpstr,
2388			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
2389
2390		/* Tell the user what we used for a device ID */
2391		strncpy((char *)params->device_id, tmpstr,
2392			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2393	} else {
2394		strncpy((char *)cbe_lun->device_id, params->device_id,
2395			MIN(sizeof(cbe_lun->device_id),
2396			    sizeof(params->device_id)));
2397	}
2398
2399	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2400
2401	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
2402	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2403
2404	if (be_lun->io_taskqueue == NULL) {
2405		snprintf(req->error_str, sizeof(req->error_str),
2406			 "unable to create taskqueue");
2407		goto bailout_error;
2408	}
2409
2410	/*
2411	 * Note that we start the same number of threads by default for
2412	 * both the file case and the block device case.  For the file
2413	 * case, we need multiple threads to allow concurrency, because the
2414	 * vnode interface is designed to be a blocking interface.  For the
2415	 * block device case, ZFS zvols at least will block the caller's
2416	 * context in many instances, and so we need multiple threads to
2417	 * overcome that problem.  Other block devices don't need as many
2418	 * threads, but they shouldn't cause too many problems.
2419	 *
2420	 * If the user wants to just have a single thread for a block
2421	 * device, he can specify that when the LUN is created, or change
2422	 * the tunable/sysctl to alter the default number of threads.
2423	 */
2424	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
2425					 /*num threads*/num_threads,
2426					 /*priority*/PWAIT,
2427					 /*thread name*/
2428					 "%s taskq", be_lun->lunname);
2429
2430	if (retval != 0)
2431		goto bailout_error;
2432
2433	be_lun->num_threads = num_threads;
2434
2435	mtx_lock(&softc->lock);
2436	softc->num_luns++;
2437	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
2438
2439	mtx_unlock(&softc->lock);
2440
2441	retval = ctl_add_lun(&be_lun->cbe_lun);
2442	if (retval != 0) {
2443		mtx_lock(&softc->lock);
2444		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2445			      links);
2446		softc->num_luns--;
2447		mtx_unlock(&softc->lock);
2448		snprintf(req->error_str, sizeof(req->error_str),
2449			 "ctl_add_lun() returned error %d, see dmesg for "
2450			 "details", retval);
2451		retval = 0;
2452		goto bailout_error;
2453	}
2454
2455	mtx_lock(&softc->lock);
2456
2457	/*
2458	 * Tell the config_status routine that we're waiting so it won't
2459	 * clean up the LUN in the event of an error.
2460	 */
2461	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2462
2463	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2464		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2465		if (retval == EINTR)
2466			break;
2467	}
2468	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2469
2470	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
2471		snprintf(req->error_str, sizeof(req->error_str),
2472			 "LUN configuration error, see dmesg for details");
2473		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2474			      links);
2475		softc->num_luns--;
2476		mtx_unlock(&softc->lock);
2477		goto bailout_error;
2478	} else {
2479		params->req_lun_id = cbe_lun->lun_id;
2480	}
2481
2482	mtx_unlock(&softc->lock);
2483
2484	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
2485					       cbe_lun->blocksize,
2486					       DEVSTAT_ALL_SUPPORTED,
2487					       cbe_lun->lun_type
2488					       | DEVSTAT_TYPE_IF_OTHER,
2489					       DEVSTAT_PRIORITY_OTHER);
2490
2491	return (retval);
2492
2493bailout_error:
2494	req->status = CTL_LUN_ERROR;
2495
2496	if (be_lun->io_taskqueue != NULL)
2497		taskqueue_free(be_lun->io_taskqueue);
2498	ctl_be_block_close(be_lun);
2499	if (be_lun->dev_path != NULL)
2500		free(be_lun->dev_path, M_CTLBLK);
2501	if (be_lun->lun_zone != NULL)
2502		uma_zdestroy(be_lun->lun_zone);
2503	ctl_free_opts(&cbe_lun->options);
2504	mtx_destroy(&be_lun->queue_lock);
2505	mtx_destroy(&be_lun->io_lock);
2506	free(be_lun, M_CTLBLK);
2507
2508	return (retval);
2509}
2510
2511static int
2512ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2513{
2514	struct ctl_lun_rm_params *params;
2515	struct ctl_be_block_lun *be_lun;
2516	struct ctl_be_lun *cbe_lun;
2517	int retval;
2518
2519	params = &req->reqdata.rm;
2520
2521	mtx_lock(&softc->lock);
2522	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2523		if (be_lun->cbe_lun.lun_id == params->lun_id)
2524			break;
2525	}
2526	mtx_unlock(&softc->lock);
2527
2528	if (be_lun == NULL) {
2529		snprintf(req->error_str, sizeof(req->error_str),
2530			 "LUN %u is not managed by the block backend",
2531			 params->lun_id);
2532		goto bailout_error;
2533	}
2534	cbe_lun = &be_lun->cbe_lun;
2535
2536	retval = ctl_disable_lun(cbe_lun);
2537	if (retval != 0) {
2538		snprintf(req->error_str, sizeof(req->error_str),
2539			 "error %d returned from ctl_disable_lun() for "
2540			 "LUN %d", retval, params->lun_id);
2541		goto bailout_error;
2542	}
2543
2544	if (be_lun->vn != NULL) {
2545		cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
2546		ctl_lun_offline(cbe_lun);
2547		taskqueue_drain_all(be_lun->io_taskqueue);
2548		ctl_be_block_close(be_lun);
2549	}
2550
2551	retval = ctl_invalidate_lun(cbe_lun);
2552	if (retval != 0) {
2553		snprintf(req->error_str, sizeof(req->error_str),
2554			 "error %d returned from ctl_invalidate_lun() for "
2555			 "LUN %d", retval, params->lun_id);
2556		goto bailout_error;
2557	}
2558
2559	mtx_lock(&softc->lock);
2560	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2561	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2562                retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2563                if (retval == EINTR)
2564                        break;
2565        }
2566	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2567
2568	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2569		snprintf(req->error_str, sizeof(req->error_str),
2570			 "interrupted waiting for LUN to be freed");
2571		mtx_unlock(&softc->lock);
2572		goto bailout_error;
2573	}
2574
2575	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
2576
2577	softc->num_luns--;
2578	mtx_unlock(&softc->lock);
2579
2580	taskqueue_drain_all(be_lun->io_taskqueue);
2581	taskqueue_free(be_lun->io_taskqueue);
2582
2583	if (be_lun->disk_stats != NULL)
2584		devstat_remove_entry(be_lun->disk_stats);
2585
2586	uma_zdestroy(be_lun->lun_zone);
2587
2588	ctl_free_opts(&cbe_lun->options);
2589	free(be_lun->dev_path, M_CTLBLK);
2590	mtx_destroy(&be_lun->queue_lock);
2591	mtx_destroy(&be_lun->io_lock);
2592	free(be_lun, M_CTLBLK);
2593
2594	req->status = CTL_LUN_OK;
2595
2596	return (0);
2597
2598bailout_error:
2599
2600	req->status = CTL_LUN_ERROR;
2601
2602	return (0);
2603}
2604
/*
 * Modify an existing block-backend LUN: update its options and HA
 * role, (re)open or close the backing store as the new role requires,
 * and notify CTL if the capacity changed.  The outcome is reported
 * through req->status; the function itself always returns 0.
 */
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	char *value;
	uint64_t oldsize;
	int error, wasprim;

	params = &req->reqdata.modify;

	/* Look the LUN up by its CTL LUN id. */
	mtx_lock(&softc->lock);
	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	if (params->lun_size_bytes != 0)
		be_lun->params.lun_size_bytes = params->lun_size_bytes;
	ctl_update_opts(&cbe_lun->options, req->num_be_args, req->kern_be_args);

	/* Recompute the HA role and notify CTL if it flipped. */
	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
	value = ctl_get_opt(&cbe_lun->options, "ha_role");
	if (value != NULL) {
		if (strcmp(value, "primary") == 0)
			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
		else
			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
			ctl_lun_primary(cbe_lun);
		else
			ctl_lun_secondary(cbe_lun);
	}

	oldsize = be_lun->size_blocks;	/* size in blocks, despite the name */
	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
		/*
		 * Primary role (or serialize-only HA): (re)open the
		 * backing store and bring the LUN online if it now has
		 * one.
		 */
		if (be_lun->vn == NULL)
			error = ctl_be_block_open(softc, be_lun, req);
		else if (vn_isdisk(be_lun->vn, &error))
			error = ctl_be_block_open_dev(be_lun, req);
		else if (be_lun->vn->v_type == VREG)
			error = ctl_be_block_open_file(be_lun, req);
		else
			error = EINVAL;
		if ((cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) &&
		    be_lun->vn != NULL) {
			cbe_lun->flags &= ~CTL_LUN_FLAG_OFFLINE;
			ctl_lun_online(cbe_lun);
		}
	} else {
		/* Secondary role: take the LUN offline, drain I/O, and
		 * close the backing store. */
		if (be_lun->vn != NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
			ctl_lun_offline(cbe_lun);
			taskqueue_drain_all(be_lun->io_taskqueue);
			error = ctl_be_block_close(be_lun);
		} else
			error = 0;
	}
	if (be_lun->size_blocks != oldsize)
		ctl_lun_capacity_changed(cbe_lun);

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;
	return (0);
}
2692
2693static void
2694ctl_be_block_lun_shutdown(void *be_lun)
2695{
2696	struct ctl_be_block_lun *lun;
2697	struct ctl_be_block_softc *softc;
2698
2699	lun = (struct ctl_be_block_lun *)be_lun;
2700
2701	softc = lun->softc;
2702
2703	mtx_lock(&softc->lock);
2704	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2705	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2706		wakeup(lun);
2707	mtx_unlock(&softc->lock);
2708
2709}
2710
2711static void
2712ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2713{
2714	struct ctl_be_block_lun *lun;
2715	struct ctl_be_block_softc *softc;
2716
2717	lun = (struct ctl_be_block_lun *)be_lun;
2718	softc = lun->softc;
2719
2720	if (status == CTL_LUN_CONFIG_OK) {
2721		mtx_lock(&softc->lock);
2722		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2723		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2724			wakeup(lun);
2725		mtx_unlock(&softc->lock);
2726
2727		/*
2728		 * We successfully added the LUN, attempt to enable it.
2729		 */
2730		if (ctl_enable_lun(&lun->cbe_lun) != 0) {
2731			printf("%s: ctl_enable_lun() failed!\n", __func__);
2732			if (ctl_invalidate_lun(&lun->cbe_lun) != 0) {
2733				printf("%s: ctl_invalidate_lun() failed!\n",
2734				       __func__);
2735			}
2736		}
2737
2738		return;
2739	}
2740
2741
2742	mtx_lock(&softc->lock);
2743	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2744	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2745	wakeup(lun);
2746	mtx_unlock(&softc->lock);
2747}
2748
2749
/*
 * Handle configuration (non-data) write commands.  Cache sync, WRITE
 * SAME and UNMAP are queued to the worker taskqueue; START STOP UNIT
 * only changes LUN state and is completed inline.
 */
static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	retval = 0;

	DPRINTF("entered\n");

	/* Recover our per-LUN state from the I/O's private area. */
	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
	case WRITE_SAME_10:
	case WRITE_SAME_16:
	case UNMAP:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		mtx_lock(&be_lun->queue_lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->queue_lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;

		if (cdb->how & SSS_START)
			retval = ctl_start_lun(cbe_lun);
		else {
			retval = ctl_stop_lun(cbe_lun);
			/*
			 * XXX KDM Copan-specific offline behavior.
			 * Figure out a reasonable way to port this?
			 */
#ifdef NEEDTOPORT
			if ((retval == 0)
			 && (cdb->byte2 & SSS_ONOFFLINE))
				retval = ctl_lun_offline(cbe_lun);
#endif
		}

		/*
		 * In general, the above routines should not fail.  They
		 * just set state for the LUN.  So we've got something
		 * pretty wrong here if we can't start or stop the LUN.
		 */
		if (retval != 0) {
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xf051);
			retval = CTL_RETVAL_COMPLETE;
		} else {
			ctl_set_success(&io->scsiio);
		}
		ctl_config_write_done(io);
		break;
	}
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}
2830
/*
 * Handle configuration read commands.  Only SERVICE ACTION IN with the
 * GET LBA STATUS service action is supported; it is queued to the
 * worker taskqueue.  Anything else is rejected inline.
 */
static int
ctl_be_block_config_read(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval = 0;

	DPRINTF("entered\n");

	/* Recover our per-LUN state from the I/O's private area. */
	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	switch (io->scsiio.cdb[0]) {
	case SERVICE_ACTION_IN:
		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
			/* GET LBA STATUS: defer to the worker thread. */
			mtx_lock(&be_lun->queue_lock);
			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
			    &io->io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			taskqueue_enqueue(be_lun->io_taskqueue,
			    &be_lun->io_task);
			retval = CTL_RETVAL_QUEUED;
			break;
		}
		/* Unsupported service action: flag byte 1, bit 4..0. */
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 1,
				      /*command*/ 1,
				      /*field*/ 1,
				      /*bit_valid*/ 1,
				      /*bit*/ 4);
		ctl_config_read_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_read_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}
2874
2875static int
2876ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2877{
2878	struct ctl_be_block_lun *lun;
2879	int retval;
2880
2881	lun = (struct ctl_be_block_lun *)be_lun;
2882	retval = 0;
2883
2884	retval = sbuf_printf(sb, "\t<num_threads>");
2885
2886	if (retval != 0)
2887		goto bailout;
2888
2889	retval = sbuf_printf(sb, "%d", lun->num_threads);
2890
2891	if (retval != 0)
2892		goto bailout;
2893
2894	retval = sbuf_printf(sb, "</num_threads>\n");
2895
2896bailout:
2897
2898	return (retval);
2899}
2900
2901static uint64_t
2902ctl_be_block_lun_attr(void *be_lun, const char *attrname)
2903{
2904	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun;
2905
2906	if (lun->getattr == NULL)
2907		return (UINT64_MAX);
2908	return (lun->getattr(lun, attrname));
2909}
2910
2911int
2912ctl_be_block_init(void)
2913{
2914	struct ctl_be_block_softc *softc;
2915	int retval;
2916
2917	softc = &backend_block_softc;
2918	retval = 0;
2919
2920	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2921	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2922	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2923	STAILQ_INIT(&softc->lun_list);
2924
2925	return (retval);
2926}
2927