ctl_backend_block.c revision 285391
1/*-
2 * Copyright (c) 2003 Silicon Graphics International Corp.
3 * Copyright (c) 2009-2011 Spectra Logic Corporation
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Edward Tomasz Napierala
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions, and the following disclaimer,
15 *    without modification.
16 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
17 *    substantially similar to the "NO WARRANTY" disclaimer below
18 *    ("Disclaimer") and any redistribution must be conditioned upon
19 *    including a substantially similar Disclaimer requirement for further
20 *    binary redistribution.
21 *
22 * NO WARRANTY
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
32 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGES.
34 *
35 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
36 */
37/*
38 * CAM Target Layer driver backend for block devices.
39 *
40 * Author: Ken Merry <ken@FreeBSD.org>
41 */
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/cam/ctl/ctl_backend_block.c 285391 2015-07-11 16:22:48Z mjg $");
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kernel.h>
48#include <sys/types.h>
49#include <sys/kthread.h>
50#include <sys/bio.h>
51#include <sys/fcntl.h>
52#include <sys/limits.h>
53#include <sys/lock.h>
54#include <sys/mutex.h>
55#include <sys/condvar.h>
56#include <sys/malloc.h>
57#include <sys/conf.h>
58#include <sys/ioccom.h>
59#include <sys/queue.h>
60#include <sys/sbuf.h>
61#include <sys/endian.h>
62#include <sys/uio.h>
63#include <sys/buf.h>
64#include <sys/taskqueue.h>
65#include <sys/vnode.h>
66#include <sys/namei.h>
67#include <sys/mount.h>
68#include <sys/disk.h>
69#include <sys/fcntl.h>
70#include <sys/filedesc.h>
71#include <sys/filio.h>
72#include <sys/proc.h>
73#include <sys/pcpu.h>
74#include <sys/module.h>
75#include <sys/sdt.h>
76#include <sys/devicestat.h>
77#include <sys/sysctl.h>
78
79#include <geom/geom.h>
80
81#include <cam/cam.h>
82#include <cam/scsi/scsi_all.h>
83#include <cam/scsi/scsi_da.h>
84#include <cam/ctl/ctl_io.h>
85#include <cam/ctl/ctl.h>
86#include <cam/ctl/ctl_backend.h>
87#include <cam/ctl/ctl_frontend_internal.h>
88#include <cam/ctl/ctl_ioctl.h>
89#include <cam/ctl/ctl_scsi_all.h>
90#include <cam/ctl/ctl_error.h>
91
/*
 * Scatter/gather sizing.
 *
 * One backend I/O carries at most CTLBLK_MAX_IO_SIZE (1MB) of data; an
 * I/O larger than that gets split.  The S/G array is laid out as two
 * halves of CTLBLK_HALF_SEGS entries each: COMPARE-type commands use
 * entry [i + CTLBLK_HALF_SEGS] as the partner of entry [i].
 * CTLBLK_MAX_SEG (MAXPHYS) is the segment size the segment counts are
 * derived from; MAX(..., 1) keeps at least one segment per half.
 */
#define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
#define	CTLBLK_MAX_IO_SIZE	(2 * CTLBLK_HALF_IO_SIZE)
#define	CTLBLK_MAX_SEG		MAXPHYS
#define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
#define	CTLBLK_MAX_SEGS		(2 * CTLBLK_HALF_SEGS)
101
/*
 * Debug tracing.  When CTLBLK_DEBUG is defined, DPRINTF() prints its
 * message prefixed with "cbb(<function>:<line>): "; otherwise it expands
 * to an empty statement that is still safe in an unbraced if/else.
 */
#ifndef CTLBLK_DEBUG
#define	DPRINTF(fmt, ...)	do { } while (0)
#else
#define	DPRINTF(fmt, ...)						\
	printf("cbb(%s:%d): " fmt, __func__, __LINE__, ##__VA_ARGS__)
#endif
108
/*
 * Accessors for the per-I/O private slots in the CTL I/O header.
 *
 * PRIV(io) views the CTL_PRIV_BACKEND slot as a struct ctl_ptr_len_flags;
 * its ptr member is where this backend keeps the struct ctl_be_block_io.
 * ARGS(io) views the CTL_PRIV_LBA_LEN slot as a struct ctl_lba_len_flags
 * (the LBA, length and CTL_LLF_* flags of the command).
 */
#define	PRIV(io)							\
	((struct ctl_ptr_len_flags *)					\
	    &(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define	ARGS(io)							\
	((struct ctl_lba_len_flags *)					\
	    &(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])

/* SDT provider under which this file's probes are defined. */
SDT_PROVIDER_DEFINE(cbb);
115
/*
 * Per-LUN state bits kept in ctl_be_block_lun.flags.  Each value is a
 * distinct bit so several can be set at once.
 */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= (1 << 0),
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= (1 << 1),
	CTL_BE_BLOCK_LUN_WAITING	= (1 << 2),
	CTL_BE_BLOCK_LUN_MULTI_THREAD	= (1 << 3)
} ctl_be_block_lun_flags;

/*
 * Kind of backing store behind a LUN (ctl_be_block_lun.dev_type); it
 * selects which member of union ctl_be_block_bedata is meaningful.
 */
typedef enum {
	CTL_BE_BLOCK_NONE	= 0,
	CTL_BE_BLOCK_DEV	= 1,
	CTL_BE_BLOCK_FILE	= 2
} ctl_be_block_type;
128
/* State specific to a LUN backed by a character device. */
struct ctl_be_block_devdata {
	struct cdev	*cdev;		/* target of bios and d_read/d_write */
	struct cdevsw	*csw;		/* switch providing d_strategy etc. */
	int		 dev_ref;	/* NOTE(review): presumably the csw
					 * reference token -- confirm */
};

/* State specific to a LUN backed by a regular file. */
struct ctl_be_block_filedata {
	struct ucred	*cred;		/* passed to VOP_READ / VOP_WRITE */
};

/* Exactly one member is in use, according to the LUN's dev_type. */
union ctl_be_block_bedata {
	struct ctl_be_block_devdata	dev;
	struct ctl_be_block_filedata	file;
};
143
struct ctl_be_block_lun;
struct ctl_be_block_io;

/*
 * Operation hooks stored in struct ctl_be_block_lun.
 *
 * cbb_dispatch_t: carry out the backend operation described by beio on
 *	be_lun (used for the dispatch, flush, unmap and LBA-status hooks).
 * cbb_getattr_t: return the value of the attribute named attrname.
 */
typedef void	 (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
		    struct ctl_be_block_io *beio);
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
		    const char *attrname);
151
152/*
153 * Backend LUN structure.  There is a 1:1 mapping between a block device
154 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
155 */
156struct ctl_be_block_lun {
157	struct ctl_lun_create_params params;
158	struct ctl_block_disk *disk;
159	char lunname[32];
160	char *dev_path;
161	ctl_be_block_type dev_type;
162	struct vnode *vn;
163	union ctl_be_block_bedata backend;
164	cbb_dispatch_t dispatch;
165	cbb_dispatch_t lun_flush;
166	cbb_dispatch_t unmap;
167	cbb_dispatch_t get_lba_status;
168	cbb_getattr_t getattr;
169	uma_zone_t lun_zone;
170	uint64_t size_blocks;
171	uint64_t size_bytes;
172	uint32_t blocksize;
173	int blocksize_shift;
174	uint16_t pblockexp;
175	uint16_t pblockoff;
176	uint16_t ublockexp;
177	uint16_t ublockoff;
178	uint32_t atomicblock;
179	uint32_t opttxferlen;
180	struct ctl_be_block_softc *softc;
181	struct devstat *disk_stats;
182	ctl_be_block_lun_flags flags;
183	STAILQ_ENTRY(ctl_be_block_lun) links;
184	struct ctl_be_lun ctl_be_lun;
185	struct taskqueue *io_taskqueue;
186	struct task io_task;
187	int num_threads;
188	STAILQ_HEAD(, ctl_io_hdr) input_queue;
189	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
190	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
191	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
192	struct mtx_padalign io_lock;
193	struct mtx_padalign queue_lock;
194};
195
196/*
197 * Overall softc structure for the block backend module.
198 */
199struct ctl_be_block_softc {
200	struct mtx			 lock;
201	int				 num_disks;
202	STAILQ_HEAD(, ctl_block_disk)	 disk_list;
203	int				 num_luns;
204	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
205};
206
207static struct ctl_be_block_softc backend_block_softc;
208
209/*
210 * Per-I/O information.
211 */
212struct ctl_be_block_io {
213	union ctl_io			*io;
214	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
215	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
216	int				bio_cmd;
217	int				num_segs;
218	int				num_bios_sent;
219	int				num_bios_done;
220	int				send_complete;
221	int				num_errors;
222	struct bintime			ds_t0;
223	devstat_tag_type		ds_tag_type;
224	devstat_trans_flags		ds_trans_type;
225	uint64_t			io_len;
226	uint64_t			io_offset;
227	struct ctl_be_block_softc	*softc;
228	struct ctl_be_block_lun		*lun;
229	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
230};
231
232static int cbb_num_threads = 14;
233SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
234	    "CAM Target Layer Block Backend");
235SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
236           &cbb_num_threads, 0, "Number of threads per backing file");
237
238static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
239static void ctl_free_beio(struct ctl_be_block_io *beio);
240static void ctl_complete_beio(struct ctl_be_block_io *beio);
241static int ctl_be_block_move_done(union ctl_io *io);
242static void ctl_be_block_biodone(struct bio *bio);
243static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
244				    struct ctl_be_block_io *beio);
245static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
246				       struct ctl_be_block_io *beio);
247static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
248				  struct ctl_be_block_io *beio);
249static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
250					 const char *attrname);
251static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
252				   struct ctl_be_block_io *beio);
253static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
254				   struct ctl_be_block_io *beio);
255static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
256				      struct ctl_be_block_io *beio);
257static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
258					 const char *attrname);
259static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
260				    union ctl_io *io);
261static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
262				    union ctl_io *io);
263static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
264				  union ctl_io *io);
265static void ctl_be_block_worker(void *context, int pending);
266static int ctl_be_block_submit(union ctl_io *io);
267static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
268				   int flag, struct thread *td);
269static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
270				  struct ctl_lun_req *req);
271static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
272				 struct ctl_lun_req *req);
273static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
274static int ctl_be_block_open(struct ctl_be_block_softc *softc,
275			     struct ctl_be_block_lun *be_lun,
276			     struct ctl_lun_req *req);
277static int ctl_be_block_create(struct ctl_be_block_softc *softc,
278			       struct ctl_lun_req *req);
279static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
280			   struct ctl_lun_req *req);
281static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
282				  struct ctl_lun_req *req);
283static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
284				 struct ctl_lun_req *req);
285static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
286			   struct ctl_lun_req *req);
287static void ctl_be_block_lun_shutdown(void *be_lun);
288static void ctl_be_block_lun_config_status(void *be_lun,
289					   ctl_lun_config_status status);
290static int ctl_be_block_config_write(union ctl_io *io);
291static int ctl_be_block_config_read(union ctl_io *io);
292static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
293static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
294int ctl_be_block_init(void);
295
296static struct ctl_backend_driver ctl_be_block_driver =
297{
298	.name = "block",
299	.flags = CTL_BE_FLAG_HAS_CONFIG,
300	.init = ctl_be_block_init,
301	.data_submit = ctl_be_block_submit,
302	.data_move_done = ctl_be_block_move_done,
303	.config_read = ctl_be_block_config_read,
304	.config_write = ctl_be_block_config_write,
305	.ioctl = ctl_be_block_ioctl,
306	.lun_info = ctl_be_block_lun_info,
307	.lun_attr = ctl_be_block_lun_attr
308};
309
310MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
311CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
312
313static uma_zone_t beio_zone;
314
315static struct ctl_be_block_io *
316ctl_alloc_beio(struct ctl_be_block_softc *softc)
317{
318	struct ctl_be_block_io *beio;
319
320	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
321	beio->softc = softc;
322	return (beio);
323}
324
325static void
326ctl_free_beio(struct ctl_be_block_io *beio)
327{
328	int duplicate_free;
329	int i;
330
331	duplicate_free = 0;
332
333	for (i = 0; i < beio->num_segs; i++) {
334		if (beio->sg_segs[i].addr == NULL)
335			duplicate_free++;
336
337		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
338		beio->sg_segs[i].addr = NULL;
339
340		/* For compare we had two equal S/G lists. */
341		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
342			uma_zfree(beio->lun->lun_zone,
343			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
344			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
345		}
346	}
347
348	if (duplicate_free > 0) {
349		printf("%s: %d duplicate frees out of %d segments\n", __func__,
350		       duplicate_free, beio->num_segs);
351	}
352
353	uma_zfree(beio_zone, beio);
354}
355
356static void
357ctl_complete_beio(struct ctl_be_block_io *beio)
358{
359	union ctl_io *io = beio->io;
360
361	if (beio->beio_cont != NULL) {
362		beio->beio_cont(beio);
363	} else {
364		ctl_free_beio(beio);
365		ctl_data_submit_done(io);
366	}
367}
368
369static int
370ctl_be_block_move_done(union ctl_io *io)
371{
372	struct ctl_be_block_io *beio;
373	struct ctl_be_block_lun *be_lun;
374	struct ctl_lba_len_flags *lbalen;
375#ifdef CTL_TIME_IO
376	struct bintime cur_bt;
377#endif
378	int i;
379
380	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
381	be_lun = beio->lun;
382
383	DPRINTF("entered\n");
384
385#ifdef CTL_TIME_IO
386	getbintime(&cur_bt);
387	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
388	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
389	io->io_hdr.num_dmas++;
390#endif
391	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;
392
393	/*
394	 * We set status at this point for read commands, and write
395	 * commands with errors.
396	 */
397	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
398		;
399	} else if ((io->io_hdr.port_status == 0) &&
400	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
401		lbalen = ARGS(beio->io);
402		if (lbalen->flags & CTL_LLF_READ) {
403			ctl_set_success(&io->scsiio);
404		} else if (lbalen->flags & CTL_LLF_COMPARE) {
405			/* We have two data blocks ready for comparison. */
406			for (i = 0; i < beio->num_segs; i++) {
407				if (memcmp(beio->sg_segs[i].addr,
408				    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
409				    beio->sg_segs[i].len) != 0)
410					break;
411			}
412			if (i < beio->num_segs)
413				ctl_set_sense(&io->scsiio,
414				    /*current_error*/ 1,
415				    /*sense_key*/ SSD_KEY_MISCOMPARE,
416				    /*asc*/ 0x1D,
417				    /*ascq*/ 0x00,
418				    SSD_ELEM_NONE);
419			else
420				ctl_set_success(&io->scsiio);
421		}
422	} else if ((io->io_hdr.port_status != 0) &&
423	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
424	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
425		/*
426		 * For hardware error sense keys, the sense key
427		 * specific value is defined to be a retry count,
428		 * but we use it to pass back an internal FETD
429		 * error code.  XXX KDM  Hopefully the FETD is only
430		 * using 16 bits for an error code, since that's
431		 * all the space we have in the sks field.
432		 */
433		ctl_set_internal_failure(&io->scsiio,
434					 /*sks_valid*/ 1,
435					 /*retry_count*/
436					 io->io_hdr.port_status);
437	}
438
439	/*
440	 * If this is a read, or a write with errors, it is done.
441	 */
442	if ((beio->bio_cmd == BIO_READ)
443	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
444	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
445		ctl_complete_beio(beio);
446		return (0);
447	}
448
449	/*
450	 * At this point, we have a write and the DMA completed
451	 * successfully.  We now have to queue it to the task queue to
452	 * execute the backend I/O.  That is because we do blocking
453	 * memory allocations, and in the file backing case, blocking I/O.
454	 * This move done routine is generally called in the SIM's
455	 * interrupt context, and therefore we cannot block.
456	 */
457	mtx_lock(&be_lun->queue_lock);
458	/*
459	 * XXX KDM make sure that links is okay to use at this point.
460	 * Otherwise, we either need to add another field to ctl_io_hdr,
461	 * or deal with resource allocation here.
462	 */
463	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
464	mtx_unlock(&be_lun->queue_lock);
465
466	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
467
468	return (0);
469}
470
471static void
472ctl_be_block_biodone(struct bio *bio)
473{
474	struct ctl_be_block_io *beio;
475	struct ctl_be_block_lun *be_lun;
476	union ctl_io *io;
477	int error;
478
479	beio = bio->bio_caller1;
480	be_lun = beio->lun;
481	io = beio->io;
482
483	DPRINTF("entered\n");
484
485	error = bio->bio_error;
486	mtx_lock(&be_lun->io_lock);
487	if (error != 0)
488		beio->num_errors++;
489
490	beio->num_bios_done++;
491
492	/*
493	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
494	 * during the free might cause it to complain.
495	 */
496	g_destroy_bio(bio);
497
498	/*
499	 * If the send complete bit isn't set, or we aren't the last I/O to
500	 * complete, then we're done.
501	 */
502	if ((beio->send_complete == 0)
503	 || (beio->num_bios_done < beio->num_bios_sent)) {
504		mtx_unlock(&be_lun->io_lock);
505		return;
506	}
507
508	/*
509	 * At this point, we've verified that we are the last I/O to
510	 * complete, so it's safe to drop the lock.
511	 */
512	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
513	    beio->ds_tag_type, beio->ds_trans_type,
514	    /*now*/ NULL, /*then*/&beio->ds_t0);
515	mtx_unlock(&be_lun->io_lock);
516
517	/*
518	 * If there are any errors from the backing device, we fail the
519	 * entire I/O with a medium error.
520	 */
521	if (beio->num_errors > 0) {
522		if (error == EOPNOTSUPP) {
523			ctl_set_invalid_opcode(&io->scsiio);
524		} else if (error == ENOSPC || error == EDQUOT) {
525			ctl_set_space_alloc_fail(&io->scsiio);
526		} else if (beio->bio_cmd == BIO_FLUSH) {
527			/* XXX KDM is there is a better error here? */
528			ctl_set_internal_failure(&io->scsiio,
529						 /*sks_valid*/ 1,
530						 /*retry_count*/ 0xbad2);
531		} else
532			ctl_set_medium_error(&io->scsiio);
533		ctl_complete_beio(beio);
534		return;
535	}
536
537	/*
538	 * If this is a write, a flush, a delete or verify, we're all done.
539	 * If this is a read, we can now send the data to the user.
540	 */
541	if ((beio->bio_cmd == BIO_WRITE)
542	 || (beio->bio_cmd == BIO_FLUSH)
543	 || (beio->bio_cmd == BIO_DELETE)
544	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
545		ctl_set_success(&io->scsiio);
546		ctl_complete_beio(beio);
547	} else {
548		if ((ARGS(io)->flags & CTL_LLF_READ) &&
549		    beio->beio_cont == NULL)
550			ctl_set_success(&io->scsiio);
551#ifdef CTL_TIME_IO
552        	getbintime(&io->io_hdr.dma_start_bt);
553#endif
554		ctl_datamove(io);
555	}
556}
557
558static void
559ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
560			struct ctl_be_block_io *beio)
561{
562	union ctl_io *io = beio->io;
563	struct mount *mountpoint;
564	int error, lock_flags;
565
566	DPRINTF("entered\n");
567
568	binuptime(&beio->ds_t0);
569	mtx_lock(&be_lun->io_lock);
570	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
571	mtx_unlock(&be_lun->io_lock);
572
573	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
574
575	if (MNT_SHARED_WRITES(mountpoint)
576	 || ((mountpoint == NULL)
577	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
578		lock_flags = LK_SHARED;
579	else
580		lock_flags = LK_EXCLUSIVE;
581
582	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
583
584	error = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
585	VOP_UNLOCK(be_lun->vn, 0);
586
587	vn_finished_write(mountpoint);
588
589	mtx_lock(&be_lun->io_lock);
590	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
591	    beio->ds_tag_type, beio->ds_trans_type,
592	    /*now*/ NULL, /*then*/&beio->ds_t0);
593	mtx_unlock(&be_lun->io_lock);
594
595	if (error == 0)
596		ctl_set_success(&io->scsiio);
597	else {
598		/* XXX KDM is there is a better error here? */
599		ctl_set_internal_failure(&io->scsiio,
600					 /*sks_valid*/ 1,
601					 /*retry_count*/ 0xbad1);
602	}
603
604	ctl_complete_beio(beio);
605}
606
607SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t");
608SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t");
609SDT_PROBE_DEFINE1(cbb, kernel, read, file_done,"uint64_t");
610SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t");
611
612static void
613ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
614			   struct ctl_be_block_io *beio)
615{
616	struct ctl_be_block_filedata *file_data;
617	union ctl_io *io;
618	struct uio xuio;
619	struct iovec *xiovec;
620	int flags;
621	int error, i;
622
623	DPRINTF("entered\n");
624
625	file_data = &be_lun->backend.file;
626	io = beio->io;
627	flags = 0;
628	if (ARGS(io)->flags & CTL_LLF_DPO)
629		flags |= IO_DIRECT;
630	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
631		flags |= IO_SYNC;
632
633	bzero(&xuio, sizeof(xuio));
634	if (beio->bio_cmd == BIO_READ) {
635		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
636		xuio.uio_rw = UIO_READ;
637	} else {
638		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
639		xuio.uio_rw = UIO_WRITE;
640	}
641	xuio.uio_offset = beio->io_offset;
642	xuio.uio_resid = beio->io_len;
643	xuio.uio_segflg = UIO_SYSSPACE;
644	xuio.uio_iov = beio->xiovecs;
645	xuio.uio_iovcnt = beio->num_segs;
646	xuio.uio_td = curthread;
647
648	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
649		xiovec->iov_base = beio->sg_segs[i].addr;
650		xiovec->iov_len = beio->sg_segs[i].len;
651	}
652
653	binuptime(&beio->ds_t0);
654	mtx_lock(&be_lun->io_lock);
655	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
656	mtx_unlock(&be_lun->io_lock);
657
658	if (beio->bio_cmd == BIO_READ) {
659		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
660
661		/*
662		 * UFS pays attention to IO_DIRECT for reads.  If the
663		 * DIRECTIO option is configured into the kernel, it calls
664		 * ffs_rawread().  But that only works for single-segment
665		 * uios with user space addresses.  In our case, with a
666		 * kernel uio, it still reads into the buffer cache, but it
667		 * will just try to release the buffer from the cache later
668		 * on in ffs_read().
669		 *
670		 * ZFS does not pay attention to IO_DIRECT for reads.
671		 *
672		 * UFS does not pay attention to IO_SYNC for reads.
673		 *
674		 * ZFS pays attention to IO_SYNC (which translates into the
675		 * Solaris define FRSYNC for zfs_read()) for reads.  It
676		 * attempts to sync the file before reading.
677		 *
678		 * So, to attempt to provide some barrier semantics in the
679		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
680		 */
681		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);
682
683		VOP_UNLOCK(be_lun->vn, 0);
684		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
685	} else {
686		struct mount *mountpoint;
687		int lock_flags;
688
689		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
690
691		if (MNT_SHARED_WRITES(mountpoint)
692		 || ((mountpoint == NULL)
693		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
694			lock_flags = LK_SHARED;
695		else
696			lock_flags = LK_EXCLUSIVE;
697
698		vn_lock(be_lun->vn, lock_flags | LK_RETRY);
699
700		/*
701		 * UFS pays attention to IO_DIRECT for writes.  The write
702		 * is done asynchronously.  (Normally the write would just
703		 * get put into cache.
704		 *
705		 * UFS pays attention to IO_SYNC for writes.  It will
706		 * attempt to write the buffer out synchronously if that
707		 * flag is set.
708		 *
709		 * ZFS does not pay attention to IO_DIRECT for writes.
710		 *
711		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
712		 * for writes.  It will flush the transaction from the
713		 * cache before returning.
714		 *
715		 * So if we've got the BIO_ORDERED flag set, we want
716		 * IO_SYNC in either the UFS or ZFS case.
717		 */
718		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
719		VOP_UNLOCK(be_lun->vn, 0);
720
721		vn_finished_write(mountpoint);
722		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
723        }
724
725	mtx_lock(&be_lun->io_lock);
726	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
727	    beio->ds_tag_type, beio->ds_trans_type,
728	    /*now*/ NULL, /*then*/&beio->ds_t0);
729	mtx_unlock(&be_lun->io_lock);
730
731	/*
732	 * If we got an error, set the sense data to "MEDIUM ERROR" and
733	 * return the I/O to the user.
734	 */
735	if (error != 0) {
736		char path_str[32];
737
738		ctl_scsi_path_string(io, path_str, sizeof(path_str));
739		printf("%s%s command returned errno %d\n", path_str,
740		       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
741		if (error == ENOSPC || error == EDQUOT) {
742			ctl_set_space_alloc_fail(&io->scsiio);
743		} else
744			ctl_set_medium_error(&io->scsiio);
745		ctl_complete_beio(beio);
746		return;
747	}
748
749	/*
750	 * If this is a write or a verify, we're all done.
751	 * If this is a read, we can now send the data to the user.
752	 */
753	if ((beio->bio_cmd == BIO_WRITE) ||
754	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
755		ctl_set_success(&io->scsiio);
756		ctl_complete_beio(beio);
757	} else {
758		if ((ARGS(io)->flags & CTL_LLF_READ) &&
759		    beio->beio_cont == NULL)
760			ctl_set_success(&io->scsiio);
761#ifdef CTL_TIME_IO
762        	getbintime(&io->io_hdr.dma_start_bt);
763#endif
764		ctl_datamove(io);
765	}
766}
767
768static void
769ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
770			struct ctl_be_block_io *beio)
771{
772	union ctl_io *io = beio->io;
773	struct ctl_lba_len_flags *lbalen = ARGS(io);
774	struct scsi_get_lba_status_data *data;
775	off_t roff, off;
776	int error, status;
777
778	DPRINTF("entered\n");
779
780	off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
781	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
782	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
783	    0, curthread->td_ucred, curthread);
784	if (error == 0 && off > roff)
785		status = 0;	/* mapped up to off */
786	else {
787		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
788		    0, curthread->td_ucred, curthread);
789		if (error == 0 && off > roff)
790			status = 1;	/* deallocated up to off */
791		else {
792			status = 0;	/* unknown up to the end */
793			off = be_lun->size_bytes;
794		}
795	}
796	VOP_UNLOCK(be_lun->vn, 0);
797
798	off >>= be_lun->blocksize_shift;
799	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
800	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
801	scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
802	    data->descr[0].length);
803	data->descr[0].status = status;
804
805	ctl_complete_beio(beio);
806}
807
808static uint64_t
809ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
810{
811	struct vattr		vattr;
812	struct statfs		statfs;
813	uint64_t		val;
814	int			error;
815
816	val = UINT64_MAX;
817	if (be_lun->vn == NULL)
818		return (val);
819	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
820	if (strcmp(attrname, "blocksused") == 0) {
821		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
822		if (error == 0)
823			val = vattr.va_bytes >> be_lun->blocksize_shift;
824	}
825	if (strcmp(attrname, "blocksavail") == 0 &&
826	    (be_lun->vn->v_iflag & VI_DOOMED) == 0) {
827		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
828		if (error == 0)
829			val = (statfs.f_bavail * statfs.f_bsize) >>
830			    be_lun->blocksize_shift;
831	}
832	VOP_UNLOCK(be_lun->vn, 0);
833	return (val);
834}
835
836static void
837ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
838			   struct ctl_be_block_io *beio)
839{
840	struct ctl_be_block_devdata *dev_data;
841	union ctl_io *io;
842	struct uio xuio;
843	struct iovec *xiovec;
844	int flags;
845	int error, i;
846
847	DPRINTF("entered\n");
848
849	dev_data = &be_lun->backend.dev;
850	io = beio->io;
851	flags = 0;
852	if (ARGS(io)->flags & CTL_LLF_DPO)
853		flags |= IO_DIRECT;
854	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
855		flags |= IO_SYNC;
856
857	bzero(&xuio, sizeof(xuio));
858	if (beio->bio_cmd == BIO_READ) {
859		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
860		xuio.uio_rw = UIO_READ;
861	} else {
862		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
863		xuio.uio_rw = UIO_WRITE;
864	}
865	xuio.uio_offset = beio->io_offset;
866	xuio.uio_resid = beio->io_len;
867	xuio.uio_segflg = UIO_SYSSPACE;
868	xuio.uio_iov = beio->xiovecs;
869	xuio.uio_iovcnt = beio->num_segs;
870	xuio.uio_td = curthread;
871
872	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
873		xiovec->iov_base = beio->sg_segs[i].addr;
874		xiovec->iov_len = beio->sg_segs[i].len;
875	}
876
877	binuptime(&beio->ds_t0);
878	mtx_lock(&be_lun->io_lock);
879	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
880	mtx_unlock(&be_lun->io_lock);
881
882	if (beio->bio_cmd == BIO_READ) {
883		error = (*dev_data->csw->d_read)(dev_data->cdev, &xuio, flags);
884		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
885	} else {
886		error = (*dev_data->csw->d_write)(dev_data->cdev, &xuio, flags);
887		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
888	}
889
890	mtx_lock(&be_lun->io_lock);
891	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
892	    beio->ds_tag_type, beio->ds_trans_type,
893	    /*now*/ NULL, /*then*/&beio->ds_t0);
894	mtx_unlock(&be_lun->io_lock);
895
896	/*
897	 * If we got an error, set the sense data to "MEDIUM ERROR" and
898	 * return the I/O to the user.
899	 */
900	if (error != 0) {
901		if (error == ENOSPC || error == EDQUOT) {
902			ctl_set_space_alloc_fail(&io->scsiio);
903		} else
904			ctl_set_medium_error(&io->scsiio);
905		ctl_complete_beio(beio);
906		return;
907	}
908
909	/*
910	 * If this is a write or a verify, we're all done.
911	 * If this is a read, we can now send the data to the user.
912	 */
913	if ((beio->bio_cmd == BIO_WRITE) ||
914	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
915		ctl_set_success(&io->scsiio);
916		ctl_complete_beio(beio);
917	} else {
918		if ((ARGS(io)->flags & CTL_LLF_READ) &&
919		    beio->beio_cont == NULL)
920			ctl_set_success(&io->scsiio);
921#ifdef CTL_TIME_IO
922        	getbintime(&io->io_hdr.dma_start_bt);
923#endif
924		ctl_datamove(io);
925	}
926}
927
928static void
929ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
930			struct ctl_be_block_io *beio)
931{
932	struct ctl_be_block_devdata *dev_data = &be_lun->backend.dev;
933	union ctl_io *io = beio->io;
934	struct ctl_lba_len_flags *lbalen = ARGS(io);
935	struct scsi_get_lba_status_data *data;
936	off_t roff, off;
937	int error, status;
938
939	DPRINTF("entered\n");
940
941	off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
942	error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKHOLE,
943	    (caddr_t)&off, FREAD, curthread);
944	if (error == 0 && off > roff)
945		status = 0;	/* mapped up to off */
946	else {
947		error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKDATA,
948		    (caddr_t)&off, FREAD, curthread);
949		if (error == 0 && off > roff)
950			status = 1;	/* deallocated up to off */
951		else {
952			status = 0;	/* unknown up to the end */
953			off = be_lun->size_bytes;
954		}
955	}
956
957	off >>= be_lun->blocksize_shift;
958	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
959	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
960	scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
961	    data->descr[0].length);
962	data->descr[0].status = status;
963
964	ctl_complete_beio(beio);
965}
966
967static void
968ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
969		       struct ctl_be_block_io *beio)
970{
971	struct bio *bio;
972	union ctl_io *io;
973	struct ctl_be_block_devdata *dev_data;
974
975	dev_data = &be_lun->backend.dev;
976	io = beio->io;
977
978	DPRINTF("entered\n");
979
980	/* This can't fail, it's a blocking allocation. */
981	bio = g_alloc_bio();
982
983	bio->bio_cmd	    = BIO_FLUSH;
984	bio->bio_flags	   |= BIO_ORDERED;
985	bio->bio_dev	    = dev_data->cdev;
986	bio->bio_offset	    = 0;
987	bio->bio_data	    = 0;
988	bio->bio_done	    = ctl_be_block_biodone;
989	bio->bio_caller1    = beio;
990	bio->bio_pblkno	    = 0;
991
992	/*
993	 * We don't need to acquire the LUN lock here, because we are only
994	 * sending one bio, and so there is no other context to synchronize
995	 * with.
996	 */
997	beio->num_bios_sent = 1;
998	beio->send_complete = 1;
999
1000	binuptime(&beio->ds_t0);
1001	mtx_lock(&be_lun->io_lock);
1002	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1003	mtx_unlock(&be_lun->io_lock);
1004
1005	(*dev_data->csw->d_strategy)(bio);
1006}
1007
1008static void
1009ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
1010		       struct ctl_be_block_io *beio,
1011		       uint64_t off, uint64_t len, int last)
1012{
1013	struct bio *bio;
1014	struct ctl_be_block_devdata *dev_data;
1015	uint64_t maxlen;
1016
1017	dev_data = &be_lun->backend.dev;
1018	maxlen = LONG_MAX - (LONG_MAX % be_lun->blocksize);
1019	while (len > 0) {
1020		bio = g_alloc_bio();
1021		bio->bio_cmd	    = BIO_DELETE;
1022		bio->bio_dev	    = dev_data->cdev;
1023		bio->bio_offset	    = off;
1024		bio->bio_length	    = MIN(len, maxlen);
1025		bio->bio_data	    = 0;
1026		bio->bio_done	    = ctl_be_block_biodone;
1027		bio->bio_caller1    = beio;
1028		bio->bio_pblkno     = off / be_lun->blocksize;
1029
1030		off += bio->bio_length;
1031		len -= bio->bio_length;
1032
1033		mtx_lock(&be_lun->io_lock);
1034		beio->num_bios_sent++;
1035		if (last && len == 0)
1036			beio->send_complete = 1;
1037		mtx_unlock(&be_lun->io_lock);
1038
1039		(*dev_data->csw->d_strategy)(bio);
1040	}
1041}
1042
1043static void
1044ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
1045		       struct ctl_be_block_io *beio)
1046{
1047	union ctl_io *io;
1048	struct ctl_be_block_devdata *dev_data;
1049	struct ctl_ptr_len_flags *ptrlen;
1050	struct scsi_unmap_desc *buf, *end;
1051	uint64_t len;
1052
1053	dev_data = &be_lun->backend.dev;
1054	io = beio->io;
1055
1056	DPRINTF("entered\n");
1057
1058	binuptime(&beio->ds_t0);
1059	mtx_lock(&be_lun->io_lock);
1060	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1061	mtx_unlock(&be_lun->io_lock);
1062
1063	if (beio->io_offset == -1) {
1064		beio->io_len = 0;
1065		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1066		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
1067		end = buf + ptrlen->len / sizeof(*buf);
1068		for (; buf < end; buf++) {
1069			len = (uint64_t)scsi_4btoul(buf->length) *
1070			    be_lun->blocksize;
1071			beio->io_len += len;
1072			ctl_be_block_unmap_dev_range(be_lun, beio,
1073			    scsi_8btou64(buf->lba) * be_lun->blocksize, len,
1074			    (end - buf < 2) ? TRUE : FALSE);
1075		}
1076	} else
1077		ctl_be_block_unmap_dev_range(be_lun, beio,
1078		    beio->io_offset, beio->io_len, TRUE);
1079}
1080
1081static void
1082ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
1083			  struct ctl_be_block_io *beio)
1084{
1085	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
1086	int i;
1087	struct bio *bio;
1088	struct ctl_be_block_devdata *dev_data;
1089	off_t cur_offset;
1090	int max_iosize;
1091
1092	DPRINTF("entered\n");
1093
1094	dev_data = &be_lun->backend.dev;
1095
1096	/*
1097	 * We have to limit our I/O size to the maximum supported by the
1098	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
1099	 * set it properly, use DFLTPHYS.
1100	 */
1101	max_iosize = dev_data->cdev->si_iosize_max;
1102	if (max_iosize < PAGE_SIZE)
1103		max_iosize = DFLTPHYS;
1104
1105	cur_offset = beio->io_offset;
1106	for (i = 0; i < beio->num_segs; i++) {
1107		size_t cur_size;
1108		uint8_t *cur_ptr;
1109
1110		cur_size = beio->sg_segs[i].len;
1111		cur_ptr = beio->sg_segs[i].addr;
1112
1113		while (cur_size > 0) {
1114			/* This can't fail, it's a blocking allocation. */
1115			bio = g_alloc_bio();
1116
1117			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
1118
1119			bio->bio_cmd = beio->bio_cmd;
1120			bio->bio_dev = dev_data->cdev;
1121			bio->bio_caller1 = beio;
1122			bio->bio_length = min(cur_size, max_iosize);
1123			bio->bio_offset = cur_offset;
1124			bio->bio_data = cur_ptr;
1125			bio->bio_done = ctl_be_block_biodone;
1126			bio->bio_pblkno = cur_offset / be_lun->blocksize;
1127
1128			cur_offset += bio->bio_length;
1129			cur_ptr += bio->bio_length;
1130			cur_size -= bio->bio_length;
1131
1132			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
1133			beio->num_bios_sent++;
1134		}
1135	}
1136	binuptime(&beio->ds_t0);
1137	mtx_lock(&be_lun->io_lock);
1138	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1139	beio->send_complete = 1;
1140	mtx_unlock(&be_lun->io_lock);
1141
1142	/*
1143	 * Fire off all allocated requests!
1144	 */
1145	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
1146		TAILQ_REMOVE(&queue, bio, bio_queue);
1147		(*dev_data->csw->d_strategy)(bio);
1148	}
1149}
1150
1151static uint64_t
1152ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1153{
1154	struct ctl_be_block_devdata	*dev_data = &be_lun->backend.dev;
1155	struct diocgattr_arg	arg;
1156	int			error;
1157
1158	if (dev_data->csw == NULL || dev_data->csw->d_ioctl == NULL)
1159		return (UINT64_MAX);
1160	strlcpy(arg.name, attrname, sizeof(arg.name));
1161	arg.len = sizeof(arg.value.off);
1162	error = dev_data->csw->d_ioctl(dev_data->cdev,
1163	    DIOCGATTR, (caddr_t)&arg, FREAD, curthread);
1164	if (error != 0)
1165		return (UINT64_MAX);
1166	return (arg.value.off);
1167}
1168
1169static void
1170ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1171{
1172	union ctl_io *io;
1173
1174	io = beio->io;
1175	ctl_free_beio(beio);
1176	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1177	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1178	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1179		ctl_config_write_done(io);
1180		return;
1181	}
1182
1183	ctl_be_block_config_write(io);
1184}
1185
1186static void
1187ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
1188			    union ctl_io *io)
1189{
1190	struct ctl_be_block_io *beio;
1191	struct ctl_be_block_softc *softc;
1192	struct ctl_lba_len_flags *lbalen;
1193	uint64_t len_left, lba;
1194	uint32_t pb, pbo, adj;
1195	int i, seglen;
1196	uint8_t *buf, *end;
1197
1198	DPRINTF("entered\n");
1199
1200	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1201	softc = be_lun->softc;
1202	lbalen = ARGS(beio->io);
1203
1204	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
1205	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
1206		ctl_free_beio(beio);
1207		ctl_set_invalid_field(&io->scsiio,
1208				      /*sks_valid*/ 1,
1209				      /*command*/ 1,
1210				      /*field*/ 1,
1211				      /*bit_valid*/ 0,
1212				      /*bit*/ 0);
1213		ctl_config_write_done(io);
1214		return;
1215	}
1216
1217	switch (io->scsiio.tag_type) {
1218	case CTL_TAG_ORDERED:
1219		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1220		break;
1221	case CTL_TAG_HEAD_OF_QUEUE:
1222		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1223		break;
1224	case CTL_TAG_UNTAGGED:
1225	case CTL_TAG_SIMPLE:
1226	case CTL_TAG_ACA:
1227	default:
1228		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1229		break;
1230	}
1231
1232	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
1233		beio->io_offset = lbalen->lba * be_lun->blocksize;
1234		beio->io_len = (uint64_t)lbalen->len * be_lun->blocksize;
1235		beio->bio_cmd = BIO_DELETE;
1236		beio->ds_trans_type = DEVSTAT_FREE;
1237
1238		be_lun->unmap(be_lun, beio);
1239		return;
1240	}
1241
1242	beio->bio_cmd = BIO_WRITE;
1243	beio->ds_trans_type = DEVSTAT_WRITE;
1244
1245	DPRINTF("WRITE SAME at LBA %jx len %u\n",
1246	       (uintmax_t)lbalen->lba, lbalen->len);
1247
1248	pb = be_lun->blocksize << be_lun->pblockexp;
1249	if (be_lun->pblockoff > 0)
1250		pbo = pb - be_lun->blocksize * be_lun->pblockoff;
1251	else
1252		pbo = 0;
1253	len_left = (uint64_t)lbalen->len * be_lun->blocksize;
1254	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
1255
1256		/*
1257		 * Setup the S/G entry for this chunk.
1258		 */
1259		seglen = MIN(CTLBLK_MAX_SEG, len_left);
1260		if (pb > be_lun->blocksize) {
1261			adj = ((lbalen->lba + lba) * be_lun->blocksize +
1262			    seglen - pbo) % pb;
1263			if (seglen > adj)
1264				seglen -= adj;
1265			else
1266				seglen -= seglen % be_lun->blocksize;
1267		} else
1268			seglen -= seglen % be_lun->blocksize;
1269		beio->sg_segs[i].len = seglen;
1270		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
1271
1272		DPRINTF("segment %d addr %p len %zd\n", i,
1273			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1274
1275		beio->num_segs++;
1276		len_left -= seglen;
1277
1278		buf = beio->sg_segs[i].addr;
1279		end = buf + seglen;
1280		for (; buf < end; buf += be_lun->blocksize) {
1281			memcpy(buf, io->scsiio.kern_data_ptr, be_lun->blocksize);
1282			if (lbalen->flags & SWS_LBDATA)
1283				scsi_ulto4b(lbalen->lba + lba, buf);
1284			lba++;
1285		}
1286	}
1287
1288	beio->io_offset = lbalen->lba * be_lun->blocksize;
1289	beio->io_len = lba * be_lun->blocksize;
1290
1291	/* We can not do all in one run. Correct and schedule rerun. */
1292	if (len_left > 0) {
1293		lbalen->lba += lba;
1294		lbalen->len -= lba;
1295		beio->beio_cont = ctl_be_block_cw_done_ws;
1296	}
1297
1298	be_lun->dispatch(be_lun, beio);
1299}
1300
1301static void
1302ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
1303			    union ctl_io *io)
1304{
1305	struct ctl_be_block_io *beio;
1306	struct ctl_be_block_softc *softc;
1307	struct ctl_ptr_len_flags *ptrlen;
1308
1309	DPRINTF("entered\n");
1310
1311	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1312	softc = be_lun->softc;
1313	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1314
1315	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
1316		ctl_free_beio(beio);
1317		ctl_set_invalid_field(&io->scsiio,
1318				      /*sks_valid*/ 0,
1319				      /*command*/ 1,
1320				      /*field*/ 0,
1321				      /*bit_valid*/ 0,
1322				      /*bit*/ 0);
1323		ctl_config_write_done(io);
1324		return;
1325	}
1326
1327	switch (io->scsiio.tag_type) {
1328	case CTL_TAG_ORDERED:
1329		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1330		break;
1331	case CTL_TAG_HEAD_OF_QUEUE:
1332		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1333		break;
1334	case CTL_TAG_UNTAGGED:
1335	case CTL_TAG_SIMPLE:
1336	case CTL_TAG_ACA:
1337	default:
1338		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1339		break;
1340	}
1341
1342	beio->io_len = 0;
1343	beio->io_offset = -1;
1344
1345	beio->bio_cmd = BIO_DELETE;
1346	beio->ds_trans_type = DEVSTAT_FREE;
1347
1348	DPRINTF("UNMAP\n");
1349
1350	be_lun->unmap(be_lun, beio);
1351}
1352
1353static void
1354ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1355{
1356	union ctl_io *io;
1357
1358	io = beio->io;
1359	ctl_free_beio(beio);
1360	ctl_config_read_done(io);
1361}
1362
1363static void
1364ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
1365			 union ctl_io *io)
1366{
1367	struct ctl_be_block_io *beio;
1368	struct ctl_be_block_softc *softc;
1369
1370	DPRINTF("entered\n");
1371
1372	softc = be_lun->softc;
1373	beio = ctl_alloc_beio(softc);
1374	beio->io = io;
1375	beio->lun = be_lun;
1376	beio->beio_cont = ctl_be_block_cr_done;
1377	PRIV(io)->ptr = (void *)beio;
1378
1379	switch (io->scsiio.cdb[0]) {
1380	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
1381		beio->bio_cmd = -1;
1382		beio->ds_trans_type = DEVSTAT_NO_DATA;
1383		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1384		beio->io_len = 0;
1385		if (be_lun->get_lba_status)
1386			be_lun->get_lba_status(be_lun, beio);
1387		else
1388			ctl_be_block_cr_done(beio);
1389		break;
1390	default:
1391		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1392		break;
1393	}
1394}
1395
1396static void
1397ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1398{
1399	union ctl_io *io;
1400
1401	io = beio->io;
1402	ctl_free_beio(beio);
1403	ctl_config_write_done(io);
1404}
1405
1406static void
1407ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
1408			 union ctl_io *io)
1409{
1410	struct ctl_be_block_io *beio;
1411	struct ctl_be_block_softc *softc;
1412
1413	DPRINTF("entered\n");
1414
1415	softc = be_lun->softc;
1416	beio = ctl_alloc_beio(softc);
1417	beio->io = io;
1418	beio->lun = be_lun;
1419	beio->beio_cont = ctl_be_block_cw_done;
1420	PRIV(io)->ptr = (void *)beio;
1421
1422	switch (io->scsiio.cdb[0]) {
1423	case SYNCHRONIZE_CACHE:
1424	case SYNCHRONIZE_CACHE_16:
1425		beio->bio_cmd = BIO_FLUSH;
1426		beio->ds_trans_type = DEVSTAT_NO_DATA;
1427		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1428		beio->io_len = 0;
1429		be_lun->lun_flush(be_lun, beio);
1430		break;
1431	case WRITE_SAME_10:
1432	case WRITE_SAME_16:
1433		ctl_be_block_cw_dispatch_ws(be_lun, io);
1434		break;
1435	case UNMAP:
1436		ctl_be_block_cw_dispatch_unmap(be_lun, io);
1437		break;
1438	default:
1439		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1440		break;
1441	}
1442}
1443
1444SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t");
1445SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t");
1446SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t");
1447SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t");
1448
1449static void
1450ctl_be_block_next(struct ctl_be_block_io *beio)
1451{
1452	struct ctl_be_block_lun *be_lun;
1453	union ctl_io *io;
1454
1455	io = beio->io;
1456	be_lun = beio->lun;
1457	ctl_free_beio(beio);
1458	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1459	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1460	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1461		ctl_data_submit_done(io);
1462		return;
1463	}
1464
1465	io->io_hdr.status &= ~CTL_STATUS_MASK;
1466	io->io_hdr.status |= CTL_STATUS_NONE;
1467
1468	mtx_lock(&be_lun->queue_lock);
1469	/*
1470	 * XXX KDM make sure that links is okay to use at this point.
1471	 * Otherwise, we either need to add another field to ctl_io_hdr,
1472	 * or deal with resource allocation here.
1473	 */
1474	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1475	mtx_unlock(&be_lun->queue_lock);
1476
1477	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1478}
1479
1480static void
1481ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
1482			   union ctl_io *io)
1483{
1484	struct ctl_be_block_io *beio;
1485	struct ctl_be_block_softc *softc;
1486	struct ctl_lba_len_flags *lbalen;
1487	struct ctl_ptr_len_flags *bptrlen;
1488	uint64_t len_left, lbas;
1489	int i;
1490
1491	softc = be_lun->softc;
1492
1493	DPRINTF("entered\n");
1494
1495	lbalen = ARGS(io);
1496	if (lbalen->flags & CTL_LLF_WRITE) {
1497		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
1498	} else {
1499		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
1500	}
1501
1502	beio = ctl_alloc_beio(softc);
1503	beio->io = io;
1504	beio->lun = be_lun;
1505	bptrlen = PRIV(io);
1506	bptrlen->ptr = (void *)beio;
1507
1508	switch (io->scsiio.tag_type) {
1509	case CTL_TAG_ORDERED:
1510		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1511		break;
1512	case CTL_TAG_HEAD_OF_QUEUE:
1513		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1514		break;
1515	case CTL_TAG_UNTAGGED:
1516	case CTL_TAG_SIMPLE:
1517	case CTL_TAG_ACA:
1518	default:
1519		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1520		break;
1521	}
1522
1523	if (lbalen->flags & CTL_LLF_WRITE) {
1524		beio->bio_cmd = BIO_WRITE;
1525		beio->ds_trans_type = DEVSTAT_WRITE;
1526	} else {
1527		beio->bio_cmd = BIO_READ;
1528		beio->ds_trans_type = DEVSTAT_READ;
1529	}
1530
1531	DPRINTF("%s at LBA %jx len %u @%ju\n",
1532	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
1533	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
1534	if (lbalen->flags & CTL_LLF_COMPARE)
1535		lbas = CTLBLK_HALF_IO_SIZE;
1536	else
1537		lbas = CTLBLK_MAX_IO_SIZE;
1538	lbas = MIN(lbalen->len - bptrlen->len, lbas / be_lun->blocksize);
1539	beio->io_offset = (lbalen->lba + bptrlen->len) * be_lun->blocksize;
1540	beio->io_len = lbas * be_lun->blocksize;
1541	bptrlen->len += lbas;
1542
1543	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
1544		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
1545		    i, CTLBLK_MAX_SEGS));
1546
1547		/*
1548		 * Setup the S/G entry for this chunk.
1549		 */
1550		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
1551		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
1552
1553		DPRINTF("segment %d addr %p len %zd\n", i,
1554			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1555
1556		/* Set up second segment for compare operation. */
1557		if (lbalen->flags & CTL_LLF_COMPARE) {
1558			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
1559			    beio->sg_segs[i].len;
1560			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
1561			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
1562		}
1563
1564		beio->num_segs++;
1565		len_left -= beio->sg_segs[i].len;
1566	}
1567	if (bptrlen->len < lbalen->len)
1568		beio->beio_cont = ctl_be_block_next;
1569	io->scsiio.be_move_done = ctl_be_block_move_done;
1570	/* For compare we have separate S/G lists for read and datamove. */
1571	if (lbalen->flags & CTL_LLF_COMPARE)
1572		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
1573	else
1574		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
1575	io->scsiio.kern_data_len = beio->io_len;
1576	io->scsiio.kern_data_resid = 0;
1577	io->scsiio.kern_sg_entries = beio->num_segs;
1578	io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
1579
1580	/*
1581	 * For the read case, we need to read the data into our buffers and
1582	 * then we can send it back to the user.  For the write case, we
1583	 * need to get the data from the user first.
1584	 */
1585	if (beio->bio_cmd == BIO_READ) {
1586		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
1587		be_lun->dispatch(be_lun, beio);
1588	} else {
1589		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
1590#ifdef CTL_TIME_IO
1591        	getbintime(&io->io_hdr.dma_start_bt);
1592#endif
1593		ctl_datamove(io);
1594	}
1595}
1596
1597static void
1598ctl_be_block_worker(void *context, int pending)
1599{
1600	struct ctl_be_block_lun *be_lun;
1601	struct ctl_be_block_softc *softc;
1602	union ctl_io *io;
1603
1604	be_lun = (struct ctl_be_block_lun *)context;
1605	softc = be_lun->softc;
1606
1607	DPRINTF("entered\n");
1608
1609	mtx_lock(&be_lun->queue_lock);
1610	for (;;) {
1611		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
1612		if (io != NULL) {
1613			struct ctl_be_block_io *beio;
1614
1615			DPRINTF("datamove queue\n");
1616
1617			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
1618				      ctl_io_hdr, links);
1619
1620			mtx_unlock(&be_lun->queue_lock);
1621
1622			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1623
1624			be_lun->dispatch(be_lun, beio);
1625
1626			mtx_lock(&be_lun->queue_lock);
1627			continue;
1628		}
1629		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
1630		if (io != NULL) {
1631			DPRINTF("config write queue\n");
1632			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
1633				      ctl_io_hdr, links);
1634			mtx_unlock(&be_lun->queue_lock);
1635			ctl_be_block_cw_dispatch(be_lun, io);
1636			mtx_lock(&be_lun->queue_lock);
1637			continue;
1638		}
1639		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
1640		if (io != NULL) {
1641			DPRINTF("config read queue\n");
1642			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
1643				      ctl_io_hdr, links);
1644			mtx_unlock(&be_lun->queue_lock);
1645			ctl_be_block_cr_dispatch(be_lun, io);
1646			mtx_lock(&be_lun->queue_lock);
1647			continue;
1648		}
1649		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
1650		if (io != NULL) {
1651			DPRINTF("input queue\n");
1652
1653			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
1654				      ctl_io_hdr, links);
1655			mtx_unlock(&be_lun->queue_lock);
1656
1657			/*
1658			 * We must drop the lock, since this routine and
1659			 * its children may sleep.
1660			 */
1661			ctl_be_block_dispatch(be_lun, io);
1662
1663			mtx_lock(&be_lun->queue_lock);
1664			continue;
1665		}
1666
1667		/*
1668		 * If we get here, there is no work left in the queues, so
1669		 * just break out and let the task queue go to sleep.
1670		 */
1671		break;
1672	}
1673	mtx_unlock(&be_lun->queue_lock);
1674}
1675
1676/*
1677 * Entry point from CTL to the backend for I/O.  We queue everything to a
1678 * work thread, so this just puts the I/O on a queue and wakes up the
1679 * thread.
1680 */
1681static int
1682ctl_be_block_submit(union ctl_io *io)
1683{
1684	struct ctl_be_block_lun *be_lun;
1685	struct ctl_be_lun *ctl_be_lun;
1686
1687	DPRINTF("entered\n");
1688
1689	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
1690		CTL_PRIV_BACKEND_LUN].ptr;
1691	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
1692
1693	/*
1694	 * Make sure we only get SCSI I/O.
1695	 */
1696	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
1697		"%#x) encountered", io->io_hdr.io_type));
1698
1699	PRIV(io)->len = 0;
1700
1701	mtx_lock(&be_lun->queue_lock);
1702	/*
1703	 * XXX KDM make sure that links is okay to use at this point.
1704	 * Otherwise, we either need to add another field to ctl_io_hdr,
1705	 * or deal with resource allocation here.
1706	 */
1707	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1708	mtx_unlock(&be_lun->queue_lock);
1709	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1710
1711	return (CTL_RETVAL_COMPLETE);
1712}
1713
1714static int
1715ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
1716			int flag, struct thread *td)
1717{
1718	struct ctl_be_block_softc *softc;
1719	int error;
1720
1721	softc = &backend_block_softc;
1722
1723	error = 0;
1724
1725	switch (cmd) {
1726	case CTL_LUN_REQ: {
1727		struct ctl_lun_req *lun_req;
1728
1729		lun_req = (struct ctl_lun_req *)addr;
1730
1731		switch (lun_req->reqtype) {
1732		case CTL_LUNREQ_CREATE:
1733			error = ctl_be_block_create(softc, lun_req);
1734			break;
1735		case CTL_LUNREQ_RM:
1736			error = ctl_be_block_rm(softc, lun_req);
1737			break;
1738		case CTL_LUNREQ_MODIFY:
1739			error = ctl_be_block_modify(softc, lun_req);
1740			break;
1741		default:
1742			lun_req->status = CTL_LUN_ERROR;
1743			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
1744				 "invalid LUN request type %d",
1745				 lun_req->reqtype);
1746			break;
1747		}
1748		break;
1749	}
1750	default:
1751		error = ENOTTY;
1752		break;
1753	}
1754
1755	return (error);
1756}
1757
1758static int
1759ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1760{
1761	struct ctl_be_block_filedata *file_data;
1762	struct ctl_lun_create_params *params;
1763	char			     *value;
1764	struct vattr		      vattr;
1765	off_t			      ps, pss, po, pos, us, uss, uo, uos;
1766	int			      error;
1767
1768	error = 0;
1769	file_data = &be_lun->backend.file;
1770	params = &be_lun->params;
1771
1772	be_lun->dev_type = CTL_BE_BLOCK_FILE;
1773	be_lun->dispatch = ctl_be_block_dispatch_file;
1774	be_lun->lun_flush = ctl_be_block_flush_file;
1775	be_lun->get_lba_status = ctl_be_block_gls_file;
1776	be_lun->getattr = ctl_be_block_getattr_file;
1777
1778	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
1779	if (error != 0) {
1780		snprintf(req->error_str, sizeof(req->error_str),
1781			 "error calling VOP_GETATTR() for file %s",
1782			 be_lun->dev_path);
1783		return (error);
1784	}
1785
1786	/*
1787	 * Verify that we have the ability to upgrade to exclusive
1788	 * access on this file so we can trap errors at open instead
1789	 * of reporting them during first access.
1790	 */
1791	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
1792		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
1793		if (be_lun->vn->v_iflag & VI_DOOMED) {
1794			error = EBADF;
1795			snprintf(req->error_str, sizeof(req->error_str),
1796				 "error locking file %s", be_lun->dev_path);
1797			return (error);
1798		}
1799	}
1800
1801
1802	file_data->cred = crhold(curthread->td_ucred);
1803	if (params->lun_size_bytes != 0)
1804		be_lun->size_bytes = params->lun_size_bytes;
1805	else
1806		be_lun->size_bytes = vattr.va_size;
1807	/*
1808	 * We set the multi thread flag for file operations because all
1809	 * filesystems (in theory) are capable of allowing multiple readers
1810	 * of a file at once.  So we want to get the maximum possible
1811	 * concurrency.
1812	 */
1813	be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;
1814
1815	/*
1816	 * For files we can use any logical block size.  Prefer 512 bytes
1817	 * for compatibility reasons.  If file's vattr.va_blocksize
1818	 * (preferred I/O block size) is bigger and multiple to chosen
1819	 * logical block size -- report it as physical block size.
1820	 */
1821	if (params->blocksize_bytes != 0)
1822		be_lun->blocksize = params->blocksize_bytes;
1823	else
1824		be_lun->blocksize = 512;
1825
1826	us = ps = vattr.va_blocksize;
1827	uo = po = 0;
1828
1829	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblocksize");
1830	if (value != NULL)
1831		ctl_expand_number(value, &ps);
1832	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblockoffset");
1833	if (value != NULL)
1834		ctl_expand_number(value, &po);
1835	pss = ps / be_lun->blocksize;
1836	pos = po / be_lun->blocksize;
1837	if ((pss > 0) && (pss * be_lun->blocksize == ps) && (pss >= pos) &&
1838	    ((pss & (pss - 1)) == 0) && (pos * be_lun->blocksize == po)) {
1839		be_lun->pblockexp = fls(pss) - 1;
1840		be_lun->pblockoff = (pss - pos) % pss;
1841	}
1842
1843	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublocksize");
1844	if (value != NULL)
1845		ctl_expand_number(value, &us);
1846	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublockoffset");
1847	if (value != NULL)
1848		ctl_expand_number(value, &uo);
1849	uss = us / be_lun->blocksize;
1850	uos = uo / be_lun->blocksize;
1851	if ((uss > 0) && (uss * be_lun->blocksize == us) && (uss >= uos) &&
1852	    ((uss & (uss - 1)) == 0) && (uos * be_lun->blocksize == uo)) {
1853		be_lun->ublockexp = fls(uss) - 1;
1854		be_lun->ublockoff = (uss - uos) % uss;
1855	}
1856
1857	/*
1858	 * Sanity check.  The media size has to be at least one
1859	 * sector long.
1860	 */
1861	if (be_lun->size_bytes < be_lun->blocksize) {
1862		error = EINVAL;
1863		snprintf(req->error_str, sizeof(req->error_str),
1864			 "file %s size %ju < block size %u", be_lun->dev_path,
1865			 (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
1866	}
1867
1868	be_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / be_lun->blocksize;
1869	return (error);
1870}
1871
1872static int
1873ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1874{
1875	struct ctl_lun_create_params *params;
1876	struct vattr		      vattr;
1877	struct cdev		     *dev;
1878	struct cdevsw		     *devsw;
1879	char			     *value;
1880	int			      error, atomic, maxio, unmap;
1881	off_t			      ps, pss, po, pos, us, uss, uo, uos;
1882
1883	params = &be_lun->params;
1884
1885	be_lun->dev_type = CTL_BE_BLOCK_DEV;
1886	be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
1887	be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
1888					     &be_lun->backend.dev.dev_ref);
1889	if (be_lun->backend.dev.csw == NULL)
1890		panic("Unable to retrieve device switch");
1891	if (strcmp(be_lun->backend.dev.csw->d_name, "zvol") == 0) {
1892		be_lun->dispatch = ctl_be_block_dispatch_zvol;
1893		be_lun->get_lba_status = ctl_be_block_gls_zvol;
1894		atomic = maxio = CTLBLK_MAX_IO_SIZE;
1895	} else {
1896		be_lun->dispatch = ctl_be_block_dispatch_dev;
1897		atomic = 0;
1898		maxio = be_lun->backend.dev.cdev->si_iosize_max;
1899		if (maxio <= 0)
1900			maxio = DFLTPHYS;
1901		if (maxio > CTLBLK_MAX_IO_SIZE)
1902			maxio = CTLBLK_MAX_IO_SIZE;
1903	}
1904	be_lun->lun_flush = ctl_be_block_flush_dev;
1905	be_lun->getattr = ctl_be_block_getattr_dev;
1906
1907	error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
1908	if (error) {
1909		snprintf(req->error_str, sizeof(req->error_str),
1910			 "error getting vnode attributes for device %s",
1911			 be_lun->dev_path);
1912		return (error);
1913	}
1914
1915	dev = be_lun->vn->v_rdev;
1916	devsw = dev->si_devsw;
1917	if (!devsw->d_ioctl) {
1918		snprintf(req->error_str, sizeof(req->error_str),
1919			 "no d_ioctl for device %s!",
1920			 be_lun->dev_path);
1921		return (ENODEV);
1922	}
1923
1924	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
1925			       (caddr_t)&be_lun->blocksize, FREAD,
1926			       curthread);
1927	if (error) {
1928		snprintf(req->error_str, sizeof(req->error_str),
1929			 "error %d returned for DIOCGSECTORSIZE ioctl "
1930			 "on %s!", error, be_lun->dev_path);
1931		return (error);
1932	}
1933
1934	/*
1935	 * If the user has asked for a blocksize that is greater than the
1936	 * backing device's blocksize, we can do it only if the blocksize
1937	 * the user is asking for is an even multiple of the underlying
1938	 * device's blocksize.
1939	 */
1940	if ((params->blocksize_bytes != 0)
1941	 && (params->blocksize_bytes > be_lun->blocksize)) {
1942		uint32_t bs_multiple, tmp_blocksize;
1943
1944		bs_multiple = params->blocksize_bytes / be_lun->blocksize;
1945
1946		tmp_blocksize = bs_multiple * be_lun->blocksize;
1947
1948		if (tmp_blocksize == params->blocksize_bytes) {
1949			be_lun->blocksize = params->blocksize_bytes;
1950		} else {
1951			snprintf(req->error_str, sizeof(req->error_str),
1952				 "requested blocksize %u is not an even "
1953				 "multiple of backing device blocksize %u",
1954				 params->blocksize_bytes,
1955				 be_lun->blocksize);
1956			return (EINVAL);
1957
1958		}
1959	} else if ((params->blocksize_bytes != 0)
1960		&& (params->blocksize_bytes != be_lun->blocksize)) {
1961		snprintf(req->error_str, sizeof(req->error_str),
1962			 "requested blocksize %u < backing device "
1963			 "blocksize %u", params->blocksize_bytes,
1964			 be_lun->blocksize);
1965		return (EINVAL);
1966	}
1967
1968	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
1969			       (caddr_t)&be_lun->size_bytes, FREAD,
1970			       curthread);
1971	if (error) {
1972		snprintf(req->error_str, sizeof(req->error_str),
1973			 "error %d returned for DIOCGMEDIASIZE "
1974			 " ioctl on %s!", error,
1975			 be_lun->dev_path);
1976		return (error);
1977	}
1978
1979	if (params->lun_size_bytes != 0) {
1980		if (params->lun_size_bytes > be_lun->size_bytes) {
1981			snprintf(req->error_str, sizeof(req->error_str),
1982				 "requested LUN size %ju > backing device "
1983				 "size %ju",
1984				 (uintmax_t)params->lun_size_bytes,
1985				 (uintmax_t)be_lun->size_bytes);
1986			return (EINVAL);
1987		}
1988
1989		be_lun->size_bytes = params->lun_size_bytes;
1990	}
1991
1992	error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE,
1993			       (caddr_t)&ps, FREAD, curthread);
1994	if (error)
1995		ps = po = 0;
1996	else {
1997		error = devsw->d_ioctl(dev, DIOCGSTRIPEOFFSET,
1998				       (caddr_t)&po, FREAD, curthread);
1999		if (error)
2000			po = 0;
2001	}
2002	us = ps;
2003	uo = po;
2004
2005	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblocksize");
2006	if (value != NULL)
2007		ctl_expand_number(value, &ps);
2008	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblockoffset");
2009	if (value != NULL)
2010		ctl_expand_number(value, &po);
2011	pss = ps / be_lun->blocksize;
2012	pos = po / be_lun->blocksize;
2013	if ((pss > 0) && (pss * be_lun->blocksize == ps) && (pss >= pos) &&
2014	    ((pss & (pss - 1)) == 0) && (pos * be_lun->blocksize == po)) {
2015		be_lun->pblockexp = fls(pss) - 1;
2016		be_lun->pblockoff = (pss - pos) % pss;
2017	}
2018
2019	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublocksize");
2020	if (value != NULL)
2021		ctl_expand_number(value, &us);
2022	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublockoffset");
2023	if (value != NULL)
2024		ctl_expand_number(value, &uo);
2025	uss = us / be_lun->blocksize;
2026	uos = uo / be_lun->blocksize;
2027	if ((uss > 0) && (uss * be_lun->blocksize == us) && (uss >= uos) &&
2028	    ((uss & (uss - 1)) == 0) && (uos * be_lun->blocksize == uo)) {
2029		be_lun->ublockexp = fls(uss) - 1;
2030		be_lun->ublockoff = (uss - uos) % uss;
2031	}
2032
2033	be_lun->atomicblock = atomic / be_lun->blocksize;
2034	be_lun->opttxferlen = maxio / be_lun->blocksize;
2035
2036	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
2037		unmap = 1;
2038	} else {
2039		struct diocgattr_arg	arg;
2040
2041		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
2042		arg.len = sizeof(arg.value.i);
2043		error = devsw->d_ioctl(dev, DIOCGATTR,
2044		    (caddr_t)&arg, FREAD, curthread);
2045		unmap = (error == 0) ? arg.value.i : 0;
2046	}
2047	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "unmap");
2048	if (value != NULL)
2049		unmap = (strcmp(value, "on") == 0);
2050	if (unmap)
2051		be_lun->unmap = ctl_be_block_unmap_dev;
2052
2053	return (0);
2054}
2055
2056static int
2057ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2058{
2059	DROP_GIANT();
2060	if (be_lun->vn) {
2061		int flags = FREAD | FWRITE;
2062
2063		switch (be_lun->dev_type) {
2064		case CTL_BE_BLOCK_DEV:
2065			if (be_lun->backend.dev.csw) {
2066				dev_relthread(be_lun->backend.dev.cdev,
2067					      be_lun->backend.dev.dev_ref);
2068				be_lun->backend.dev.csw  = NULL;
2069				be_lun->backend.dev.cdev = NULL;
2070			}
2071			break;
2072		case CTL_BE_BLOCK_FILE:
2073			break;
2074		case CTL_BE_BLOCK_NONE:
2075			break;
2076		default:
2077			panic("Unexpected backend type.");
2078			break;
2079		}
2080
2081		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2082		be_lun->vn = NULL;
2083
2084		switch (be_lun->dev_type) {
2085		case CTL_BE_BLOCK_DEV:
2086			break;
2087		case CTL_BE_BLOCK_FILE:
2088			if (be_lun->backend.file.cred != NULL) {
2089				crfree(be_lun->backend.file.cred);
2090				be_lun->backend.file.cred = NULL;
2091			}
2092			break;
2093		case CTL_BE_BLOCK_NONE:
2094			break;
2095		default:
2096			panic("Unexpected backend type.");
2097			break;
2098		}
2099		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2100	}
2101	PICKUP_GIANT();
2102
2103	return (0);
2104}
2105
2106static int
2107ctl_be_block_open(struct ctl_be_block_softc *softc,
2108		       struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
2109{
2110	struct nameidata nd;
2111	int		 flags;
2112	int		 error;
2113
2114	/*
2115	 * XXX KDM allow a read-only option?
2116	 */
2117	flags = FREAD | FWRITE;
2118	error = 0;
2119
2120	if (rootvnode == NULL) {
2121		snprintf(req->error_str, sizeof(req->error_str),
2122			 "Root filesystem is not mounted");
2123		return (1);
2124	}
2125
2126	pwd_ensure_dirs();
2127
2128 again:
2129	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
2130	error = vn_open(&nd, &flags, 0, NULL);
2131	if (error) {
2132		/*
2133		 * This is the only reasonable guess we can make as far as
2134		 * path if the user doesn't give us a fully qualified path.
2135		 * If they want to specify a file, they need to specify the
2136		 * full path.
2137		 */
2138		if (be_lun->dev_path[0] != '/') {
2139			char *dev_path = "/dev/";
2140			char *dev_name;
2141
2142			/* Try adding device path at beginning of name */
2143			dev_name = malloc(strlen(be_lun->dev_path)
2144					+ strlen(dev_path) + 1,
2145					  M_CTLBLK, M_WAITOK);
2146			if (dev_name) {
2147				sprintf(dev_name, "%s%s", dev_path,
2148					be_lun->dev_path);
2149				free(be_lun->dev_path, M_CTLBLK);
2150				be_lun->dev_path = dev_name;
2151				goto again;
2152			}
2153		}
2154		snprintf(req->error_str, sizeof(req->error_str),
2155		    "error opening %s: %d", be_lun->dev_path, error);
2156		return (error);
2157	}
2158
2159	NDFREE(&nd, NDF_ONLY_PNBUF);
2160
2161	be_lun->vn = nd.ni_vp;
2162
2163	/* We only support disks and files. */
2164	if (vn_isdisk(be_lun->vn, &error)) {
2165		error = ctl_be_block_open_dev(be_lun, req);
2166	} else if (be_lun->vn->v_type == VREG) {
2167		error = ctl_be_block_open_file(be_lun, req);
2168	} else {
2169		error = EINVAL;
2170		snprintf(req->error_str, sizeof(req->error_str),
2171			 "%s is not a disk or plain file", be_lun->dev_path);
2172	}
2173	VOP_UNLOCK(be_lun->vn, 0);
2174
2175	if (error != 0) {
2176		ctl_be_block_close(be_lun);
2177		return (error);
2178	}
2179
2180	be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
2181	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
2182
2183	return (0);
2184}
2185
2186static int
2187ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2188{
2189	struct ctl_be_block_lun *be_lun;
2190	struct ctl_lun_create_params *params;
2191	char num_thread_str[16];
2192	char tmpstr[32];
2193	char *value;
2194	int retval, num_threads;
2195	int tmp_num_threads;
2196
2197	params = &req->reqdata.create;
2198	retval = 0;
2199	req->status = CTL_LUN_OK;
2200
2201	num_threads = cbb_num_threads;
2202
2203	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2204
2205	be_lun->params = req->reqdata.create;
2206	be_lun->softc = softc;
2207	STAILQ_INIT(&be_lun->input_queue);
2208	STAILQ_INIT(&be_lun->config_read_queue);
2209	STAILQ_INIT(&be_lun->config_write_queue);
2210	STAILQ_INIT(&be_lun->datamove_queue);
2211	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
2212	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
2213	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
2214	ctl_init_opts(&be_lun->ctl_be_lun.options,
2215	    req->num_be_args, req->kern_be_args);
2216
2217	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
2218	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2219
2220	if (be_lun->lun_zone == NULL) {
2221		snprintf(req->error_str, sizeof(req->error_str),
2222			 "error allocating UMA zone");
2223		goto bailout_error;
2224	}
2225
2226	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2227		be_lun->ctl_be_lun.lun_type = params->device_type;
2228	else
2229		be_lun->ctl_be_lun.lun_type = T_DIRECT;
2230
2231	if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
2232		value = ctl_get_opt(&be_lun->ctl_be_lun.options, "file");
2233		if (value == NULL) {
2234			snprintf(req->error_str, sizeof(req->error_str),
2235				 "no file argument specified");
2236			goto bailout_error;
2237		}
2238		be_lun->dev_path = strdup(value, M_CTLBLK);
2239		be_lun->blocksize = 512;
2240		be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
2241
2242		retval = ctl_be_block_open(softc, be_lun, req);
2243		if (retval != 0) {
2244			retval = 0;
2245			req->status = CTL_LUN_WARNING;
2246		}
2247	} else {
2248		/*
2249		 * For processor devices, we don't have any size.
2250		 */
2251		be_lun->blocksize = 0;
2252		be_lun->pblockexp = 0;
2253		be_lun->pblockoff = 0;
2254		be_lun->ublockexp = 0;
2255		be_lun->ublockoff = 0;
2256		be_lun->size_blocks = 0;
2257		be_lun->size_bytes = 0;
2258		be_lun->ctl_be_lun.maxlba = 0;
2259
2260		/*
2261		 * Default to just 1 thread for processor devices.
2262		 */
2263		num_threads = 1;
2264	}
2265
2266	/*
2267	 * XXX This searching loop might be refactored to be combined with
2268	 * the loop above,
2269	 */
2270	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "num_threads");
2271	if (value != NULL) {
2272		tmp_num_threads = strtol(value, NULL, 0);
2273
2274		/*
2275		 * We don't let the user specify less than one
2276		 * thread, but hope he's clueful enough not to
2277		 * specify 1000 threads.
2278		 */
2279		if (tmp_num_threads < 1) {
2280			snprintf(req->error_str, sizeof(req->error_str),
2281				 "invalid number of threads %s",
2282				 num_thread_str);
2283			goto bailout_error;
2284		}
2285		num_threads = tmp_num_threads;
2286	}
2287
2288	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
2289	be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
2290	if (be_lun->vn == NULL)
2291		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_OFFLINE;
2292	if (be_lun->unmap != NULL)
2293		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_UNMAP;
2294	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
2295		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_SERSEQ_READ;
2296	be_lun->ctl_be_lun.be_lun = be_lun;
2297	be_lun->ctl_be_lun.maxlba = (be_lun->size_blocks == 0) ?
2298	    0 : (be_lun->size_blocks - 1);
2299	be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
2300	be_lun->ctl_be_lun.pblockexp = be_lun->pblockexp;
2301	be_lun->ctl_be_lun.pblockoff = be_lun->pblockoff;
2302	be_lun->ctl_be_lun.ublockexp = be_lun->ublockexp;
2303	be_lun->ctl_be_lun.ublockoff = be_lun->ublockoff;
2304	be_lun->ctl_be_lun.atomicblock = be_lun->atomicblock;
2305	be_lun->ctl_be_lun.opttxferlen = be_lun->opttxferlen;
2306	/* Tell the user the blocksize we ended up using */
2307	params->lun_size_bytes = be_lun->size_bytes;
2308	params->blocksize_bytes = be_lun->blocksize;
2309	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2310		be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
2311		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
2312	} else
2313		be_lun->ctl_be_lun.req_lun_id = 0;
2314
2315	be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
2316	be_lun->ctl_be_lun.lun_config_status =
2317		ctl_be_block_lun_config_status;
2318	be_lun->ctl_be_lun.be = &ctl_be_block_driver;
2319
2320	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2321		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
2322			 softc->num_luns);
2323		strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
2324			MIN(sizeof(be_lun->ctl_be_lun.serial_num),
2325			sizeof(tmpstr)));
2326
2327		/* Tell the user what we used for a serial number */
2328		strncpy((char *)params->serial_num, tmpstr,
2329			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2330	} else {
2331		strncpy((char *)be_lun->ctl_be_lun.serial_num,
2332			params->serial_num,
2333			MIN(sizeof(be_lun->ctl_be_lun.serial_num),
2334			sizeof(params->serial_num)));
2335	}
2336	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2337		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
2338		strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
2339			MIN(sizeof(be_lun->ctl_be_lun.device_id),
2340			sizeof(tmpstr)));
2341
2342		/* Tell the user what we used for a device ID */
2343		strncpy((char *)params->device_id, tmpstr,
2344			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2345	} else {
2346		strncpy((char *)be_lun->ctl_be_lun.device_id,
2347			params->device_id,
2348			MIN(sizeof(be_lun->ctl_be_lun.device_id),
2349			    sizeof(params->device_id)));
2350	}
2351
2352	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2353
2354	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
2355	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2356
2357	if (be_lun->io_taskqueue == NULL) {
2358		snprintf(req->error_str, sizeof(req->error_str),
2359			 "unable to create taskqueue");
2360		goto bailout_error;
2361	}
2362
2363	/*
2364	 * Note that we start the same number of threads by default for
2365	 * both the file case and the block device case.  For the file
2366	 * case, we need multiple threads to allow concurrency, because the
2367	 * vnode interface is designed to be a blocking interface.  For the
2368	 * block device case, ZFS zvols at least will block the caller's
2369	 * context in many instances, and so we need multiple threads to
2370	 * overcome that problem.  Other block devices don't need as many
2371	 * threads, but they shouldn't cause too many problems.
2372	 *
2373	 * If the user wants to just have a single thread for a block
2374	 * device, he can specify that when the LUN is created, or change
2375	 * the tunable/sysctl to alter the default number of threads.
2376	 */
2377	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
2378					 /*num threads*/num_threads,
2379					 /*priority*/PWAIT,
2380					 /*thread name*/
2381					 "%s taskq", be_lun->lunname);
2382
2383	if (retval != 0)
2384		goto bailout_error;
2385
2386	be_lun->num_threads = num_threads;
2387
2388	mtx_lock(&softc->lock);
2389	softc->num_luns++;
2390	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
2391
2392	mtx_unlock(&softc->lock);
2393
2394	retval = ctl_add_lun(&be_lun->ctl_be_lun);
2395	if (retval != 0) {
2396		mtx_lock(&softc->lock);
2397		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2398			      links);
2399		softc->num_luns--;
2400		mtx_unlock(&softc->lock);
2401		snprintf(req->error_str, sizeof(req->error_str),
2402			 "ctl_add_lun() returned error %d, see dmesg for "
2403			 "details", retval);
2404		retval = 0;
2405		goto bailout_error;
2406	}
2407
2408	mtx_lock(&softc->lock);
2409
2410	/*
2411	 * Tell the config_status routine that we're waiting so it won't
2412	 * clean up the LUN in the event of an error.
2413	 */
2414	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2415
2416	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2417		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2418		if (retval == EINTR)
2419			break;
2420	}
2421	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2422
2423	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
2424		snprintf(req->error_str, sizeof(req->error_str),
2425			 "LUN configuration error, see dmesg for details");
2426		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2427			      links);
2428		softc->num_luns--;
2429		mtx_unlock(&softc->lock);
2430		goto bailout_error;
2431	} else {
2432		params->req_lun_id = be_lun->ctl_be_lun.lun_id;
2433	}
2434
2435	mtx_unlock(&softc->lock);
2436
2437	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
2438					       be_lun->blocksize,
2439					       DEVSTAT_ALL_SUPPORTED,
2440					       be_lun->ctl_be_lun.lun_type
2441					       | DEVSTAT_TYPE_IF_OTHER,
2442					       DEVSTAT_PRIORITY_OTHER);
2443
2444	return (retval);
2445
2446bailout_error:
2447	req->status = CTL_LUN_ERROR;
2448
2449	if (be_lun->io_taskqueue != NULL)
2450		taskqueue_free(be_lun->io_taskqueue);
2451	ctl_be_block_close(be_lun);
2452	if (be_lun->dev_path != NULL)
2453		free(be_lun->dev_path, M_CTLBLK);
2454	if (be_lun->lun_zone != NULL)
2455		uma_zdestroy(be_lun->lun_zone);
2456	ctl_free_opts(&be_lun->ctl_be_lun.options);
2457	mtx_destroy(&be_lun->queue_lock);
2458	mtx_destroy(&be_lun->io_lock);
2459	free(be_lun, M_CTLBLK);
2460
2461	return (retval);
2462}
2463
2464static int
2465ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2466{
2467	struct ctl_lun_rm_params *params;
2468	struct ctl_be_block_lun *be_lun;
2469	int retval;
2470
2471	params = &req->reqdata.rm;
2472
2473	mtx_lock(&softc->lock);
2474
2475	be_lun = NULL;
2476
2477	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2478		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
2479			break;
2480	}
2481	mtx_unlock(&softc->lock);
2482
2483	if (be_lun == NULL) {
2484		snprintf(req->error_str, sizeof(req->error_str),
2485			 "LUN %u is not managed by the block backend",
2486			 params->lun_id);
2487		goto bailout_error;
2488	}
2489
2490	retval = ctl_disable_lun(&be_lun->ctl_be_lun);
2491
2492	if (retval != 0) {
2493		snprintf(req->error_str, sizeof(req->error_str),
2494			 "error %d returned from ctl_disable_lun() for "
2495			 "LUN %d", retval, params->lun_id);
2496		goto bailout_error;
2497
2498	}
2499
2500	retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
2501	if (retval != 0) {
2502		snprintf(req->error_str, sizeof(req->error_str),
2503			 "error %d returned from ctl_invalidate_lun() for "
2504			 "LUN %d", retval, params->lun_id);
2505		goto bailout_error;
2506	}
2507
2508	mtx_lock(&softc->lock);
2509
2510	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2511
2512	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2513                retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2514                if (retval == EINTR)
2515                        break;
2516        }
2517
2518	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2519
2520	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2521		snprintf(req->error_str, sizeof(req->error_str),
2522			 "interrupted waiting for LUN to be freed");
2523		mtx_unlock(&softc->lock);
2524		goto bailout_error;
2525	}
2526
2527	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
2528
2529	softc->num_luns--;
2530	mtx_unlock(&softc->lock);
2531
2532	taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);
2533
2534	taskqueue_free(be_lun->io_taskqueue);
2535
2536	ctl_be_block_close(be_lun);
2537
2538	if (be_lun->disk_stats != NULL)
2539		devstat_remove_entry(be_lun->disk_stats);
2540
2541	uma_zdestroy(be_lun->lun_zone);
2542
2543	ctl_free_opts(&be_lun->ctl_be_lun.options);
2544	free(be_lun->dev_path, M_CTLBLK);
2545	mtx_destroy(&be_lun->queue_lock);
2546	mtx_destroy(&be_lun->io_lock);
2547	free(be_lun, M_CTLBLK);
2548
2549	req->status = CTL_LUN_OK;
2550
2551	return (0);
2552
2553bailout_error:
2554
2555	req->status = CTL_LUN_ERROR;
2556
2557	return (0);
2558}
2559
2560static int
2561ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
2562			 struct ctl_lun_req *req)
2563{
2564	struct vattr vattr;
2565	int error;
2566	struct ctl_lun_create_params *params = &be_lun->params;
2567
2568	if (params->lun_size_bytes != 0) {
2569		be_lun->size_bytes = params->lun_size_bytes;
2570	} else  {
2571		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
2572		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
2573		VOP_UNLOCK(be_lun->vn, 0);
2574		if (error != 0) {
2575			snprintf(req->error_str, sizeof(req->error_str),
2576				 "error calling VOP_GETATTR() for file %s",
2577				 be_lun->dev_path);
2578			return (error);
2579		}
2580
2581		be_lun->size_bytes = vattr.va_size;
2582	}
2583
2584	return (0);
2585}
2586
2587static int
2588ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
2589			struct ctl_lun_req *req)
2590{
2591	struct ctl_be_block_devdata *dev_data;
2592	int error;
2593	struct ctl_lun_create_params *params = &be_lun->params;
2594	uint64_t size_bytes;
2595
2596	dev_data = &be_lun->backend.dev;
2597	if (!dev_data->csw->d_ioctl) {
2598		snprintf(req->error_str, sizeof(req->error_str),
2599			 "no d_ioctl for device %s!", be_lun->dev_path);
2600		return (ENODEV);
2601	}
2602
2603	error = dev_data->csw->d_ioctl(dev_data->cdev, DIOCGMEDIASIZE,
2604			       (caddr_t)&size_bytes, FREAD,
2605			       curthread);
2606	if (error) {
2607		snprintf(req->error_str, sizeof(req->error_str),
2608			 "error %d returned for DIOCGMEDIASIZE ioctl "
2609			 "on %s!", error, be_lun->dev_path);
2610		return (error);
2611	}
2612
2613	if (params->lun_size_bytes != 0) {
2614		if (params->lun_size_bytes > size_bytes) {
2615			snprintf(req->error_str, sizeof(req->error_str),
2616				 "requested LUN size %ju > backing device "
2617				 "size %ju",
2618				 (uintmax_t)params->lun_size_bytes,
2619				 (uintmax_t)size_bytes);
2620			return (EINVAL);
2621		}
2622
2623		be_lun->size_bytes = params->lun_size_bytes;
2624	} else {
2625		be_lun->size_bytes = size_bytes;
2626	}
2627
2628	return (0);
2629}
2630
2631static int
2632ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2633{
2634	struct ctl_lun_modify_params *params;
2635	struct ctl_be_block_lun *be_lun;
2636	uint64_t oldsize;
2637	int error;
2638
2639	params = &req->reqdata.modify;
2640
2641	mtx_lock(&softc->lock);
2642	be_lun = NULL;
2643	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2644		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
2645			break;
2646	}
2647	mtx_unlock(&softc->lock);
2648
2649	if (be_lun == NULL) {
2650		snprintf(req->error_str, sizeof(req->error_str),
2651			 "LUN %u is not managed by the block backend",
2652			 params->lun_id);
2653		goto bailout_error;
2654	}
2655
2656	be_lun->params.lun_size_bytes = params->lun_size_bytes;
2657
2658	oldsize = be_lun->size_bytes;
2659	if (be_lun->vn == NULL)
2660		error = ctl_be_block_open(softc, be_lun, req);
2661	else if (vn_isdisk(be_lun->vn, &error))
2662		error = ctl_be_block_modify_dev(be_lun, req);
2663	else if (be_lun->vn->v_type == VREG)
2664		error = ctl_be_block_modify_file(be_lun, req);
2665	else
2666		error = EINVAL;
2667
2668	if (error == 0 && be_lun->size_bytes != oldsize) {
2669		be_lun->size_blocks = be_lun->size_bytes >>
2670		    be_lun->blocksize_shift;
2671
2672		/*
2673		 * The maximum LBA is the size - 1.
2674		 *
2675		 * XXX: Note that this field is being updated without locking,
2676		 * 	which might cause problems on 32-bit architectures.
2677		 */
2678		if (be_lun->unmap != NULL)
2679			be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_UNMAP;
2680		be_lun->ctl_be_lun.maxlba = (be_lun->size_blocks == 0) ?
2681		    0 : (be_lun->size_blocks - 1);
2682		be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
2683		be_lun->ctl_be_lun.pblockexp = be_lun->pblockexp;
2684		be_lun->ctl_be_lun.pblockoff = be_lun->pblockoff;
2685		be_lun->ctl_be_lun.ublockexp = be_lun->ublockexp;
2686		be_lun->ctl_be_lun.ublockoff = be_lun->ublockoff;
2687		be_lun->ctl_be_lun.atomicblock = be_lun->atomicblock;
2688		be_lun->ctl_be_lun.opttxferlen = be_lun->opttxferlen;
2689		ctl_lun_capacity_changed(&be_lun->ctl_be_lun);
2690		if (oldsize == 0 && be_lun->size_blocks != 0)
2691			ctl_lun_online(&be_lun->ctl_be_lun);
2692	}
2693
2694	/* Tell the user the exact size we ended up using */
2695	params->lun_size_bytes = be_lun->size_bytes;
2696
2697	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
2698
2699	return (0);
2700
2701bailout_error:
2702	req->status = CTL_LUN_ERROR;
2703
2704	return (0);
2705}
2706
2707static void
2708ctl_be_block_lun_shutdown(void *be_lun)
2709{
2710	struct ctl_be_block_lun *lun;
2711	struct ctl_be_block_softc *softc;
2712
2713	lun = (struct ctl_be_block_lun *)be_lun;
2714
2715	softc = lun->softc;
2716
2717	mtx_lock(&softc->lock);
2718	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2719	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2720		wakeup(lun);
2721	mtx_unlock(&softc->lock);
2722
2723}
2724
2725static void
2726ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2727{
2728	struct ctl_be_block_lun *lun;
2729	struct ctl_be_block_softc *softc;
2730
2731	lun = (struct ctl_be_block_lun *)be_lun;
2732	softc = lun->softc;
2733
2734	if (status == CTL_LUN_CONFIG_OK) {
2735		mtx_lock(&softc->lock);
2736		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2737		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2738			wakeup(lun);
2739		mtx_unlock(&softc->lock);
2740
2741		/*
2742		 * We successfully added the LUN, attempt to enable it.
2743		 */
2744		if (ctl_enable_lun(&lun->ctl_be_lun) != 0) {
2745			printf("%s: ctl_enable_lun() failed!\n", __func__);
2746			if (ctl_invalidate_lun(&lun->ctl_be_lun) != 0) {
2747				printf("%s: ctl_invalidate_lun() failed!\n",
2748				       __func__);
2749			}
2750		}
2751
2752		return;
2753	}
2754
2755
2756	mtx_lock(&softc->lock);
2757	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2758	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2759	wakeup(lun);
2760	mtx_unlock(&softc->lock);
2761}
2762
2763
2764static int
2765ctl_be_block_config_write(union ctl_io *io)
2766{
2767	struct ctl_be_block_lun *be_lun;
2768	struct ctl_be_lun *ctl_be_lun;
2769	int retval;
2770
2771	retval = 0;
2772
2773	DPRINTF("entered\n");
2774
2775	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2776		CTL_PRIV_BACKEND_LUN].ptr;
2777	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
2778
2779	switch (io->scsiio.cdb[0]) {
2780	case SYNCHRONIZE_CACHE:
2781	case SYNCHRONIZE_CACHE_16:
2782	case WRITE_SAME_10:
2783	case WRITE_SAME_16:
2784	case UNMAP:
2785		/*
2786		 * The upper level CTL code will filter out any CDBs with
2787		 * the immediate bit set and return the proper error.
2788		 *
2789		 * We don't really need to worry about what LBA range the
2790		 * user asked to be synced out.  When they issue a sync
2791		 * cache command, we'll sync out the whole thing.
2792		 */
2793		mtx_lock(&be_lun->queue_lock);
2794		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
2795				   links);
2796		mtx_unlock(&be_lun->queue_lock);
2797		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
2798		break;
2799	case START_STOP_UNIT: {
2800		struct scsi_start_stop_unit *cdb;
2801
2802		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
2803
2804		if (cdb->how & SSS_START)
2805			retval = ctl_start_lun(ctl_be_lun);
2806		else {
2807			retval = ctl_stop_lun(ctl_be_lun);
2808			/*
2809			 * XXX KDM Copan-specific offline behavior.
2810			 * Figure out a reasonable way to port this?
2811			 */
2812#ifdef NEEDTOPORT
2813			if ((retval == 0)
2814			 && (cdb->byte2 & SSS_ONOFFLINE))
2815				retval = ctl_lun_offline(ctl_be_lun);
2816#endif
2817		}
2818
2819		/*
2820		 * In general, the above routines should not fail.  They
2821		 * just set state for the LUN.  So we've got something
2822		 * pretty wrong here if we can't start or stop the LUN.
2823		 */
2824		if (retval != 0) {
2825			ctl_set_internal_failure(&io->scsiio,
2826						 /*sks_valid*/ 1,
2827						 /*retry_count*/ 0xf051);
2828			retval = CTL_RETVAL_COMPLETE;
2829		} else {
2830			ctl_set_success(&io->scsiio);
2831		}
2832		ctl_config_write_done(io);
2833		break;
2834	}
2835	default:
2836		ctl_set_invalid_opcode(&io->scsiio);
2837		ctl_config_write_done(io);
2838		retval = CTL_RETVAL_COMPLETE;
2839		break;
2840	}
2841
2842	return (retval);
2843}
2844
2845static int
2846ctl_be_block_config_read(union ctl_io *io)
2847{
2848	struct ctl_be_block_lun *be_lun;
2849	struct ctl_be_lun *ctl_be_lun;
2850	int retval = 0;
2851
2852	DPRINTF("entered\n");
2853
2854	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2855		CTL_PRIV_BACKEND_LUN].ptr;
2856	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
2857
2858	switch (io->scsiio.cdb[0]) {
2859	case SERVICE_ACTION_IN:
2860		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
2861			mtx_lock(&be_lun->queue_lock);
2862			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
2863			    &io->io_hdr, links);
2864			mtx_unlock(&be_lun->queue_lock);
2865			taskqueue_enqueue(be_lun->io_taskqueue,
2866			    &be_lun->io_task);
2867			retval = CTL_RETVAL_QUEUED;
2868			break;
2869		}
2870		ctl_set_invalid_field(&io->scsiio,
2871				      /*sks_valid*/ 1,
2872				      /*command*/ 1,
2873				      /*field*/ 1,
2874				      /*bit_valid*/ 1,
2875				      /*bit*/ 4);
2876		ctl_config_read_done(io);
2877		retval = CTL_RETVAL_COMPLETE;
2878		break;
2879	default:
2880		ctl_set_invalid_opcode(&io->scsiio);
2881		ctl_config_read_done(io);
2882		retval = CTL_RETVAL_COMPLETE;
2883		break;
2884	}
2885
2886	return (retval);
2887}
2888
2889static int
2890ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2891{
2892	struct ctl_be_block_lun *lun;
2893	int retval;
2894
2895	lun = (struct ctl_be_block_lun *)be_lun;
2896	retval = 0;
2897
2898	retval = sbuf_printf(sb, "\t<num_threads>");
2899
2900	if (retval != 0)
2901		goto bailout;
2902
2903	retval = sbuf_printf(sb, "%d", lun->num_threads);
2904
2905	if (retval != 0)
2906		goto bailout;
2907
2908	retval = sbuf_printf(sb, "</num_threads>\n");
2909
2910bailout:
2911
2912	return (retval);
2913}
2914
2915static uint64_t
2916ctl_be_block_lun_attr(void *be_lun, const char *attrname)
2917{
2918	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun;
2919
2920	if (lun->getattr == NULL)
2921		return (UINT64_MAX);
2922	return (lun->getattr(lun, attrname));
2923}
2924
2925int
2926ctl_be_block_init(void)
2927{
2928	struct ctl_be_block_softc *softc;
2929	int retval;
2930
2931	softc = &backend_block_softc;
2932	retval = 0;
2933
2934	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2935	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2936	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2937	STAILQ_INIT(&softc->disk_list);
2938	STAILQ_INIT(&softc->lun_list);
2939
2940	return (retval);
2941}
2942