ctl_backend_block.c revision 288794
1/*-
2 * Copyright (c) 2003 Silicon Graphics International Corp.
3 * Copyright (c) 2009-2011 Spectra Logic Corporation
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Edward Tomasz Napierala
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions, and the following disclaimer,
15 *    without modification.
16 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
17 *    substantially similar to the "NO WARRANTY" disclaimer below
18 *    ("Disclaimer") and any redistribution must be conditioned upon
19 *    including a substantially similar Disclaimer requirement for further
20 *    binary redistribution.
21 *
22 * NO WARRANTY
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
32 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGES.
34 *
35 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
36 */
37/*
38 * CAM Target Layer driver backend for block devices.
39 *
40 * Author: Ken Merry <ken@FreeBSD.org>
41 */
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: stable/10/sys/cam/ctl/ctl_backend_block.c 288794 2015-10-05 10:57:50Z mav $");
44
45#include <opt_kdtrace.h>
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/kernel.h>
50#include <sys/types.h>
51#include <sys/kthread.h>
52#include <sys/bio.h>
53#include <sys/fcntl.h>
54#include <sys/limits.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/condvar.h>
58#include <sys/malloc.h>
59#include <sys/conf.h>
60#include <sys/ioccom.h>
61#include <sys/queue.h>
62#include <sys/sbuf.h>
63#include <sys/endian.h>
64#include <sys/uio.h>
65#include <sys/buf.h>
66#include <sys/taskqueue.h>
67#include <sys/vnode.h>
68#include <sys/namei.h>
69#include <sys/mount.h>
70#include <sys/disk.h>
71#include <sys/fcntl.h>
72#include <sys/filedesc.h>
73#include <sys/filio.h>
74#include <sys/proc.h>
75#include <sys/pcpu.h>
76#include <sys/module.h>
77#include <sys/sdt.h>
78#include <sys/devicestat.h>
79#include <sys/sysctl.h>
80
81#include <geom/geom.h>
82
83#include <cam/cam.h>
84#include <cam/scsi/scsi_all.h>
85#include <cam/scsi/scsi_da.h>
86#include <cam/ctl/ctl_io.h>
87#include <cam/ctl/ctl.h>
88#include <cam/ctl/ctl_backend.h>
89#include <cam/ctl/ctl_ioctl.h>
90#include <cam/ctl/ctl_ha.h>
91#include <cam/ctl/ctl_scsi_all.h>
92#include <cam/ctl/ctl_private.h>
93#include <cam/ctl/ctl_error.h>
94
95/*
96 * The idea here is that we'll allocate enough S/G space to hold a 1MB
97 * I/O.  If we get an I/O larger than that, we'll split it.
98 */
/*
 * S/G sizing: a full I/O is split into two halves so that COMPARE commands
 * can hold both the read-back data and the initiator data in one beio.
 */
#define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
#define	CTLBLK_MAX_IO_SIZE	(CTLBLK_HALF_IO_SIZE * 2)
#define	CTLBLK_MAX_SEG		MAXPHYS
#define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
#define	CTLBLK_MAX_SEGS		(CTLBLK_HALF_SEGS * 2)

/* Debug printf, compiled away unless CTLBLK_DEBUG is defined. */
#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

/* Accessors for the per-io private areas CTL reserves for the backend. */
#define PRIV(io)	\
    ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define ARGS(io)	\
    ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])

/* DTrace static provider for this backend's probes. */
SDT_PROVIDER_DEFINE(cbb);
118
/* Per-LUN state flags (bitmask). */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
} ctl_be_block_lun_flags;
124
/* Kind of backing store behind a LUN: none, a device node, or a file. */
typedef enum {
	CTL_BE_BLOCK_NONE,
	CTL_BE_BLOCK_DEV,
	CTL_BE_BLOCK_FILE
} ctl_be_block_type;
130
/* File-backed LUN state: credentials used for VOP_READ/VOP_WRITE. */
struct ctl_be_block_filedata {
	struct ucred *cred;
};
134
/* Backing-type-specific data; selected by ctl_be_block_lun.dev_type. */
union ctl_be_block_bedata {
	struct ctl_be_block_filedata file;
};
138
139struct ctl_be_block_io;
140struct ctl_be_block_lun;
141
/* I/O dispatch hook: runs one beio against the LUN's backing store. */
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);
/* Attribute query hook: returns a named LUN attribute value. */
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
				  const char *attrname);
146
147/*
148 * Backend LUN structure.  There is a 1:1 mapping between a block device
149 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
150 */
struct ctl_be_block_lun {
	struct ctl_lun_create_params params;	/* creation-time parameters */
	char lunname[32];			/* backend-local LUN name */
	char *dev_path;				/* path to backing file/device */
	ctl_be_block_type dev_type;		/* file vs. device backing */
	struct vnode *vn;			/* vnode of the backing store */
	union ctl_be_block_bedata backend;	/* type-specific backing data */
	cbb_dispatch_t dispatch;		/* read/write handler */
	cbb_dispatch_t lun_flush;		/* flush/sync handler */
	cbb_dispatch_t unmap;			/* UNMAP/delete handler */
	cbb_dispatch_t get_lba_status;		/* GET LBA STATUS handler */
	cbb_getattr_t getattr;			/* attribute query handler */
	uma_zone_t lun_zone;			/* zone for S/G data buffers */
	uint64_t size_blocks;			/* LUN size in blocks */
	uint64_t size_bytes;			/* LUN size in bytes */
	struct ctl_be_block_softc *softc;	/* back-pointer to module softc */
	struct devstat *disk_stats;		/* devstat(9) statistics */
	ctl_be_block_lun_flags flags;		/* state flags */
	STAILQ_ENTRY(ctl_be_block_lun) links;	/* softc lun_list linkage */
	struct ctl_be_lun cbe_lun;		/* CTL-facing LUN description */
	struct taskqueue *io_taskqueue;		/* worker taskqueue */
	struct task io_task;			/* worker task */
	int num_threads;			/* worker threads for this LUN */
	/* Work queues drained by ctl_be_block_worker(); see queue_lock. */
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
	struct mtx_padalign io_lock;		/* protects devstat/bio counts */
	struct mtx_padalign queue_lock;		/* protects the queues above */
};
181
182/*
183 * Overall softc structure for the block backend module.
184 */
struct ctl_be_block_softc {
	struct mtx			 lock;		/* protects lun_list/num_luns */
	int				 num_luns;	/* count of configured LUNs */
	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;	/* all backend LUNs */
};
190
191static struct ctl_be_block_softc backend_block_softc;
192
193/*
194 * Per-I/O information.
195 */
struct ctl_be_block_io {
	union ctl_io			*io;		/* associated CTL I/O */
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS]; /* data S/G list */
	struct iovec			xiovecs[CTLBLK_MAX_SEGS]; /* iovecs mirroring sg_segs */
	int				bio_cmd;	/* BIO_READ/WRITE/FLUSH/DELETE */
	int				num_segs;	/* valid entries in sg_segs */
	int				num_bios_sent;	/* bios issued (dev backing) */
	int				num_bios_done;	/* bios completed so far */
	int				send_complete;	/* all bios have been sent */
	int				num_errors;	/* failed bios */
	struct bintime			ds_t0;		/* devstat start timestamp */
	devstat_tag_type		ds_tag_type;	/* devstat tag class */
	devstat_trans_flags		ds_trans_type;	/* devstat transfer class */
	uint64_t			io_len;		/* transfer length, bytes */
	uint64_t			io_offset;	/* byte offset into backing */
	int				io_arg;		/* misc. per-op argument */
	struct ctl_be_block_softc	*softc;		/* owning softc */
	struct ctl_be_block_lun		*lun;		/* owning LUN */
	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
216
extern struct ctl_softc *control_softc;

/* Worker threads per LUN; tunable and adjustable via sysctl at runtime. */
static int cbb_num_threads = 14;
TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
	    "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
           &cbb_num_threads, 0, "Number of threads per backing file");
225
226static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
227static void ctl_free_beio(struct ctl_be_block_io *beio);
228static void ctl_complete_beio(struct ctl_be_block_io *beio);
229static int ctl_be_block_move_done(union ctl_io *io);
230static void ctl_be_block_biodone(struct bio *bio);
231static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
232				    struct ctl_be_block_io *beio);
233static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
234				       struct ctl_be_block_io *beio);
235static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
236				  struct ctl_be_block_io *beio);
237static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
238					 const char *attrname);
239static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
240				   struct ctl_be_block_io *beio);
241static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
242				   struct ctl_be_block_io *beio);
243static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
244				      struct ctl_be_block_io *beio);
245static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
246					 const char *attrname);
247static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
248				    union ctl_io *io);
249static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
250				    union ctl_io *io);
251static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
252				  union ctl_io *io);
253static void ctl_be_block_worker(void *context, int pending);
254static int ctl_be_block_submit(union ctl_io *io);
255static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
256				   int flag, struct thread *td);
257static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
258				  struct ctl_lun_req *req);
259static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
260				 struct ctl_lun_req *req);
261static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
262static int ctl_be_block_open(struct ctl_be_block_softc *softc,
263			     struct ctl_be_block_lun *be_lun,
264			     struct ctl_lun_req *req);
265static int ctl_be_block_create(struct ctl_be_block_softc *softc,
266			       struct ctl_lun_req *req);
267static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
268			   struct ctl_lun_req *req);
269static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
270			   struct ctl_lun_req *req);
271static void ctl_be_block_lun_shutdown(void *be_lun);
272static void ctl_be_block_lun_config_status(void *be_lun,
273					   ctl_lun_config_status status);
274static int ctl_be_block_config_write(union ctl_io *io);
275static int ctl_be_block_config_read(union ctl_io *io);
276static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
277static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
278int ctl_be_block_init(void);
279
/* Backend driver registration: entry points CTL uses to reach this module. */
static struct ctl_backend_driver ctl_be_block_driver =
{
	.name = "block",
	.flags = CTL_BE_FLAG_HAS_CONFIG,
	.init = ctl_be_block_init,
	.data_submit = ctl_be_block_submit,
	.data_move_done = ctl_be_block_move_done,
	.config_read = ctl_be_block_config_read,
	.config_write = ctl_be_block_config_write,
	.ioctl = ctl_be_block_ioctl,
	.lun_info = ctl_be_block_lun_info,
	.lun_attr = ctl_be_block_lun_attr
};
293
MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);

/* UMA zone for struct ctl_be_block_io allocations. */
static uma_zone_t beio_zone;
298
299static struct ctl_be_block_io *
300ctl_alloc_beio(struct ctl_be_block_softc *softc)
301{
302	struct ctl_be_block_io *beio;
303
304	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
305	beio->softc = softc;
306	return (beio);
307}
308
309static void
310ctl_free_beio(struct ctl_be_block_io *beio)
311{
312	int duplicate_free;
313	int i;
314
315	duplicate_free = 0;
316
317	for (i = 0; i < beio->num_segs; i++) {
318		if (beio->sg_segs[i].addr == NULL)
319			duplicate_free++;
320
321		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
322		beio->sg_segs[i].addr = NULL;
323
324		/* For compare we had two equal S/G lists. */
325		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
326			uma_zfree(beio->lun->lun_zone,
327			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
328			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
329		}
330	}
331
332	if (duplicate_free > 0) {
333		printf("%s: %d duplicate frees out of %d segments\n", __func__,
334		       duplicate_free, beio->num_segs);
335	}
336
337	uma_zfree(beio_zone, beio);
338}
339
340static void
341ctl_complete_beio(struct ctl_be_block_io *beio)
342{
343	union ctl_io *io = beio->io;
344
345	if (beio->beio_cont != NULL) {
346		beio->beio_cont(beio);
347	} else {
348		ctl_free_beio(beio);
349		ctl_data_submit_done(io);
350	}
351}
352
/*
 * Byte-wise buffer comparison.  Returns the offset of the first byte
 * that differs, or "size" when the buffers match over the full length.
 */
static size_t
cmp(uint8_t *a, uint8_t *b, size_t size)
{
	size_t off = 0;

	while (off < size && a[off] == b[off])
		off++;

	return (off);
}
364
/*
 * Finish a COMPARE command: the two halves of the beio S/G list hold the
 * device data and the initiator data.  Compare them segment by segment;
 * on mismatch, return MISCOMPARE sense with the byte offset of the first
 * difference in the INFORMATION field, otherwise report success.
 */
static void
ctl_be_block_compare(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	uint64_t off, res;
	int i;
	uint8_t info[8];

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	off = 0;
	for (i = 0; i < beio->num_segs; i++) {
		/* Segment i pairs with segment i + CTLBLK_HALF_SEGS. */
		res = cmp(beio->sg_segs[i].addr,
		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
		    beio->sg_segs[i].len);
		off += res;
		/* A partial match within a segment means a miscompare. */
		if (res < beio->sg_segs[i].len)
			break;
	}
	if (i < beio->num_segs) {
		/* Mismatch found at byte offset "off" from the start. */
		scsi_u64to8b(off, info);
		ctl_set_sense(&io->scsiio, /*current_error*/ 1,
		    /*sense_key*/ SSD_KEY_MISCOMPARE,
		    /*asc*/ 0x1D, /*ascq*/ 0x00,
		    /*type*/ SSD_ELEM_INFO,
		    /*size*/ sizeof(info), /*data*/ &info,
		    /*type*/ SSD_ELEM_NONE);
	} else
		ctl_set_success(&io->scsiio);
}
394
/*
 * Called by CTL when a data movement (DMA) to/from the initiator side has
 * completed.  For reads (and COMPAREs) this sets the final status and
 * completes the I/O; for successful writes it queues the I/O to the LUN's
 * task queue so the actual backing-store write runs in a context that may
 * block.  Always returns 0.
 */
static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lba_len_flags *lbalen;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	/* Account the elapsed DMA time for this I/O. */
	getbintime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
	io->io_hdr.num_dmas++;
#endif
	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
		;
	} else if ((io->io_hdr.port_status == 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		lbalen = ARGS(beio->io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_set_success(&io->scsiio);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	} else if ((io->io_hdr.port_status != 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		/*
		 * For hardware error sense keys, the sense key
		 * specific value is defined to be a retry count,
		 * but we use it to pass back an internal FETD
		 * error code.  XXX KDM  Hopefully the FETD is only
		 * using 16 bits for an error code, since that's
		 * all the space we have in the sks field.
		 */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/
					 io->io_hdr.port_status);
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->queue_lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}
481
/*
 * Completion callback for bios issued to a device-backed LUN.  An I/O may
 * fan out into several bios; only the last completion (send_complete set
 * and all sent bios accounted for) finalizes devstat accounting, maps any
 * errno to SCSI sense, and completes or continues the CTL I/O.
 *
 * NOTE(review): "error" holds the errno of the bio that happened to
 * complete last; if multiple bios failed with different errnos, only that
 * last one selects the sense data — confirm this is intended.
 */
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	if (error != 0)
		beio->num_errors++;

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	if (beio->num_errors > 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there is a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)
	 || (beio->bio_cmd == BIO_DELETE)
	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
574
/*
 * SYNCHRONIZE CACHE for a file-backed LUN: fsync the backing vnode.
 * io_arg selects MNT_NOWAIT (start the flush) vs MNT_WAIT (wait for it).
 * Records devstat timing around the operation and completes the beio.
 */
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	/* Shared lock suffices when the filesystem allows shared writes. */
	if (MNT_SHARED_WRITES(mountpoint)
	 || ((mountpoint == NULL)
	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_lock(be_lun->vn, lock_flags | LK_RETRY);

	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
	    curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there is a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}
624
/* DTrace probes marking start/end of the file-backed read and write paths. */
SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, file_done,"uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t");
629
/*
 * Execute a read or write against a file-backed LUN via VOP_READ/VOP_WRITE
 * on the backing vnode, using a kernel uio built from the beio S/G list.
 * Maps CTL's DPO/FUA flags onto IO_DIRECT/IO_SYNC, zero-fills any short
 * read (EOF), translates errnos to SCSI sense, and either completes the
 * beio (write/verify/error) or starts the datamove back to the initiator.
 */
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the S/G list into the iovec array backing the uio. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should clean the rest of the buffer.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		/* Shared vnode lock when the filesystem permits it. */
		if (MNT_SHARED_WRITES(mountpoint)
		 || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;

		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
        }

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
796
/*
 * GET LBA STATUS for a file-backed LUN.  Uses FIOSEEKHOLE/FIOSEEKDATA on
 * the backing vnode to find the extent of mapped (status 0) or deallocated
 * (status 1) blocks starting at the requested LBA, and fills in the first
 * descriptor of the response.  If the filesystem cannot answer, reports
 * "mapped" up to the end of the LUN.
 */
static void
ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, status;

	DPRINTF("entered\n");

	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
	    0, curthread->td_ucred, curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
		    0, curthread->td_ucred, curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	VOP_UNLOCK(be_lun->vn, 0);

	/* Convert the byte extent back to an LBA count (capped at 32 bits). */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
835
/*
 * Query a named attribute of a file-backed LUN.  Supports "blocksused"
 * (allocated bytes of the backing file, from VOP_GETATTR) and
 * "blocksavail" (free space on the backing filesystem, from VFS_STATFS),
 * both expressed in LUN blocks.  Returns UINT64_MAX when the attribute is
 * unknown, the vnode is gone, or the underlying query fails.
 */
static uint64_t
ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
{
	struct vattr		vattr;
	struct statfs		statfs;
	uint64_t		val;
	int			error;

	val = UINT64_MAX;
	if (be_lun->vn == NULL)
		return (val);
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
	if (strcmp(attrname, "blocksused") == 0) {
		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
		if (error == 0)
			val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
	}
	if (strcmp(attrname, "blocksavail") == 0 &&
	    (be_lun->vn->v_iflag & VI_DOOMED) == 0) {
		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
		if (error == 0)
			val = statfs.f_bavail * statfs.f_bsize /
			    be_lun->cbe_lun.blocksize;
	}
	VOP_UNLOCK(be_lun->vn, 0);
	return (val);
}
863
/*
 * Execute a read or write against a ZVOL-backed LUN by calling the
 * character device's d_read/d_write entry points directly with a kernel
 * uio built from the beio S/G list.  Mirrors the file path's flag
 * handling and error-to-sense mapping; ENXIO is returned if the device
 * reference cannot be obtained.
 */
static void
ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct uio xuio;
	struct iovec *xiovec;
	int error, flags, i, ref;

	DPRINTF("entered\n");

	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* Mirror the S/G list into the iovec array backing the uio. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/* Hold a thread reference on the cdev across the call. */
	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		if (beio->bio_cmd == BIO_READ)
			error = csw->d_read(dev, &xuio, flags);
		else
			error = csw->d_write(dev, &xuio, flags);
		dev_relthread(dev, ref);
	} else
		error = ENXIO;

	if (beio->bio_cmd == BIO_READ)
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
	else
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
967
/*
 * GET LBA STATUS for a ZVOL-backed LUN.  Same logic as the file variant
 * but issues FIOSEEKHOLE/FIOSEEKDATA through the cdev's d_ioctl entry
 * point.  If no device reference can be obtained, reports "mapped" up to
 * the end of the LUN.
 */
static void
ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct cdevsw *csw;
	struct cdev *dev;
	struct ctl_lba_len_flags *lbalen = ARGS(io);
	struct scsi_get_lba_status_data *data;
	off_t roff, off;
	int error, ref, status;

	DPRINTF("entered\n");

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw == NULL) {
		status = 0;	/* unknown up to the end */
		off = be_lun->size_bytes;
		goto done;
	}
	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
	    curthread);
	if (error == 0 && off > roff)
		status = 0;	/* mapped up to off */
	else {
		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
		    curthread);
		if (error == 0 && off > roff)
			status = 1;	/* deallocated up to off */
		else {
			status = 0;	/* unknown up to the end */
			off = be_lun->size_bytes;
		}
	}
	dev_relthread(dev, ref);

done:
	/* Convert the byte extent back to an LBA count (capped at 32 bits). */
	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
	    lbalen->lba), data->descr[0].length);
	data->descr[0].status = status;

	ctl_complete_beio(beio);
}
1014
/*
 * SYNCHRONIZE CACHE backend for cdevs: issue a single BIO_FLUSH to the
 * backing device.  Completion is handled asynchronously by
 * ctl_be_block_biodone(); if the device reference can not be obtained
 * the bio is failed immediately with ENXIO through the same path.
 */
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	union ctl_io *io;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	io = beio->io;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	/* Start the devstat transaction before the bio is launched. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	if (csw) {
		bio->bio_dev = dev;
		csw->d_strategy(bio);
		dev_relthread(dev, ref);
	} else {
		/* Device is gone; fail the bio through the normal path. */
		bio->bio_error = ENXIO;
		ctl_be_block_biodone(bio);
	}
}
1062
/*
 * Issue BIO_DELETE request(s) covering [off, off + len) on the backing
 * cdev.  Ranges longer than maxlen (the largest blocksize-aligned value
 * that fits in a long) are split across multiple bios.  'last' marks the
 * final range of the whole UNMAP operation so that send_complete is set
 * on the very last bio sent.
 */
static void
ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio,
		       uint64_t off, uint64_t len, int last)
{
	struct bio *bio;
	uint64_t maxlen;
	struct cdevsw *csw;
	struct cdev *dev;
	int ref;

	csw = devvn_refthread(be_lun->vn, &dev, &ref);
	maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
	while (len > 0) {
		bio = g_alloc_bio();
		bio->bio_cmd	    = BIO_DELETE;
		bio->bio_dev	    = dev;
		bio->bio_offset	    = off;
		bio->bio_length	    = MIN(len, maxlen);
		bio->bio_data	    = 0;
		bio->bio_done	    = ctl_be_block_biodone;
		bio->bio_caller1    = beio;
		bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;

		off += bio->bio_length;
		len -= bio->bio_length;

		/*
		 * Account for the bio before launching it, under the LUN
		 * lock, so the completion path sees a consistent count.
		 */
		mtx_lock(&be_lun->io_lock);
		beio->num_bios_sent++;
		if (last && len == 0)
			beio->send_complete = 1;
		mtx_unlock(&be_lun->io_lock);

		if (csw) {
			csw->d_strategy(bio);
		} else {
			/* No device; fail through the completion path. */
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1106
/*
 * UNMAP / WRITE SAME(unmap) backend for cdevs.  An io_offset of -1 means
 * the ranges come from the UNMAP parameter list stored in the I/O's
 * private LBA/len slot; otherwise a single [io_offset, io_offset+io_len)
 * range is deleted.  The actual bios are issued by
 * ctl_be_block_unmap_dev_range().
 */
static void
ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct ctl_ptr_len_flags *ptrlen;
	struct scsi_unmap_desc *buf, *end;
	uint64_t len;

	io = beio->io;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (beio->io_offset == -1) {
		/* Walk the UNMAP descriptor list, one range per entry. */
		beio->io_len = 0;
		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
		end = buf + ptrlen->len / sizeof(*buf);
		for (; buf < end; buf++) {
			len = (uint64_t)scsi_4btoul(buf->length) *
			    be_lun->cbe_lun.blocksize;
			beio->io_len += len;
			/* Mark only the last descriptor as final. */
			ctl_be_block_unmap_dev_range(be_lun, beio,
			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
			    len, (end - buf < 2) ? TRUE : FALSE);
		}
	} else
		ctl_be_block_unmap_dev_range(be_lun, beio,
		    beio->io_offset, beio->io_len, TRUE);
}
1142
/*
 * READ/WRITE backend for cdevs: split the beio's S/G list into bios no
 * larger than the device's maximum I/O size, collect them on a local
 * queue, then fire them all at the driver in one burst.  Completions are
 * gathered by ctl_be_block_biodone() via num_bios_sent/send_complete.
 */
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
	struct bio *bio;
	struct cdevsw *csw;
	struct cdev *dev;
	off_t cur_offset;
	int i, max_iosize, ref;

	DPRINTF("entered\n");
	csw = devvn_refthread(be_lun->vn, &dev, &ref);

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	if (csw) {
		max_iosize = dev->si_iosize_max;
		if (max_iosize < PAGE_SIZE)
			max_iosize = DFLTPHYS;
	} else
		max_iosize = DFLTPHYS;

	/* Build bios for every S/G segment, splitting at max_iosize. */
	cur_offset = beio->io_offset;
	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_dev = dev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
			beio->num_bios_sent++;
		}
	}
	/* All bios are counted before any is launched. */
	binuptime(&beio->ds_t0);
	mtx_lock(&be_lun->io_lock);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
	beio->send_complete = 1;
	mtx_unlock(&be_lun->io_lock);

	/*
	 * Fire off all allocated requests!
	 */
	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
		TAILQ_REMOVE(&queue, bio, bio_queue);
		if (csw)
			csw->d_strategy(bio);
		else {
			bio->bio_error = ENXIO;
			ctl_be_block_biodone(bio);
		}
	}
	if (csw)
		dev_relthread(dev, ref);
}
1221
1222static uint64_t
1223ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1224{
1225	struct diocgattr_arg	arg;
1226	struct cdevsw *csw;
1227	struct cdev *dev;
1228	int error, ref;
1229
1230	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1231	if (csw == NULL)
1232		return (UINT64_MAX);
1233	strlcpy(arg.name, attrname, sizeof(arg.name));
1234	arg.len = sizeof(arg.value.off);
1235	if (csw->d_ioctl) {
1236		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
1237		    curthread);
1238	} else
1239		error = ENODEV;
1240	dev_relthread(dev, ref);
1241	if (error != 0)
1242		return (UINT64_MAX);
1243	return (arg.value.off);
1244}
1245
1246static void
1247ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
1248			    union ctl_io *io)
1249{
1250	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1251	struct ctl_be_block_io *beio;
1252	struct ctl_lba_len_flags *lbalen;
1253
1254	DPRINTF("entered\n");
1255	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1256	lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1257
1258	beio->io_len = lbalen->len * cbe_lun->blocksize;
1259	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1260	beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
1261	beio->bio_cmd = BIO_FLUSH;
1262	beio->ds_trans_type = DEVSTAT_NO_DATA;
1263	DPRINTF("SYNC\n");
1264	be_lun->lun_flush(be_lun, beio);
1265}
1266
1267static void
1268ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1269{
1270	union ctl_io *io;
1271
1272	io = beio->io;
1273	ctl_free_beio(beio);
1274	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1275	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1276	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1277		ctl_config_write_done(io);
1278		return;
1279	}
1280
1281	ctl_be_block_config_write(io);
1282}
1283
/*
 * Dispatch a WRITE SAME request.  With SWS_UNMAP/SWS_ANCHOR (and a
 * backend that supports unmap) the request becomes a BIO_DELETE.
 * Otherwise the single-block pattern (or zeroes for SWS_NDOB) is
 * replicated into S/G segments -- trimmed to physical-block boundaries
 * where possible -- and written out.  If the whole range does not fit
 * in one pass, the request reschedules itself via beio_cont.
 */
static void
ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
			    union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_lba_len_flags *lbalen;
	uint64_t len_left, lba;
	uint32_t pb, pbo, adj;
	int i, seglen;
	uint8_t *buf, *end;

	DPRINTF("entered\n");

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
	lbalen = ARGS(beio->io);

	/* Reject unsupported flag combinations up front. */
	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
		ctl_free_beio(beio);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 1,
				      /*command*/ 1,
				      /*field*/ 1,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_config_write_done(io);
		return;
	}

	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
		beio->bio_cmd = BIO_DELETE;
		beio->ds_trans_type = DEVSTAT_FREE;

		be_lun->unmap(be_lun, beio);
		return;
	}

	beio->bio_cmd = BIO_WRITE;
	beio->ds_trans_type = DEVSTAT_WRITE;

	DPRINTF("WRITE SAME at LBA %jx len %u\n",
	       (uintmax_t)lbalen->lba, lbalen->len);

	/* Physical block size and offset, in bytes. */
	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
	if (be_lun->cbe_lun.pblockoff > 0)
		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
	else
		pbo = 0;
	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {

		/*
		 * Setup the S/G entry for this chunk.
		 */
		seglen = MIN(CTLBLK_MAX_SEG, len_left);
		if (pb > cbe_lun->blocksize) {
			/* Trim the segment end to a physical boundary. */
			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
			    seglen - pbo) % pb;
			if (seglen > adj)
				seglen -= adj;
			else
				seglen -= seglen % cbe_lun->blocksize;
		} else
			seglen -= seglen % cbe_lun->blocksize;
		beio->sg_segs[i].len = seglen;
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= seglen;

		/* Fill the segment one logical block at a time. */
		buf = beio->sg_segs[i].addr;
		end = buf + seglen;
		for (; buf < end; buf += cbe_lun->blocksize) {
			if (lbalen->flags & SWS_NDOB) {
				memset(buf, 0, cbe_lun->blocksize);
			} else {
				memcpy(buf, io->scsiio.kern_data_ptr,
				    cbe_lun->blocksize);
			}
			/* SWS_LBDATA: stamp each block with its LBA. */
			if (lbalen->flags & SWS_LBDATA)
				scsi_ulto4b(lbalen->lba + lba, buf);
			lba++;
		}
	}

	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
	beio->io_len = lba * cbe_lun->blocksize;

	/* We can not do all in one run. Correct and schedule rerun. */
	if (len_left > 0) {
		lbalen->lba += lba;
		lbalen->len -= lba;
		beio->beio_cont = ctl_be_block_cw_done_ws;
	}

	be_lun->dispatch(be_lun, beio);
}
1387
1388static void
1389ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
1390			    union ctl_io *io)
1391{
1392	struct ctl_be_block_io *beio;
1393	struct ctl_ptr_len_flags *ptrlen;
1394
1395	DPRINTF("entered\n");
1396
1397	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1398	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1399
1400	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
1401		ctl_free_beio(beio);
1402		ctl_set_invalid_field(&io->scsiio,
1403				      /*sks_valid*/ 0,
1404				      /*command*/ 1,
1405				      /*field*/ 0,
1406				      /*bit_valid*/ 0,
1407				      /*bit*/ 0);
1408		ctl_config_write_done(io);
1409		return;
1410	}
1411
1412	beio->io_len = 0;
1413	beio->io_offset = -1;
1414	beio->bio_cmd = BIO_DELETE;
1415	beio->ds_trans_type = DEVSTAT_FREE;
1416	DPRINTF("UNMAP\n");
1417	be_lun->unmap(be_lun, beio);
1418}
1419
1420static void
1421ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1422{
1423	union ctl_io *io;
1424
1425	io = beio->io;
1426	ctl_free_beio(beio);
1427	ctl_config_read_done(io);
1428}
1429
/*
 * Entry point for config-read commands: allocate and initialize a
 * backend I/O descriptor, then dispatch by CDB opcode.  Only
 * SERVICE ACTION IN (GET LBA STATUS) reaches this backend; any other
 * opcode indicates a CTL dispatching bug, hence the panic.
 */
static void
ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	beio->beio_cont = ctl_be_block_cr_done;
	PRIV(io)->ptr = (void *)beio;

	switch (io->scsiio.cdb[0]) {
	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
		beio->bio_cmd = -1;
		beio->ds_trans_type = DEVSTAT_NO_DATA;
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		beio->io_len = 0;
		/* Backends without LBA status support complete trivially. */
		if (be_lun->get_lba_status)
			be_lun->get_lba_status(be_lun, beio);
		else
			ctl_be_block_cr_done(beio);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}
1462
1463static void
1464ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1465{
1466	union ctl_io *io;
1467
1468	io = beio->io;
1469	ctl_free_beio(beio);
1470	ctl_config_write_done(io);
1471}
1472
1473static void
1474ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
1475			 union ctl_io *io)
1476{
1477	struct ctl_be_block_io *beio;
1478	struct ctl_be_block_softc *softc;
1479
1480	DPRINTF("entered\n");
1481
1482	softc = be_lun->softc;
1483	beio = ctl_alloc_beio(softc);
1484	beio->io = io;
1485	beio->lun = be_lun;
1486	beio->beio_cont = ctl_be_block_cw_done;
1487	switch (io->scsiio.tag_type) {
1488	case CTL_TAG_ORDERED:
1489		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1490		break;
1491	case CTL_TAG_HEAD_OF_QUEUE:
1492		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1493		break;
1494	case CTL_TAG_UNTAGGED:
1495	case CTL_TAG_SIMPLE:
1496	case CTL_TAG_ACA:
1497	default:
1498		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1499		break;
1500	}
1501	PRIV(io)->ptr = (void *)beio;
1502
1503	switch (io->scsiio.cdb[0]) {
1504	case SYNCHRONIZE_CACHE:
1505	case SYNCHRONIZE_CACHE_16:
1506		ctl_be_block_cw_dispatch_sync(be_lun, io);
1507		break;
1508	case WRITE_SAME_10:
1509	case WRITE_SAME_16:
1510		ctl_be_block_cw_dispatch_ws(be_lun, io);
1511		break;
1512	case UNMAP:
1513		ctl_be_block_cw_dispatch_unmap(be_lun, io);
1514		break;
1515	default:
1516		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1517		break;
1518	}
1519}
1520
1521SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t");
1522SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t");
1523SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t");
1524SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t");
1525
/*
 * Continuation for multi-pass READ/WRITE requests: free the finished
 * beio and, unless the I/O was aborted or already failed, clear its
 * status and requeue it on the input queue so the worker picks up the
 * next chunk.
 */
static void
ctl_be_block_next(struct ctl_be_block_io *beio)
{
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;

	io = beio->io;
	be_lun = beio->lun;
	ctl_free_beio(beio);
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
		ctl_data_submit_done(io);
		return;
	}

	/* Reset status so the next pass starts from a clean state. */
	io->io_hdr.status &= ~CTL_STATUS_MASK;
	io->io_hdr.status |= CTL_STATUS_NONE;

	mtx_lock(&be_lun->queue_lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}
1556
/*
 * Dispatch a READ/WRITE/COMPARE data request: allocate a beio, carve the
 * (possibly partial) LBA range into S/G segments, then either start the
 * backend read or kick off DMA from the initiator for a write.  Requests
 * larger than the per-pass limit are continued via ctl_be_block_next().
 * For COMPARE, a second set of segments is allocated so read data and
 * initiator data can be held side by side.
 */
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len_flags *lbalen;
	struct ctl_ptr_len_flags *bptrlen;
	uint64_t len_left, lbas;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	lbalen = ARGS(io);
	if (lbalen->flags & CTL_LLF_WRITE) {
		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
	}

	beio = ctl_alloc_beio(softc);
	beio->io = io;
	beio->lun = be_lun;
	bptrlen = PRIV(io);
	bptrlen->ptr = (void *)beio;

	/* Map the SCSI tag type onto the devstat tag type. */
	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	if (lbalen->flags & CTL_LLF_WRITE) {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	} else {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	}

	DPRINTF("%s at LBA %jx len %u @%ju\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
	/* COMPARE needs two buffer halves, so it gets half the I/O size. */
	if (lbalen->flags & CTL_LLF_COMPARE)
		lbas = CTLBLK_HALF_IO_SIZE;
	else
		lbas = CTLBLK_MAX_IO_SIZE;
	/* bptrlen->len tracks how many LBAs prior passes already covered. */
	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
	beio->io_len = lbas * cbe_lun->blocksize;
	bptrlen->len += lbas;

	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
		    i, CTLBLK_MAX_SEGS));

		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		/* Set up second segment for compare operation. */
		if (lbalen->flags & CTL_LLF_COMPARE) {
			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
			    beio->sg_segs[i].len;
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
		}

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}
	/* More LBAs remain: chain another pass through ctl_be_block_next. */
	if (bptrlen->len < lbalen->len)
		beio->beio_cont = ctl_be_block_next;
	io->scsiio.be_move_done = ctl_be_block_move_done;
	/* For compare we have separate S/G lists for read and datamove. */
	if (lbalen->flags & CTL_LLF_COMPARE)
		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
	else
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
	io->scsiio.kern_data_len = beio->io_len;
	io->scsiio.kern_data_resid = 0;
	io->scsiio.kern_sg_entries = beio->num_segs;
	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
#ifdef CTL_TIME_IO
        	getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}
1674
/*
 * Taskqueue worker for a LUN: repeatedly drains the four request queues
 * (checked in order: datamove, config write, config read, new input)
 * until all are empty.  The queue lock is dropped before each request is
 * dispatched and re-taken on the next pass.
 */
static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	struct ctl_be_block_io *beio;

	DPRINTF("entered\n");
	/*
	 * Fetch and process I/Os from all queues.  If we detect LUN
	 * CTL_LUN_FLAG_OFFLINE status here -- it is result of a race,
	 * so make response maximally opaque to not confuse initiator.
	 */
	for (;;) {
		mtx_lock(&be_lun->queue_lock);
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			DPRINTF("datamove queue\n");
			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_complete_beio(beio);
				return;
			}
			be_lun->dispatch(be_lun, beio);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");
			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_config_write_done(io);
				return;
			}
			ctl_be_block_cw_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
		if (io != NULL) {
			DPRINTF("config read queue\n");
			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_config_read_done(io);
				return;
			}
			ctl_be_block_cr_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");
			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) {
				ctl_set_busy(&io->scsiio);
				ctl_data_submit_done(io);
				return;
			}
			ctl_be_block_dispatch(be_lun, io);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		mtx_unlock(&be_lun->queue_lock);
		break;
	}
}
1757
1758/*
1759 * Entry point from CTL to the backend for I/O.  We queue everything to a
1760 * work thread, so this just puts the I/O on a queue and wakes up the
1761 * thread.
1762 */
1763static int
1764ctl_be_block_submit(union ctl_io *io)
1765{
1766	struct ctl_be_block_lun *be_lun;
1767	struct ctl_be_lun *cbe_lun;
1768
1769	DPRINTF("entered\n");
1770
1771	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
1772		CTL_PRIV_BACKEND_LUN].ptr;
1773	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
1774
1775	/*
1776	 * Make sure we only get SCSI I/O.
1777	 */
1778	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
1779		"%#x) encountered", io->io_hdr.io_type));
1780
1781	PRIV(io)->len = 0;
1782
1783	mtx_lock(&be_lun->queue_lock);
1784	/*
1785	 * XXX KDM make sure that links is okay to use at this point.
1786	 * Otherwise, we either need to add another field to ctl_io_hdr,
1787	 * or deal with resource allocation here.
1788	 */
1789	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1790	mtx_unlock(&be_lun->queue_lock);
1791	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1792
1793	return (CTL_RETVAL_COMPLETE);
1794}
1795
1796static int
1797ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
1798			int flag, struct thread *td)
1799{
1800	struct ctl_be_block_softc *softc;
1801	int error;
1802
1803	softc = &backend_block_softc;
1804
1805	error = 0;
1806
1807	switch (cmd) {
1808	case CTL_LUN_REQ: {
1809		struct ctl_lun_req *lun_req;
1810
1811		lun_req = (struct ctl_lun_req *)addr;
1812
1813		switch (lun_req->reqtype) {
1814		case CTL_LUNREQ_CREATE:
1815			error = ctl_be_block_create(softc, lun_req);
1816			break;
1817		case CTL_LUNREQ_RM:
1818			error = ctl_be_block_rm(softc, lun_req);
1819			break;
1820		case CTL_LUNREQ_MODIFY:
1821			error = ctl_be_block_modify(softc, lun_req);
1822			break;
1823		default:
1824			lun_req->status = CTL_LUN_ERROR;
1825			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
1826				 "invalid LUN request type %d",
1827				 lun_req->reqtype);
1828			break;
1829		}
1830		break;
1831	}
1832	default:
1833		error = ENOTTY;
1834		break;
1835	}
1836
1837	return (error);
1838}
1839
/*
 * Backend setup for a regular-file backing store: install the file
 * method table, verify the vnode can be exclusively locked, and derive
 * LUN geometry (size, logical/physical/UNMAP block parameters) from the
 * file's attributes and any user-supplied "pblocksize"/"pblockoffset"/
 * "ublocksize"/"ublockoffset" options.  On failure, an error message is
 * placed in req->error_str and the error code returned.
 */
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	char			     *value;
	struct vattr		      vattr;
	off_t			      ps, pss, po, pos, us, uss, uo, uos;
	int			      error;

	error = 0;
	cbe_lun = &be_lun->cbe_lun;
	file_data = &be_lun->backend.file;
	params = &be_lun->params;

	/* File backends do not support UNMAP. */
	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;
	be_lun->get_lba_status = ctl_be_block_gls_file;
	be_lun->getattr = ctl_be_block_getattr_file;
	be_lun->unmap = NULL;
	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	/*
	 * Verify that we have the ability to upgrade to exclusive
	 * access on this file so we can trap errors at open instead
	 * of reporting them during first access.
	 */
	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
		if (be_lun->vn->v_iflag & VI_DOOMED) {
			error = EBADF;
			snprintf(req->error_str, sizeof(req->error_str),
				 "error locking file %s", be_lun->dev_path);
			return (error);
		}
	}

	file_data->cred = crhold(curthread->td_ucred);
	/* LUN size: explicit parameter wins, otherwise the file size. */
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;

	/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If file's vattr.va_blocksize
	 * (preferred I/O block size) is bigger and multiple to chosen
	 * logical block size -- report it as physical block size.
	 */
	if (params->blocksize_bytes != 0)
		cbe_lun->blocksize = params->blocksize_bytes;
	else
		cbe_lun->blocksize = 512;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
	    0 : (be_lun->size_blocks - 1);

	us = ps = vattr.va_blocksize;
	uo = po = 0;

	/* Physical block geometry, possibly overridden by options. */
	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
	if (value != NULL)
		ctl_expand_number(value, &ps);
	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
	if (value != NULL)
		ctl_expand_number(value, &po);
	pss = ps / cbe_lun->blocksize;
	pos = po / cbe_lun->blocksize;
	/* Accept only power-of-two multiples of the logical block size. */
	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
		cbe_lun->pblockexp = fls(pss) - 1;
		cbe_lun->pblockoff = (pss - pos) % pss;
	}

	/* UNMAP granularity geometry, same validation as above. */
	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
	if (value != NULL)
		ctl_expand_number(value, &us);
	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
	if (value != NULL)
		ctl_expand_number(value, &uo);
	uss = us / cbe_lun->blocksize;
	uos = uo / cbe_lun->blocksize;
	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
		cbe_lun->ublockexp = fls(uss) - 1;
		cbe_lun->ublockoff = (uss - uos) % uss;
	}

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < cbe_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
	}

	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
	return (error);
}
1952
1953static int
1954ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1955{
1956	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1957	struct ctl_lun_create_params *params;
1958	struct cdevsw		     *csw;
1959	struct cdev		     *dev;
1960	char			     *value;
1961	int			      error, atomic, maxio, ref, unmap, tmp;
1962	off_t			      ps, pss, po, pos, us, uss, uo, uos, otmp;
1963
1964	params = &be_lun->params;
1965
1966	be_lun->dev_type = CTL_BE_BLOCK_DEV;
1967	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1968	if (csw == NULL)
1969		return (ENXIO);
1970	if (strcmp(csw->d_name, "zvol") == 0) {
1971		be_lun->dispatch = ctl_be_block_dispatch_zvol;
1972		be_lun->get_lba_status = ctl_be_block_gls_zvol;
1973		atomic = maxio = CTLBLK_MAX_IO_SIZE;
1974	} else {
1975		be_lun->dispatch = ctl_be_block_dispatch_dev;
1976		be_lun->get_lba_status = NULL;
1977		atomic = 0;
1978		maxio = dev->si_iosize_max;
1979		if (maxio <= 0)
1980			maxio = DFLTPHYS;
1981		if (maxio > CTLBLK_MAX_IO_SIZE)
1982			maxio = CTLBLK_MAX_IO_SIZE;
1983	}
1984	be_lun->lun_flush = ctl_be_block_flush_dev;
1985	be_lun->getattr = ctl_be_block_getattr_dev;
1986	be_lun->unmap = ctl_be_block_unmap_dev;
1987
1988	if (!csw->d_ioctl) {
1989		dev_relthread(dev, ref);
1990		snprintf(req->error_str, sizeof(req->error_str),
1991			 "no d_ioctl for device %s!", be_lun->dev_path);
1992		return (ENODEV);
1993	}
1994
1995	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
1996			       curthread);
1997	if (error) {
1998		dev_relthread(dev, ref);
1999		snprintf(req->error_str, sizeof(req->error_str),
2000			 "error %d returned for DIOCGSECTORSIZE ioctl "
2001			 "on %s!", error, be_lun->dev_path);
2002		return (error);
2003	}
2004
2005	/*
2006	 * If the user has asked for a blocksize that is greater than the
2007	 * backing device's blocksize, we can do it only if the blocksize
2008	 * the user is asking for is an even multiple of the underlying
2009	 * device's blocksize.
2010	 */
2011	if ((params->blocksize_bytes != 0) &&
2012	    (params->blocksize_bytes >= tmp)) {
2013		if (params->blocksize_bytes % tmp == 0) {
2014			cbe_lun->blocksize = params->blocksize_bytes;
2015		} else {
2016			dev_relthread(dev, ref);
2017			snprintf(req->error_str, sizeof(req->error_str),
2018				 "requested blocksize %u is not an even "
2019				 "multiple of backing device blocksize %u",
2020				 params->blocksize_bytes, tmp);
2021			return (EINVAL);
2022		}
2023	} else if (params->blocksize_bytes != 0) {
2024		dev_relthread(dev, ref);
2025		snprintf(req->error_str, sizeof(req->error_str),
2026			 "requested blocksize %u < backing device "
2027			 "blocksize %u", params->blocksize_bytes, tmp);
2028		return (EINVAL);
2029	} else
2030		cbe_lun->blocksize = tmp;
2031
2032	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
2033			     curthread);
2034	if (error) {
2035		dev_relthread(dev, ref);
2036		snprintf(req->error_str, sizeof(req->error_str),
2037			 "error %d returned for DIOCGMEDIASIZE "
2038			 " ioctl on %s!", error,
2039			 be_lun->dev_path);
2040		return (error);
2041	}
2042
2043	if (params->lun_size_bytes != 0) {
2044		if (params->lun_size_bytes > otmp) {
2045			dev_relthread(dev, ref);
2046			snprintf(req->error_str, sizeof(req->error_str),
2047				 "requested LUN size %ju > backing device "
2048				 "size %ju",
2049				 (uintmax_t)params->lun_size_bytes,
2050				 (uintmax_t)otmp);
2051			return (EINVAL);
2052		}
2053
2054		be_lun->size_bytes = params->lun_size_bytes;
2055	} else
2056		be_lun->size_bytes = otmp;
2057	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2058	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2059	    0 : (be_lun->size_blocks - 1);
2060
2061	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
2062	    curthread);
2063	if (error)
2064		ps = po = 0;
2065	else {
2066		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
2067		    FREAD, curthread);
2068		if (error)
2069			po = 0;
2070	}
2071	us = ps;
2072	uo = po;
2073
2074	value = ctl_get_opt(&cbe_lun->options, "pblocksize");
2075	if (value != NULL)
2076		ctl_expand_number(value, &ps);
2077	value = ctl_get_opt(&cbe_lun->options, "pblockoffset");
2078	if (value != NULL)
2079		ctl_expand_number(value, &po);
2080	pss = ps / cbe_lun->blocksize;
2081	pos = po / cbe_lun->blocksize;
2082	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
2083	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
2084		cbe_lun->pblockexp = fls(pss) - 1;
2085		cbe_lun->pblockoff = (pss - pos) % pss;
2086	}
2087
2088	value = ctl_get_opt(&cbe_lun->options, "ublocksize");
2089	if (value != NULL)
2090		ctl_expand_number(value, &us);
2091	value = ctl_get_opt(&cbe_lun->options, "ublockoffset");
2092	if (value != NULL)
2093		ctl_expand_number(value, &uo);
2094	uss = us / cbe_lun->blocksize;
2095	uos = uo / cbe_lun->blocksize;
2096	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
2097	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
2098		cbe_lun->ublockexp = fls(uss) - 1;
2099		cbe_lun->ublockoff = (uss - uos) % uss;
2100	}
2101
2102	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
2103	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;
2104
2105	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
2106		unmap = 1;
2107	} else {
2108		struct diocgattr_arg	arg;
2109
2110		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
2111		arg.len = sizeof(arg.value.i);
2112		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
2113		    curthread);
2114		unmap = (error == 0) ? arg.value.i : 0;
2115	}
2116	value = ctl_get_opt(&cbe_lun->options, "unmap");
2117	if (value != NULL)
2118		unmap = (strcmp(value, "on") == 0);
2119	if (unmap)
2120		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
2121	else
2122		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
2123
2124	dev_relthread(dev, ref);
2125	return (0);
2126}
2127
2128static int
2129ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2130{
2131	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2132	int flags;
2133
2134	if (be_lun->vn) {
2135		flags = FREAD;
2136		if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
2137			flags |= FWRITE;
2138		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2139		be_lun->vn = NULL;
2140
2141		switch (be_lun->dev_type) {
2142		case CTL_BE_BLOCK_DEV:
2143			break;
2144		case CTL_BE_BLOCK_FILE:
2145			if (be_lun->backend.file.cred != NULL) {
2146				crfree(be_lun->backend.file.cred);
2147				be_lun->backend.file.cred = NULL;
2148			}
2149			break;
2150		case CTL_BE_BLOCK_NONE:
2151			break;
2152		default:
2153			panic("Unexpected backend type.");
2154			break;
2155		}
2156		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2157	}
2158	return (0);
2159}
2160
static int
ctl_be_block_open(struct ctl_be_block_softc *softc,
		  struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct nameidata nd;
	char		*value;
	int		 error, flags;

	error = 0;
	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "Root filesystem is not mounted");
		return (1);
	}
	/*
	 * This may run from a kernel context whose current, root or jail
	 * directories are not set up; point any missing ones at the root
	 * vnode (taking a reference each time) so path lookup by
	 * vn_open() below has something to resolve against.
	 */
	if (!curthread->td_proc->p_fd->fd_cdir) {
		curthread->td_proc->p_fd->fd_cdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_rdir) {
		curthread->td_proc->p_fd->fd_rdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_jdir) {
		curthread->td_proc->p_fd->fd_jdir = rootvnode;
		VREF(rootvnode);
	}

	/* The "file" option (path of the backing device/file) is mandatory. */
	value = ctl_get_opt(&cbe_lun->options, "file");
	if (value == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "no file argument specified");
		return (1);
	}
	free(be_lun->dev_path, M_CTLBLK);
	be_lun->dev_path = strdup(value, M_CTLBLK);

	/* Open read-write unless the "readonly" option is set to "on". */
	flags = FREAD;
	value = ctl_get_opt(&cbe_lun->options, "readonly");
	if (value == NULL || strcmp(value, "on") != 0)
		flags |= FWRITE;

again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
		/* Writable open was refused; retry read-only. */
		flags &= ~FWRITE;
		goto again;
	}
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_name;

			/* Retry once more with a /dev/ prefix prepended. */
			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
				be_lun->dev_path);
			free(be_lun->dev_path, M_CTLBLK);
			be_lun->dev_path = dev_name;
			goto again;
		}
		snprintf(req->error_str, sizeof(req->error_str),
		    "error opening %s: %d", be_lun->dev_path, error);
		return (error);
	}
	/* Record the access mode the open actually ended up with. */
	if (flags & FWRITE)
		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
	else
		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;

	NDFREE(&nd, NDF_ONLY_PNBUF);
	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn, 0);

	if (error != 0)
		ctl_be_block_close(be_lun);
	/*
	 * Default serialization: READ for any backend not using the raw
	 * device dispatch path, OFF otherwise; the "serseq" option may
	 * override either default.
	 */
	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	value = ctl_get_opt(&cbe_lun->options, "serseq");
	if (value != NULL && strcmp(value, "on") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
	else if (value != NULL && strcmp(value, "read") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	else if (value != NULL && strcmp(value, "off") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	return (0);
}
2264
2265static int
2266ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2267{
2268	struct ctl_be_lun *cbe_lun;
2269	struct ctl_be_block_lun *be_lun;
2270	struct ctl_lun_create_params *params;
2271	char num_thread_str[16];
2272	char tmpstr[32];
2273	char *value;
2274	int retval, num_threads;
2275	int tmp_num_threads;
2276
2277	params = &req->reqdata.create;
2278	retval = 0;
2279	req->status = CTL_LUN_OK;
2280
2281	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2282	cbe_lun = &be_lun->cbe_lun;
2283	cbe_lun->be_lun = be_lun;
2284	be_lun->params = req->reqdata.create;
2285	be_lun->softc = softc;
2286	STAILQ_INIT(&be_lun->input_queue);
2287	STAILQ_INIT(&be_lun->config_read_queue);
2288	STAILQ_INIT(&be_lun->config_write_queue);
2289	STAILQ_INIT(&be_lun->datamove_queue);
2290	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
2291	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
2292	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
2293	ctl_init_opts(&cbe_lun->options,
2294	    req->num_be_args, req->kern_be_args);
2295	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
2296	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2297	if (be_lun->lun_zone == NULL) {
2298		snprintf(req->error_str, sizeof(req->error_str),
2299			 "error allocating UMA zone");
2300		goto bailout_error;
2301	}
2302
2303	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2304		cbe_lun->lun_type = params->device_type;
2305	else
2306		cbe_lun->lun_type = T_DIRECT;
2307	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
2308	cbe_lun->flags = 0;
2309	value = ctl_get_opt(&cbe_lun->options, "ha_role");
2310	if (value != NULL) {
2311		if (strcmp(value, "primary") == 0)
2312			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2313	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2314		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2315
2316	if (cbe_lun->lun_type == T_DIRECT) {
2317		be_lun->size_bytes = params->lun_size_bytes;
2318		if (params->blocksize_bytes != 0)
2319			cbe_lun->blocksize = params->blocksize_bytes;
2320		else
2321			cbe_lun->blocksize = 512;
2322		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2323		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2324		    0 : (be_lun->size_blocks - 1);
2325
2326		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2327		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2328			retval = ctl_be_block_open(softc, be_lun, req);
2329			if (retval != 0) {
2330				retval = 0;
2331				req->status = CTL_LUN_WARNING;
2332			}
2333		}
2334		num_threads = cbb_num_threads;
2335	} else {
2336		num_threads = 1;
2337	}
2338
2339	/*
2340	 * XXX This searching loop might be refactored to be combined with
2341	 * the loop above,
2342	 */
2343	value = ctl_get_opt(&cbe_lun->options, "num_threads");
2344	if (value != NULL) {
2345		tmp_num_threads = strtol(value, NULL, 0);
2346
2347		/*
2348		 * We don't let the user specify less than one
2349		 * thread, but hope he's clueful enough not to
2350		 * specify 1000 threads.
2351		 */
2352		if (tmp_num_threads < 1) {
2353			snprintf(req->error_str, sizeof(req->error_str),
2354				 "invalid number of threads %s",
2355				 num_thread_str);
2356			goto bailout_error;
2357		}
2358		num_threads = tmp_num_threads;
2359	}
2360
2361	if (be_lun->vn == NULL)
2362		cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
2363	/* Tell the user the blocksize we ended up using */
2364	params->lun_size_bytes = be_lun->size_bytes;
2365	params->blocksize_bytes = cbe_lun->blocksize;
2366	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2367		cbe_lun->req_lun_id = params->req_lun_id;
2368		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
2369	} else
2370		cbe_lun->req_lun_id = 0;
2371
2372	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
2373	cbe_lun->lun_config_status = ctl_be_block_lun_config_status;
2374	cbe_lun->be = &ctl_be_block_driver;
2375
2376	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2377		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
2378			 softc->num_luns);
2379		strncpy((char *)cbe_lun->serial_num, tmpstr,
2380			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
2381
2382		/* Tell the user what we used for a serial number */
2383		strncpy((char *)params->serial_num, tmpstr,
2384			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2385	} else {
2386		strncpy((char *)cbe_lun->serial_num, params->serial_num,
2387			MIN(sizeof(cbe_lun->serial_num),
2388			sizeof(params->serial_num)));
2389	}
2390	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2391		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
2392		strncpy((char *)cbe_lun->device_id, tmpstr,
2393			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
2394
2395		/* Tell the user what we used for a device ID */
2396		strncpy((char *)params->device_id, tmpstr,
2397			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2398	} else {
2399		strncpy((char *)cbe_lun->device_id, params->device_id,
2400			MIN(sizeof(cbe_lun->device_id),
2401			    sizeof(params->device_id)));
2402	}
2403
2404	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2405
2406	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
2407	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2408
2409	if (be_lun->io_taskqueue == NULL) {
2410		snprintf(req->error_str, sizeof(req->error_str),
2411			 "unable to create taskqueue");
2412		goto bailout_error;
2413	}
2414
2415	/*
2416	 * Note that we start the same number of threads by default for
2417	 * both the file case and the block device case.  For the file
2418	 * case, we need multiple threads to allow concurrency, because the
2419	 * vnode interface is designed to be a blocking interface.  For the
2420	 * block device case, ZFS zvols at least will block the caller's
2421	 * context in many instances, and so we need multiple threads to
2422	 * overcome that problem.  Other block devices don't need as many
2423	 * threads, but they shouldn't cause too many problems.
2424	 *
2425	 * If the user wants to just have a single thread for a block
2426	 * device, he can specify that when the LUN is created, or change
2427	 * the tunable/sysctl to alter the default number of threads.
2428	 */
2429	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
2430					 /*num threads*/num_threads,
2431					 /*priority*/PWAIT,
2432					 /*thread name*/
2433					 "%s taskq", be_lun->lunname);
2434
2435	if (retval != 0)
2436		goto bailout_error;
2437
2438	be_lun->num_threads = num_threads;
2439
2440	mtx_lock(&softc->lock);
2441	softc->num_luns++;
2442	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
2443
2444	mtx_unlock(&softc->lock);
2445
2446	retval = ctl_add_lun(&be_lun->cbe_lun);
2447	if (retval != 0) {
2448		mtx_lock(&softc->lock);
2449		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2450			      links);
2451		softc->num_luns--;
2452		mtx_unlock(&softc->lock);
2453		snprintf(req->error_str, sizeof(req->error_str),
2454			 "ctl_add_lun() returned error %d, see dmesg for "
2455			 "details", retval);
2456		retval = 0;
2457		goto bailout_error;
2458	}
2459
2460	mtx_lock(&softc->lock);
2461
2462	/*
2463	 * Tell the config_status routine that we're waiting so it won't
2464	 * clean up the LUN in the event of an error.
2465	 */
2466	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2467
2468	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2469		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2470		if (retval == EINTR)
2471			break;
2472	}
2473	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2474
2475	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
2476		snprintf(req->error_str, sizeof(req->error_str),
2477			 "LUN configuration error, see dmesg for details");
2478		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2479			      links);
2480		softc->num_luns--;
2481		mtx_unlock(&softc->lock);
2482		goto bailout_error;
2483	} else {
2484		params->req_lun_id = cbe_lun->lun_id;
2485	}
2486
2487	mtx_unlock(&softc->lock);
2488
2489	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
2490					       cbe_lun->blocksize,
2491					       DEVSTAT_ALL_SUPPORTED,
2492					       cbe_lun->lun_type
2493					       | DEVSTAT_TYPE_IF_OTHER,
2494					       DEVSTAT_PRIORITY_OTHER);
2495
2496	return (retval);
2497
2498bailout_error:
2499	req->status = CTL_LUN_ERROR;
2500
2501	if (be_lun->io_taskqueue != NULL)
2502		taskqueue_free(be_lun->io_taskqueue);
2503	ctl_be_block_close(be_lun);
2504	if (be_lun->dev_path != NULL)
2505		free(be_lun->dev_path, M_CTLBLK);
2506	if (be_lun->lun_zone != NULL)
2507		uma_zdestroy(be_lun->lun_zone);
2508	ctl_free_opts(&cbe_lun->options);
2509	mtx_destroy(&be_lun->queue_lock);
2510	mtx_destroy(&be_lun->io_lock);
2511	free(be_lun, M_CTLBLK);
2512
2513	return (retval);
2514}
2515
static int
ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_rm_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	params = &req->reqdata.rm;

	/* Look the LUN up by ID under the softc lock. */
	mtx_lock(&softc->lock);
	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	retval = ctl_disable_lun(cbe_lun);
	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned from ctl_disable_lun() for "
			 "LUN %d", retval, params->lun_id);
		goto bailout_error;
	}

	/*
	 * Take the LUN offline and drain queued I/O before closing the
	 * backing store, then invalidate the LUN with CTL.
	 */
	if (be_lun->vn != NULL) {
		cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
		ctl_lun_offline(cbe_lun);
		taskqueue_drain_all(be_lun->io_taskqueue);
		ctl_be_block_close(be_lun);
	}

	retval = ctl_invalidate_lun(cbe_lun);
	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error %d returned from ctl_invalidate_lun() for "
			 "LUN %d", retval, params->lun_id);
		goto bailout_error;
	}

	/*
	 * Wait (interruptibly) for the shutdown callback to mark the LUN
	 * unconfigured before tearing down the backend state.
	 */
	mtx_lock(&softc->lock);
	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
                retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
                if (retval == EINTR)
                        break;
        }
	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;

	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "interrupted waiting for LUN to be freed");
		mtx_unlock(&softc->lock);
		goto bailout_error;
	}

	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);

	softc->num_luns--;
	mtx_unlock(&softc->lock);

	/* Free all per-LUN resources. */
	taskqueue_drain_all(be_lun->io_taskqueue);
	taskqueue_free(be_lun->io_taskqueue);

	if (be_lun->disk_stats != NULL)
		devstat_remove_entry(be_lun->disk_stats);

	uma_zdestroy(be_lun->lun_zone);

	ctl_free_opts(&cbe_lun->options);
	free(be_lun->dev_path, M_CTLBLK);
	mtx_destroy(&be_lun->queue_lock);
	mtx_destroy(&be_lun->io_lock);
	free(be_lun, M_CTLBLK);

	req->status = CTL_LUN_OK;

	return (0);

bailout_error:

	req->status = CTL_LUN_ERROR;

	return (0);
}
2609
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	char *value;
	uint64_t oldsize;
	int error, wasprim;

	params = &req->reqdata.modify;

	/* Look the LUN up by ID under the softc lock. */
	mtx_lock(&softc->lock);
	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->cbe_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "LUN %u is not managed by the block backend",
			 params->lun_id);
		goto bailout_error;
	}
	cbe_lun = &be_lun->cbe_lun;

	/* Remember any requested resize and merge in new option values. */
	if (params->lun_size_bytes != 0)
		be_lun->params.lun_size_bytes = params->lun_size_bytes;
	ctl_update_opts(&cbe_lun->options, req->num_be_args, req->kern_be_args);

	/*
	 * Recompute the HA primary/secondary role from the "ha_role"
	 * option (falling back to the shelf-wide default) and notify CTL
	 * only if the role actually changed.
	 */
	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
	value = ctl_get_opt(&cbe_lun->options, "ha_role");
	if (value != NULL) {
		if (strcmp(value, "primary") == 0)
			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
		else
			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
	else
		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
			ctl_lun_primary(cbe_lun);
		else
			ctl_lun_secondary(cbe_lun);
	}

	/*
	 * If we should be serving media (primary role, or serialize-only
	 * HA mode), (re)open the backing store and bring the LUN online;
	 * otherwise drain I/O, close the backing store and go offline.
	 */
	oldsize = be_lun->size_blocks;
	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
		if (be_lun->vn == NULL)
			error = ctl_be_block_open(softc, be_lun, req);
		else if (vn_isdisk(be_lun->vn, &error))
			error = ctl_be_block_open_dev(be_lun, req);
		else if (be_lun->vn->v_type == VREG)
			error = ctl_be_block_open_file(be_lun, req);
		else
			error = EINVAL;
		if ((cbe_lun->flags & CTL_LUN_FLAG_OFFLINE) &&
		    be_lun->vn != NULL) {
			cbe_lun->flags &= ~CTL_LUN_FLAG_OFFLINE;
			ctl_lun_online(cbe_lun);
		}
	} else {
		if (be_lun->vn != NULL) {
			cbe_lun->flags |= CTL_LUN_FLAG_OFFLINE;
			ctl_lun_offline(cbe_lun);
			taskqueue_drain_all(be_lun->io_taskqueue);
			error = ctl_be_block_close(be_lun);
		} else
			error = 0;
	}
	if (be_lun->size_blocks != oldsize)
		ctl_lun_capacity_changed(cbe_lun);

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	/* Open/close problems are reported as a warning, not an error. */
	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;
	return (0);
}
2697
2698static void
2699ctl_be_block_lun_shutdown(void *be_lun)
2700{
2701	struct ctl_be_block_lun *lun;
2702	struct ctl_be_block_softc *softc;
2703
2704	lun = (struct ctl_be_block_lun *)be_lun;
2705
2706	softc = lun->softc;
2707
2708	mtx_lock(&softc->lock);
2709	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2710	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2711		wakeup(lun);
2712	mtx_unlock(&softc->lock);
2713
2714}
2715
2716static void
2717ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2718{
2719	struct ctl_be_block_lun *lun;
2720	struct ctl_be_block_softc *softc;
2721
2722	lun = (struct ctl_be_block_lun *)be_lun;
2723	softc = lun->softc;
2724
2725	if (status == CTL_LUN_CONFIG_OK) {
2726		mtx_lock(&softc->lock);
2727		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2728		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2729			wakeup(lun);
2730		mtx_unlock(&softc->lock);
2731
2732		/*
2733		 * We successfully added the LUN, attempt to enable it.
2734		 */
2735		if (ctl_enable_lun(&lun->cbe_lun) != 0) {
2736			printf("%s: ctl_enable_lun() failed!\n", __func__);
2737			if (ctl_invalidate_lun(&lun->cbe_lun) != 0) {
2738				printf("%s: ctl_invalidate_lun() failed!\n",
2739				       __func__);
2740			}
2741		}
2742
2743		return;
2744	}
2745
2746
2747	mtx_lock(&softc->lock);
2748	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2749	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2750	wakeup(lun);
2751	mtx_unlock(&softc->lock);
2752}
2753
2754
static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *cbe_lun;
	int retval;

	retval = 0;

	DPRINTF("entered\n");

	/* Recover our per-LUN state stashed in the I/O's private area. */
	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
	case WRITE_SAME_10:
	case WRITE_SAME_16:
	case UNMAP:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		/* Queue for the worker thread; completed asynchronously. */
		mtx_lock(&be_lun->queue_lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->queue_lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;

		/* START/STOP just toggles LUN state; handled inline. */
		if (cdb->how & SSS_START)
			retval = ctl_start_lun(cbe_lun);
		else {
			retval = ctl_stop_lun(cbe_lun);
			/*
			 * XXX KDM Copan-specific offline behavior.
			 * Figure out a reasonable way to port this?
			 */
#ifdef NEEDTOPORT
			if ((retval == 0)
			 && (cdb->byte2 & SSS_ONOFFLINE))
				retval = ctl_lun_offline(cbe_lun);
#endif
		}

		/*
		 * In general, the above routines should not fail.  They
		 * just set state for the LUN.  So we've got something
		 * pretty wrong here if we can't start or stop the LUN.
		 */
		if (retval != 0) {
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xf051);
			retval = CTL_RETVAL_COMPLETE;
		} else {
			ctl_set_success(&io->scsiio);
		}
		ctl_config_write_done(io);
		break;
	}
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}
2835
2836static int
2837ctl_be_block_config_read(union ctl_io *io)
2838{
2839	struct ctl_be_block_lun *be_lun;
2840	struct ctl_be_lun *cbe_lun;
2841	int retval = 0;
2842
2843	DPRINTF("entered\n");
2844
2845	cbe_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2846		CTL_PRIV_BACKEND_LUN].ptr;
2847	be_lun = (struct ctl_be_block_lun *)cbe_lun->be_lun;
2848
2849	switch (io->scsiio.cdb[0]) {
2850	case SERVICE_ACTION_IN:
2851		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
2852			mtx_lock(&be_lun->queue_lock);
2853			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
2854			    &io->io_hdr, links);
2855			mtx_unlock(&be_lun->queue_lock);
2856			taskqueue_enqueue(be_lun->io_taskqueue,
2857			    &be_lun->io_task);
2858			retval = CTL_RETVAL_QUEUED;
2859			break;
2860		}
2861		ctl_set_invalid_field(&io->scsiio,
2862				      /*sks_valid*/ 1,
2863				      /*command*/ 1,
2864				      /*field*/ 1,
2865				      /*bit_valid*/ 1,
2866				      /*bit*/ 4);
2867		ctl_config_read_done(io);
2868		retval = CTL_RETVAL_COMPLETE;
2869		break;
2870	default:
2871		ctl_set_invalid_opcode(&io->scsiio);
2872		ctl_config_read_done(io);
2873		retval = CTL_RETVAL_COMPLETE;
2874		break;
2875	}
2876
2877	return (retval);
2878}
2879
2880static int
2881ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2882{
2883	struct ctl_be_block_lun *lun;
2884	int retval;
2885
2886	lun = (struct ctl_be_block_lun *)be_lun;
2887	retval = 0;
2888
2889	retval = sbuf_printf(sb, "\t<num_threads>");
2890
2891	if (retval != 0)
2892		goto bailout;
2893
2894	retval = sbuf_printf(sb, "%d", lun->num_threads);
2895
2896	if (retval != 0)
2897		goto bailout;
2898
2899	retval = sbuf_printf(sb, "</num_threads>\n");
2900
2901bailout:
2902
2903	return (retval);
2904}
2905
2906static uint64_t
2907ctl_be_block_lun_attr(void *be_lun, const char *attrname)
2908{
2909	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun;
2910
2911	if (lun->getattr == NULL)
2912		return (UINT64_MAX);
2913	return (lun->getattr(lun, attrname));
2914}
2915
2916int
2917ctl_be_block_init(void)
2918{
2919	struct ctl_be_block_softc *softc;
2920	int retval;
2921
2922	softc = &backend_block_softc;
2923	retval = 0;
2924
2925	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2926	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2927	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2928	STAILQ_INIT(&softc->lun_list);
2929
2930	return (retval);
2931}
2932