ctl_backend_block.c revision 278672
1/*-
2 * Copyright (c) 2003 Silicon Graphics International Corp.
3 * Copyright (c) 2009-2011 Spectra Logic Corporation
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Edward Tomasz Napierala
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions, and the following disclaimer,
15 *    without modification.
16 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
17 *    substantially similar to the "NO WARRANTY" disclaimer below
18 *    ("Disclaimer") and any redistribution must be conditioned upon
19 *    including a substantially similar Disclaimer requirement for further
20 *    binary redistribution.
21 *
22 * NO WARRANTY
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
32 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGES.
34 *
35 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
36 */
37/*
38 * CAM Target Layer driver backend for block devices.
39 *
40 * Author: Ken Merry <ken@FreeBSD.org>
41 */
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: head/sys/cam/ctl/ctl_backend_block.c 278672 2015-02-13 13:26:23Z mav $");
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/kernel.h>
48#include <sys/types.h>
49#include <sys/kthread.h>
50#include <sys/bio.h>
51#include <sys/fcntl.h>
52#include <sys/limits.h>
53#include <sys/lock.h>
54#include <sys/mutex.h>
55#include <sys/condvar.h>
56#include <sys/malloc.h>
57#include <sys/conf.h>
58#include <sys/ioccom.h>
59#include <sys/queue.h>
60#include <sys/sbuf.h>
61#include <sys/endian.h>
62#include <sys/uio.h>
63#include <sys/buf.h>
64#include <sys/taskqueue.h>
65#include <sys/vnode.h>
66#include <sys/namei.h>
67#include <sys/mount.h>
68#include <sys/disk.h>
69#include <sys/fcntl.h>
70#include <sys/filedesc.h>
71#include <sys/filio.h>
72#include <sys/proc.h>
73#include <sys/pcpu.h>
74#include <sys/module.h>
75#include <sys/sdt.h>
76#include <sys/devicestat.h>
77#include <sys/sysctl.h>
78
79#include <geom/geom.h>
80
81#include <cam/cam.h>
82#include <cam/scsi/scsi_all.h>
83#include <cam/scsi/scsi_da.h>
84#include <cam/ctl/ctl_io.h>
85#include <cam/ctl/ctl.h>
86#include <cam/ctl/ctl_backend.h>
87#include <cam/ctl/ctl_frontend_internal.h>
88#include <cam/ctl/ctl_ioctl.h>
89#include <cam/ctl/ctl_scsi_all.h>
90#include <cam/ctl/ctl_error.h>
91
92/*
93 * The idea here is that we'll allocate enough S/G space to hold a 1MB
94 * I/O.  If we get an I/O larger than that, we'll split it.
95 */
/* Half of the largest single backend I/O, in bytes (512KB). */
#define	CTLBLK_HALF_IO_SIZE	(512 * 1024)
/* Largest single backend I/O: two halves, i.e. 1MB. */
#define	CTLBLK_MAX_IO_SIZE	(2 * CTLBLK_HALF_IO_SIZE)
/* Upper bound on the size of one S/G segment. */
#define	CTLBLK_MAX_SEG		MAXPHYS
/* Segments needed to cover one half; never fewer than one. */
#define	CTLBLK_HALF_SEGS	MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1)
/* Total S/G slots per I/O (both halves). */
#define	CTLBLK_MAX_SEGS		(2 * CTLBLK_HALF_SEGS)
101
/*
 * Debug trace macro.  With CTLBLK_DEBUG defined it prints the message
 * prefixed by "cbb(<function>:<line>): "; otherwise it expands to an
 * empty statement.
 */
#ifndef CTLBLK_DEBUG
#define DPRINTF(fmt, args...)	do {} while(0)
#else
#define DPRINTF(fmt, args...)						\
	printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#endif
108
/*
 * Accessors for the per-I/O private areas inside the CTL I/O header:
 * PRIV() views the CTL_PRIV_BACKEND slot as a pointer/length/flags record,
 * ARGS() views the CTL_PRIV_LBA_LEN slot as an LBA/length/flags record.
 */
#define PRIV(io)							\
	((struct ctl_ptr_len_flags *)					\
	    &(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define ARGS(io)							\
	((struct ctl_lba_len_flags *)					\
	    &(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])

/* DTrace provider under which this file's SDT probes are defined. */
SDT_PROVIDER_DEFINE(cbb);
115
/*
 * Per-LUN state bits (kept in ctl_be_block_lun.flags).  Each value is a
 * distinct bit so several can be set at once.
 * NOTE(review): the bits are set and tested outside this part of the file;
 * their meaning is inferred from the names only.
 */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 1 << 0,
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 1 << 1,
	CTL_BE_BLOCK_LUN_WAITING	= 1 << 2,
	CTL_BE_BLOCK_LUN_MULTI_THREAD	= 1 << 3
} ctl_be_block_lun_flags;
122
/* Kind of backing store behind a LUN (kept in ctl_be_block_lun.dev_type). */
typedef enum {
	CTL_BE_BLOCK_NONE = 0,	/* no backing store */
	CTL_BE_BLOCK_DEV  = 1,	/* character/block device */
	CTL_BE_BLOCK_FILE = 2	/* regular file */
} ctl_be_block_type;
128
/* Backing-store state used when the LUN sits on a device node. */
struct ctl_be_block_devdata {
	struct cdev	*cdev;		/* device that bios and ioctls target */
	struct cdevsw	*csw;		/* its switch (d_strategy, d_read, ...) */
	int		 dev_ref;	/* NOTE(review): presumably a device
					   reference token; set elsewhere */
};

/* Backing-store state used when the LUN sits on a regular file. */
struct ctl_be_block_filedata {
	struct ucred	*cred;		/* credentials for VOP_READ/VOP_WRITE */
};

/* Only the member matching the LUN's backing type is meaningful. */
union ctl_be_block_bedata {
	struct ctl_be_block_devdata	dev;
	struct ctl_be_block_filedata	file;
};
143
struct ctl_be_block_lun;
struct ctl_be_block_io;

/* Per-backing-type handler that carries out one backend I/O on a LUN. */
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
    struct ctl_be_block_io *beio);

/* Per-backing-type query for a named numeric LUN attribute. */
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
    const char *attrname);
151
152/*
153 * Backend LUN structure.  There is a 1:1 mapping between a block device
154 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
155 */
156struct ctl_be_block_lun {
157	struct ctl_lun_create_params params;
158	struct ctl_block_disk *disk;
159	char lunname[32];
160	char *dev_path;
161	ctl_be_block_type dev_type;
162	struct vnode *vn;
163	union ctl_be_block_bedata backend;
164	cbb_dispatch_t dispatch;
165	cbb_dispatch_t lun_flush;
166	cbb_dispatch_t unmap;
167	cbb_dispatch_t get_lba_status;
168	cbb_getattr_t getattr;
169	uma_zone_t lun_zone;
170	uint64_t size_blocks;
171	uint64_t size_bytes;
172	uint32_t blocksize;
173	int blocksize_shift;
174	uint16_t pblockexp;
175	uint16_t pblockoff;
176	uint16_t ublockexp;
177	uint16_t ublockoff;
178	uint32_t atomicblock;
179	uint32_t opttxferlen;
180	struct ctl_be_block_softc *softc;
181	struct devstat *disk_stats;
182	ctl_be_block_lun_flags flags;
183	STAILQ_ENTRY(ctl_be_block_lun) links;
184	struct ctl_be_lun ctl_be_lun;
185	struct taskqueue *io_taskqueue;
186	struct task io_task;
187	int num_threads;
188	STAILQ_HEAD(, ctl_io_hdr) input_queue;
189	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
190	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
191	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
192	struct mtx_padalign io_lock;
193	struct mtx_padalign queue_lock;
194};
195
196/*
197 * Overall softc structure for the block backend module.
198 */
199struct ctl_be_block_softc {
200	struct mtx			 lock;
201	int				 num_disks;
202	STAILQ_HEAD(, ctl_block_disk)	 disk_list;
203	int				 num_luns;
204	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
205};
206
207static struct ctl_be_block_softc backend_block_softc;
208
209/*
210 * Per-I/O information.
211 */
212struct ctl_be_block_io {
213	union ctl_io			*io;
214	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
215	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
216	int				bio_cmd;
217	int				num_segs;
218	int				num_bios_sent;
219	int				num_bios_done;
220	int				send_complete;
221	int				num_errors;
222	struct bintime			ds_t0;
223	devstat_tag_type		ds_tag_type;
224	devstat_trans_flags		ds_trans_type;
225	uint64_t			io_len;
226	uint64_t			io_offset;
227	struct ctl_be_block_softc	*softc;
228	struct ctl_be_block_lun		*lun;
229	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
230};
231
232static int cbb_num_threads = 14;
233SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
234	    "CAM Target Layer Block Backend");
235SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
236           &cbb_num_threads, 0, "Number of threads per backing file");
237
238static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
239static void ctl_free_beio(struct ctl_be_block_io *beio);
240static void ctl_complete_beio(struct ctl_be_block_io *beio);
241static int ctl_be_block_move_done(union ctl_io *io);
242static void ctl_be_block_biodone(struct bio *bio);
243static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
244				    struct ctl_be_block_io *beio);
245static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
246				       struct ctl_be_block_io *beio);
247static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
248				  struct ctl_be_block_io *beio);
249static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
250					 const char *attrname);
251static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
252				   struct ctl_be_block_io *beio);
253static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
254				   struct ctl_be_block_io *beio);
255static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
256				      struct ctl_be_block_io *beio);
257static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
258					 const char *attrname);
259static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
260				    union ctl_io *io);
261static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
262				    union ctl_io *io);
263static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
264				  union ctl_io *io);
265static void ctl_be_block_worker(void *context, int pending);
266static int ctl_be_block_submit(union ctl_io *io);
267static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
268				   int flag, struct thread *td);
269static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
270				  struct ctl_lun_req *req);
271static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
272				 struct ctl_lun_req *req);
273static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
274static int ctl_be_block_open(struct ctl_be_block_softc *softc,
275			     struct ctl_be_block_lun *be_lun,
276			     struct ctl_lun_req *req);
277static int ctl_be_block_create(struct ctl_be_block_softc *softc,
278			       struct ctl_lun_req *req);
279static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
280			   struct ctl_lun_req *req);
281static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
282				  struct ctl_lun_req *req);
283static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
284				 struct ctl_lun_req *req);
285static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
286			   struct ctl_lun_req *req);
287static void ctl_be_block_lun_shutdown(void *be_lun);
288static void ctl_be_block_lun_config_status(void *be_lun,
289					   ctl_lun_config_status status);
290static int ctl_be_block_config_write(union ctl_io *io);
291static int ctl_be_block_config_read(union ctl_io *io);
292static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
293static uint64_t ctl_be_block_lun_attr(void *be_lun, const char *attrname);
294int ctl_be_block_init(void);
295
296static struct ctl_backend_driver ctl_be_block_driver =
297{
298	.name = "block",
299	.flags = CTL_BE_FLAG_HAS_CONFIG,
300	.init = ctl_be_block_init,
301	.data_submit = ctl_be_block_submit,
302	.data_move_done = ctl_be_block_move_done,
303	.config_read = ctl_be_block_config_read,
304	.config_write = ctl_be_block_config_write,
305	.ioctl = ctl_be_block_ioctl,
306	.lun_info = ctl_be_block_lun_info,
307	.lun_attr = ctl_be_block_lun_attr
308};
309
310MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
311CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
312
313static uma_zone_t beio_zone;
314
315static struct ctl_be_block_io *
316ctl_alloc_beio(struct ctl_be_block_softc *softc)
317{
318	struct ctl_be_block_io *beio;
319
320	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
321	beio->softc = softc;
322	return (beio);
323}
324
325static void
326ctl_free_beio(struct ctl_be_block_io *beio)
327{
328	int duplicate_free;
329	int i;
330
331	duplicate_free = 0;
332
333	for (i = 0; i < beio->num_segs; i++) {
334		if (beio->sg_segs[i].addr == NULL)
335			duplicate_free++;
336
337		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
338		beio->sg_segs[i].addr = NULL;
339
340		/* For compare we had two equal S/G lists. */
341		if (ARGS(beio->io)->flags & CTL_LLF_COMPARE) {
342			uma_zfree(beio->lun->lun_zone,
343			    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr);
344			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr = NULL;
345		}
346	}
347
348	if (duplicate_free > 0) {
349		printf("%s: %d duplicate frees out of %d segments\n", __func__,
350		       duplicate_free, beio->num_segs);
351	}
352
353	uma_zfree(beio_zone, beio);
354}
355
356static void
357ctl_complete_beio(struct ctl_be_block_io *beio)
358{
359	union ctl_io *io = beio->io;
360
361	if (beio->beio_cont != NULL) {
362		beio->beio_cont(beio);
363	} else {
364		ctl_free_beio(beio);
365		ctl_data_submit_done(io);
366	}
367}
368
369static int
370ctl_be_block_move_done(union ctl_io *io)
371{
372	struct ctl_be_block_io *beio;
373	struct ctl_be_block_lun *be_lun;
374	struct ctl_lba_len_flags *lbalen;
375#ifdef CTL_TIME_IO
376	struct bintime cur_bt;
377#endif
378	int i;
379
380	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
381	be_lun = beio->lun;
382
383	DPRINTF("entered\n");
384
385#ifdef CTL_TIME_IO
386	getbintime(&cur_bt);
387	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
388	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
389	io->io_hdr.num_dmas++;
390#endif
391	io->scsiio.kern_rel_offset += io->scsiio.kern_data_len;
392
393	/*
394	 * We set status at this point for read commands, and write
395	 * commands with errors.
396	 */
397	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
398		;
399	} else if ((io->io_hdr.port_status == 0) &&
400	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
401		lbalen = ARGS(beio->io);
402		if (lbalen->flags & CTL_LLF_READ) {
403			ctl_set_success(&io->scsiio);
404		} else if (lbalen->flags & CTL_LLF_COMPARE) {
405			/* We have two data blocks ready for comparison. */
406			for (i = 0; i < beio->num_segs; i++) {
407				if (memcmp(beio->sg_segs[i].addr,
408				    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
409				    beio->sg_segs[i].len) != 0)
410					break;
411			}
412			if (i < beio->num_segs)
413				ctl_set_sense(&io->scsiio,
414				    /*current_error*/ 1,
415				    /*sense_key*/ SSD_KEY_MISCOMPARE,
416				    /*asc*/ 0x1D,
417				    /*ascq*/ 0x00,
418				    SSD_ELEM_NONE);
419			else
420				ctl_set_success(&io->scsiio);
421		}
422	} else if ((io->io_hdr.port_status != 0) &&
423	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
424	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
425		/*
426		 * For hardware error sense keys, the sense key
427		 * specific value is defined to be a retry count,
428		 * but we use it to pass back an internal FETD
429		 * error code.  XXX KDM  Hopefully the FETD is only
430		 * using 16 bits for an error code, since that's
431		 * all the space we have in the sks field.
432		 */
433		ctl_set_internal_failure(&io->scsiio,
434					 /*sks_valid*/ 1,
435					 /*retry_count*/
436					 io->io_hdr.port_status);
437	}
438
439	/*
440	 * If this is a read, or a write with errors, it is done.
441	 */
442	if ((beio->bio_cmd == BIO_READ)
443	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
444	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
445		ctl_complete_beio(beio);
446		return (0);
447	}
448
449	/*
450	 * At this point, we have a write and the DMA completed
451	 * successfully.  We now have to queue it to the task queue to
452	 * execute the backend I/O.  That is because we do blocking
453	 * memory allocations, and in the file backing case, blocking I/O.
454	 * This move done routine is generally called in the SIM's
455	 * interrupt context, and therefore we cannot block.
456	 */
457	mtx_lock(&be_lun->queue_lock);
458	/*
459	 * XXX KDM make sure that links is okay to use at this point.
460	 * Otherwise, we either need to add another field to ctl_io_hdr,
461	 * or deal with resource allocation here.
462	 */
463	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
464	mtx_unlock(&be_lun->queue_lock);
465
466	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
467
468	return (0);
469}
470
471static void
472ctl_be_block_biodone(struct bio *bio)
473{
474	struct ctl_be_block_io *beio;
475	struct ctl_be_block_lun *be_lun;
476	union ctl_io *io;
477	int error;
478
479	beio = bio->bio_caller1;
480	be_lun = beio->lun;
481	io = beio->io;
482
483	DPRINTF("entered\n");
484
485	error = bio->bio_error;
486	mtx_lock(&be_lun->io_lock);
487	if (error != 0)
488		beio->num_errors++;
489
490	beio->num_bios_done++;
491
492	/*
493	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
494	 * during the free might cause it to complain.
495	 */
496	g_destroy_bio(bio);
497
498	/*
499	 * If the send complete bit isn't set, or we aren't the last I/O to
500	 * complete, then we're done.
501	 */
502	if ((beio->send_complete == 0)
503	 || (beio->num_bios_done < beio->num_bios_sent)) {
504		mtx_unlock(&be_lun->io_lock);
505		return;
506	}
507
508	/*
509	 * At this point, we've verified that we are the last I/O to
510	 * complete, so it's safe to drop the lock.
511	 */
512	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
513	    beio->ds_tag_type, beio->ds_trans_type,
514	    /*now*/ NULL, /*then*/&beio->ds_t0);
515	mtx_unlock(&be_lun->io_lock);
516
517	/*
518	 * If there are any errors from the backing device, we fail the
519	 * entire I/O with a medium error.
520	 */
521	if (beio->num_errors > 0) {
522		if (error == EOPNOTSUPP) {
523			ctl_set_invalid_opcode(&io->scsiio);
524		} else if (error == ENOSPC) {
525			ctl_set_space_alloc_fail(&io->scsiio);
526		} else if (beio->bio_cmd == BIO_FLUSH) {
527			/* XXX KDM is there is a better error here? */
528			ctl_set_internal_failure(&io->scsiio,
529						 /*sks_valid*/ 1,
530						 /*retry_count*/ 0xbad2);
531		} else
532			ctl_set_medium_error(&io->scsiio);
533		ctl_complete_beio(beio);
534		return;
535	}
536
537	/*
538	 * If this is a write, a flush, a delete or verify, we're all done.
539	 * If this is a read, we can now send the data to the user.
540	 */
541	if ((beio->bio_cmd == BIO_WRITE)
542	 || (beio->bio_cmd == BIO_FLUSH)
543	 || (beio->bio_cmd == BIO_DELETE)
544	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
545		ctl_set_success(&io->scsiio);
546		ctl_complete_beio(beio);
547	} else {
548		if ((ARGS(io)->flags & CTL_LLF_READ) &&
549		    beio->beio_cont == NULL)
550			ctl_set_success(&io->scsiio);
551#ifdef CTL_TIME_IO
552        	getbintime(&io->io_hdr.dma_start_bt);
553#endif
554		ctl_datamove(io);
555	}
556}
557
558static void
559ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
560			struct ctl_be_block_io *beio)
561{
562	union ctl_io *io = beio->io;
563	struct mount *mountpoint;
564	int error, lock_flags;
565
566	DPRINTF("entered\n");
567
568	binuptime(&beio->ds_t0);
569	mtx_lock(&be_lun->io_lock);
570	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
571	mtx_unlock(&be_lun->io_lock);
572
573	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
574
575	if (MNT_SHARED_WRITES(mountpoint)
576	 || ((mountpoint == NULL)
577	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
578		lock_flags = LK_SHARED;
579	else
580		lock_flags = LK_EXCLUSIVE;
581
582	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
583
584	error = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
585	VOP_UNLOCK(be_lun->vn, 0);
586
587	vn_finished_write(mountpoint);
588
589	mtx_lock(&be_lun->io_lock);
590	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
591	    beio->ds_tag_type, beio->ds_trans_type,
592	    /*now*/ NULL, /*then*/&beio->ds_t0);
593	mtx_unlock(&be_lun->io_lock);
594
595	if (error == 0)
596		ctl_set_success(&io->scsiio);
597	else {
598		/* XXX KDM is there is a better error here? */
599		ctl_set_internal_failure(&io->scsiio,
600					 /*sks_valid*/ 1,
601					 /*retry_count*/ 0xbad1);
602	}
603
604	ctl_complete_beio(beio);
605}
606
607SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t");
608SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t");
609SDT_PROBE_DEFINE1(cbb, kernel, read, file_done,"uint64_t");
610SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t");
611
612static void
613ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
614			   struct ctl_be_block_io *beio)
615{
616	struct ctl_be_block_filedata *file_data;
617	union ctl_io *io;
618	struct uio xuio;
619	struct iovec *xiovec;
620	int flags;
621	int error, i;
622
623	DPRINTF("entered\n");
624
625	file_data = &be_lun->backend.file;
626	io = beio->io;
627	flags = 0;
628	if (ARGS(io)->flags & CTL_LLF_DPO)
629		flags |= IO_DIRECT;
630	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
631		flags |= IO_SYNC;
632
633	bzero(&xuio, sizeof(xuio));
634	if (beio->bio_cmd == BIO_READ) {
635		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
636		xuio.uio_rw = UIO_READ;
637	} else {
638		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
639		xuio.uio_rw = UIO_WRITE;
640	}
641	xuio.uio_offset = beio->io_offset;
642	xuio.uio_resid = beio->io_len;
643	xuio.uio_segflg = UIO_SYSSPACE;
644	xuio.uio_iov = beio->xiovecs;
645	xuio.uio_iovcnt = beio->num_segs;
646	xuio.uio_td = curthread;
647
648	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
649		xiovec->iov_base = beio->sg_segs[i].addr;
650		xiovec->iov_len = beio->sg_segs[i].len;
651	}
652
653	binuptime(&beio->ds_t0);
654	mtx_lock(&be_lun->io_lock);
655	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
656	mtx_unlock(&be_lun->io_lock);
657
658	if (beio->bio_cmd == BIO_READ) {
659		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
660
661		/*
662		 * UFS pays attention to IO_DIRECT for reads.  If the
663		 * DIRECTIO option is configured into the kernel, it calls
664		 * ffs_rawread().  But that only works for single-segment
665		 * uios with user space addresses.  In our case, with a
666		 * kernel uio, it still reads into the buffer cache, but it
667		 * will just try to release the buffer from the cache later
668		 * on in ffs_read().
669		 *
670		 * ZFS does not pay attention to IO_DIRECT for reads.
671		 *
672		 * UFS does not pay attention to IO_SYNC for reads.
673		 *
674		 * ZFS pays attention to IO_SYNC (which translates into the
675		 * Solaris define FRSYNC for zfs_read()) for reads.  It
676		 * attempts to sync the file before reading.
677		 *
678		 * So, to attempt to provide some barrier semantics in the
679		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
680		 */
681		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);
682
683		VOP_UNLOCK(be_lun->vn, 0);
684		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
685	} else {
686		struct mount *mountpoint;
687		int lock_flags;
688
689		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
690
691		if (MNT_SHARED_WRITES(mountpoint)
692		 || ((mountpoint == NULL)
693		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
694			lock_flags = LK_SHARED;
695		else
696			lock_flags = LK_EXCLUSIVE;
697
698		vn_lock(be_lun->vn, lock_flags | LK_RETRY);
699
700		/*
701		 * UFS pays attention to IO_DIRECT for writes.  The write
702		 * is done asynchronously.  (Normally the write would just
703		 * get put into cache.
704		 *
705		 * UFS pays attention to IO_SYNC for writes.  It will
706		 * attempt to write the buffer out synchronously if that
707		 * flag is set.
708		 *
709		 * ZFS does not pay attention to IO_DIRECT for writes.
710		 *
711		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
712		 * for writes.  It will flush the transaction from the
713		 * cache before returning.
714		 *
715		 * So if we've got the BIO_ORDERED flag set, we want
716		 * IO_SYNC in either the UFS or ZFS case.
717		 */
718		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
719		VOP_UNLOCK(be_lun->vn, 0);
720
721		vn_finished_write(mountpoint);
722		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
723        }
724
725	mtx_lock(&be_lun->io_lock);
726	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
727	    beio->ds_tag_type, beio->ds_trans_type,
728	    /*now*/ NULL, /*then*/&beio->ds_t0);
729	mtx_unlock(&be_lun->io_lock);
730
731	/*
732	 * If we got an error, set the sense data to "MEDIUM ERROR" and
733	 * return the I/O to the user.
734	 */
735	if (error != 0) {
736		char path_str[32];
737
738		ctl_scsi_path_string(io, path_str, sizeof(path_str));
739		printf("%s%s command returned errno %d\n", path_str,
740		       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
741		if (error == ENOSPC) {
742			ctl_set_space_alloc_fail(&io->scsiio);
743		} else
744			ctl_set_medium_error(&io->scsiio);
745		ctl_complete_beio(beio);
746		return;
747	}
748
749	/*
750	 * If this is a write or a verify, we're all done.
751	 * If this is a read, we can now send the data to the user.
752	 */
753	if ((beio->bio_cmd == BIO_WRITE) ||
754	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
755		ctl_set_success(&io->scsiio);
756		ctl_complete_beio(beio);
757	} else {
758		if ((ARGS(io)->flags & CTL_LLF_READ) &&
759		    beio->beio_cont == NULL)
760			ctl_set_success(&io->scsiio);
761#ifdef CTL_TIME_IO
762        	getbintime(&io->io_hdr.dma_start_bt);
763#endif
764		ctl_datamove(io);
765	}
766}
767
768static void
769ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
770			struct ctl_be_block_io *beio)
771{
772	union ctl_io *io = beio->io;
773	struct ctl_lba_len_flags *lbalen = ARGS(io);
774	struct scsi_get_lba_status_data *data;
775	off_t roff, off;
776	int error, status;
777
778	DPRINTF("entered\n");
779
780	off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
781	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
782	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
783	    0, curthread->td_ucred, curthread);
784	if (error == 0 && off > roff)
785		status = 0;	/* mapped up to off */
786	else {
787		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
788		    0, curthread->td_ucred, curthread);
789		if (error == 0 && off > roff)
790			status = 1;	/* deallocated up to off */
791		else {
792			status = 0;	/* unknown up to the end */
793			off = be_lun->size_bytes;
794		}
795	}
796	VOP_UNLOCK(be_lun->vn, 0);
797
798	off >>= be_lun->blocksize_shift;
799	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
800	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
801	scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
802	    data->descr[0].length);
803	data->descr[0].status = status;
804
805	ctl_complete_beio(beio);
806}
807
808static uint64_t
809ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
810{
811	struct vattr		vattr;
812	struct statfs		statfs;
813	int			error;
814
815	if (be_lun->vn == NULL)
816		return (UINT64_MAX);
817	if (strcmp(attrname, "blocksused") == 0) {
818		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
819		if (error != 0)
820			return (UINT64_MAX);
821		return (vattr.va_bytes >> be_lun->blocksize_shift);
822	}
823	if (strcmp(attrname, "blocksavail") == 0) {
824		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
825		if (error != 0)
826			return (UINT64_MAX);
827		return ((statfs.f_bavail * statfs.f_bsize) >>
828		    be_lun->blocksize_shift);
829	}
830	return (UINT64_MAX);
831}
832
833static void
834ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
835			   struct ctl_be_block_io *beio)
836{
837	struct ctl_be_block_devdata *dev_data;
838	union ctl_io *io;
839	struct uio xuio;
840	struct iovec *xiovec;
841	int flags;
842	int error, i;
843
844	DPRINTF("entered\n");
845
846	dev_data = &be_lun->backend.dev;
847	io = beio->io;
848	flags = 0;
849	if (ARGS(io)->flags & CTL_LLF_DPO)
850		flags |= IO_DIRECT;
851	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
852		flags |= IO_SYNC;
853
854	bzero(&xuio, sizeof(xuio));
855	if (beio->bio_cmd == BIO_READ) {
856		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
857		xuio.uio_rw = UIO_READ;
858	} else {
859		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
860		xuio.uio_rw = UIO_WRITE;
861	}
862	xuio.uio_offset = beio->io_offset;
863	xuio.uio_resid = beio->io_len;
864	xuio.uio_segflg = UIO_SYSSPACE;
865	xuio.uio_iov = beio->xiovecs;
866	xuio.uio_iovcnt = beio->num_segs;
867	xuio.uio_td = curthread;
868
869	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
870		xiovec->iov_base = beio->sg_segs[i].addr;
871		xiovec->iov_len = beio->sg_segs[i].len;
872	}
873
874	binuptime(&beio->ds_t0);
875	mtx_lock(&be_lun->io_lock);
876	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
877	mtx_unlock(&be_lun->io_lock);
878
879	if (beio->bio_cmd == BIO_READ) {
880		error = (*dev_data->csw->d_read)(dev_data->cdev, &xuio, flags);
881		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
882	} else {
883		error = (*dev_data->csw->d_write)(dev_data->cdev, &xuio, flags);
884		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
885	}
886
887	mtx_lock(&be_lun->io_lock);
888	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
889	    beio->ds_tag_type, beio->ds_trans_type,
890	    /*now*/ NULL, /*then*/&beio->ds_t0);
891	mtx_unlock(&be_lun->io_lock);
892
893	/*
894	 * If we got an error, set the sense data to "MEDIUM ERROR" and
895	 * return the I/O to the user.
896	 */
897	if (error != 0) {
898		if (error == ENOSPC) {
899			ctl_set_space_alloc_fail(&io->scsiio);
900		} else
901			ctl_set_medium_error(&io->scsiio);
902		ctl_complete_beio(beio);
903		return;
904	}
905
906	/*
907	 * If this is a write or a verify, we're all done.
908	 * If this is a read, we can now send the data to the user.
909	 */
910	if ((beio->bio_cmd == BIO_WRITE) ||
911	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
912		ctl_set_success(&io->scsiio);
913		ctl_complete_beio(beio);
914	} else {
915		if ((ARGS(io)->flags & CTL_LLF_READ) &&
916		    beio->beio_cont == NULL)
917			ctl_set_success(&io->scsiio);
918#ifdef CTL_TIME_IO
919        	getbintime(&io->io_hdr.dma_start_bt);
920#endif
921		ctl_datamove(io);
922	}
923}
924
925static void
926ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
927			struct ctl_be_block_io *beio)
928{
929	struct ctl_be_block_devdata *dev_data = &be_lun->backend.dev;
930	union ctl_io *io = beio->io;
931	struct ctl_lba_len_flags *lbalen = ARGS(io);
932	struct scsi_get_lba_status_data *data;
933	off_t roff, off;
934	int error, status;
935
936	DPRINTF("entered\n");
937
938	off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
939	error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKHOLE,
940	    (caddr_t)&off, FREAD, curthread);
941	if (error == 0 && off > roff)
942		status = 0;	/* mapped up to off */
943	else {
944		error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKDATA,
945		    (caddr_t)&off, FREAD, curthread);
946		if (error == 0 && off > roff)
947			status = 1;	/* deallocated up to off */
948		else {
949			status = 0;	/* unknown up to the end */
950			off = be_lun->size_bytes;
951		}
952	}
953
954	off >>= be_lun->blocksize_shift;
955	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
956	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
957	scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
958	    data->descr[0].length);
959	data->descr[0].status = status;
960
961	ctl_complete_beio(beio);
962}
963
964static void
965ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
966		       struct ctl_be_block_io *beio)
967{
968	struct bio *bio;
969	union ctl_io *io;
970	struct ctl_be_block_devdata *dev_data;
971
972	dev_data = &be_lun->backend.dev;
973	io = beio->io;
974
975	DPRINTF("entered\n");
976
977	/* This can't fail, it's a blocking allocation. */
978	bio = g_alloc_bio();
979
980	bio->bio_cmd	    = BIO_FLUSH;
981	bio->bio_flags	   |= BIO_ORDERED;
982	bio->bio_dev	    = dev_data->cdev;
983	bio->bio_offset	    = 0;
984	bio->bio_data	    = 0;
985	bio->bio_done	    = ctl_be_block_biodone;
986	bio->bio_caller1    = beio;
987	bio->bio_pblkno	    = 0;
988
989	/*
990	 * We don't need to acquire the LUN lock here, because we are only
991	 * sending one bio, and so there is no other context to synchronize
992	 * with.
993	 */
994	beio->num_bios_sent = 1;
995	beio->send_complete = 1;
996
997	binuptime(&beio->ds_t0);
998	mtx_lock(&be_lun->io_lock);
999	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1000	mtx_unlock(&be_lun->io_lock);
1001
1002	(*dev_data->csw->d_strategy)(bio);
1003}
1004
1005static void
1006ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
1007		       struct ctl_be_block_io *beio,
1008		       uint64_t off, uint64_t len, int last)
1009{
1010	struct bio *bio;
1011	struct ctl_be_block_devdata *dev_data;
1012	uint64_t maxlen;
1013
1014	dev_data = &be_lun->backend.dev;
1015	maxlen = LONG_MAX - (LONG_MAX % be_lun->blocksize);
1016	while (len > 0) {
1017		bio = g_alloc_bio();
1018		bio->bio_cmd	    = BIO_DELETE;
1019		bio->bio_dev	    = dev_data->cdev;
1020		bio->bio_offset	    = off;
1021		bio->bio_length	    = MIN(len, maxlen);
1022		bio->bio_data	    = 0;
1023		bio->bio_done	    = ctl_be_block_biodone;
1024		bio->bio_caller1    = beio;
1025		bio->bio_pblkno     = off / be_lun->blocksize;
1026
1027		off += bio->bio_length;
1028		len -= bio->bio_length;
1029
1030		mtx_lock(&be_lun->io_lock);
1031		beio->num_bios_sent++;
1032		if (last && len == 0)
1033			beio->send_complete = 1;
1034		mtx_unlock(&be_lun->io_lock);
1035
1036		(*dev_data->csw->d_strategy)(bio);
1037	}
1038}
1039
1040static void
1041ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
1042		       struct ctl_be_block_io *beio)
1043{
1044	union ctl_io *io;
1045	struct ctl_be_block_devdata *dev_data;
1046	struct ctl_ptr_len_flags *ptrlen;
1047	struct scsi_unmap_desc *buf, *end;
1048	uint64_t len;
1049
1050	dev_data = &be_lun->backend.dev;
1051	io = beio->io;
1052
1053	DPRINTF("entered\n");
1054
1055	binuptime(&beio->ds_t0);
1056	mtx_lock(&be_lun->io_lock);
1057	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1058	mtx_unlock(&be_lun->io_lock);
1059
1060	if (beio->io_offset == -1) {
1061		beio->io_len = 0;
1062		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1063		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
1064		end = buf + ptrlen->len / sizeof(*buf);
1065		for (; buf < end; buf++) {
1066			len = (uint64_t)scsi_4btoul(buf->length) *
1067			    be_lun->blocksize;
1068			beio->io_len += len;
1069			ctl_be_block_unmap_dev_range(be_lun, beio,
1070			    scsi_8btou64(buf->lba) * be_lun->blocksize, len,
1071			    (end - buf < 2) ? TRUE : FALSE);
1072		}
1073	} else
1074		ctl_be_block_unmap_dev_range(be_lun, beio,
1075		    beio->io_offset, beio->io_len, TRUE);
1076}
1077
1078static void
1079ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
1080			  struct ctl_be_block_io *beio)
1081{
1082	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
1083	int i;
1084	struct bio *bio;
1085	struct ctl_be_block_devdata *dev_data;
1086	off_t cur_offset;
1087	int max_iosize;
1088
1089	DPRINTF("entered\n");
1090
1091	dev_data = &be_lun->backend.dev;
1092
1093	/*
1094	 * We have to limit our I/O size to the maximum supported by the
1095	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
1096	 * set it properly, use DFLTPHYS.
1097	 */
1098	max_iosize = dev_data->cdev->si_iosize_max;
1099	if (max_iosize < PAGE_SIZE)
1100		max_iosize = DFLTPHYS;
1101
1102	cur_offset = beio->io_offset;
1103	for (i = 0; i < beio->num_segs; i++) {
1104		size_t cur_size;
1105		uint8_t *cur_ptr;
1106
1107		cur_size = beio->sg_segs[i].len;
1108		cur_ptr = beio->sg_segs[i].addr;
1109
1110		while (cur_size > 0) {
1111			/* This can't fail, it's a blocking allocation. */
1112			bio = g_alloc_bio();
1113
1114			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
1115
1116			bio->bio_cmd = beio->bio_cmd;
1117			bio->bio_dev = dev_data->cdev;
1118			bio->bio_caller1 = beio;
1119			bio->bio_length = min(cur_size, max_iosize);
1120			bio->bio_offset = cur_offset;
1121			bio->bio_data = cur_ptr;
1122			bio->bio_done = ctl_be_block_biodone;
1123			bio->bio_pblkno = cur_offset / be_lun->blocksize;
1124
1125			cur_offset += bio->bio_length;
1126			cur_ptr += bio->bio_length;
1127			cur_size -= bio->bio_length;
1128
1129			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
1130			beio->num_bios_sent++;
1131		}
1132	}
1133	binuptime(&beio->ds_t0);
1134	mtx_lock(&be_lun->io_lock);
1135	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1136	beio->send_complete = 1;
1137	mtx_unlock(&be_lun->io_lock);
1138
1139	/*
1140	 * Fire off all allocated requests!
1141	 */
1142	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
1143		TAILQ_REMOVE(&queue, bio, bio_queue);
1144		(*dev_data->csw->d_strategy)(bio);
1145	}
1146}
1147
1148static uint64_t
1149ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1150{
1151	struct ctl_be_block_devdata	*dev_data = &be_lun->backend.dev;
1152	struct diocgattr_arg	arg;
1153	int			error;
1154
1155	if (dev_data->csw == NULL || dev_data->csw->d_ioctl == NULL)
1156		return (UINT64_MAX);
1157	strlcpy(arg.name, attrname, sizeof(arg.name));
1158	arg.len = sizeof(arg.value.off);
1159	error = dev_data->csw->d_ioctl(dev_data->cdev,
1160	    DIOCGATTR, (caddr_t)&arg, FREAD, curthread);
1161	if (error != 0)
1162		return (UINT64_MAX);
1163	return (arg.value.off);
1164}
1165
1166static void
1167ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1168{
1169	union ctl_io *io;
1170
1171	io = beio->io;
1172	ctl_free_beio(beio);
1173	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1174	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1175	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1176		ctl_config_write_done(io);
1177		return;
1178	}
1179
1180	ctl_be_block_config_write(io);
1181}
1182
1183static void
1184ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
1185			    union ctl_io *io)
1186{
1187	struct ctl_be_block_io *beio;
1188	struct ctl_be_block_softc *softc;
1189	struct ctl_lba_len_flags *lbalen;
1190	uint64_t len_left, lba;
1191	uint32_t pb, pbo, adj;
1192	int i, seglen;
1193	uint8_t *buf, *end;
1194
1195	DPRINTF("entered\n");
1196
1197	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1198	softc = be_lun->softc;
1199	lbalen = ARGS(beio->io);
1200
1201	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
1202	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
1203		ctl_free_beio(beio);
1204		ctl_set_invalid_field(&io->scsiio,
1205				      /*sks_valid*/ 1,
1206				      /*command*/ 1,
1207				      /*field*/ 1,
1208				      /*bit_valid*/ 0,
1209				      /*bit*/ 0);
1210		ctl_config_write_done(io);
1211		return;
1212	}
1213
1214	switch (io->scsiio.tag_type) {
1215	case CTL_TAG_ORDERED:
1216		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1217		break;
1218	case CTL_TAG_HEAD_OF_QUEUE:
1219		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1220		break;
1221	case CTL_TAG_UNTAGGED:
1222	case CTL_TAG_SIMPLE:
1223	case CTL_TAG_ACA:
1224	default:
1225		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1226		break;
1227	}
1228
1229	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
1230		beio->io_offset = lbalen->lba * be_lun->blocksize;
1231		beio->io_len = (uint64_t)lbalen->len * be_lun->blocksize;
1232		beio->bio_cmd = BIO_DELETE;
1233		beio->ds_trans_type = DEVSTAT_FREE;
1234
1235		be_lun->unmap(be_lun, beio);
1236		return;
1237	}
1238
1239	beio->bio_cmd = BIO_WRITE;
1240	beio->ds_trans_type = DEVSTAT_WRITE;
1241
1242	DPRINTF("WRITE SAME at LBA %jx len %u\n",
1243	       (uintmax_t)lbalen->lba, lbalen->len);
1244
1245	pb = be_lun->blocksize << be_lun->pblockexp;
1246	if (be_lun->pblockoff > 0)
1247		pbo = pb - be_lun->blocksize * be_lun->pblockoff;
1248	else
1249		pbo = 0;
1250	len_left = (uint64_t)lbalen->len * be_lun->blocksize;
1251	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
1252
1253		/*
1254		 * Setup the S/G entry for this chunk.
1255		 */
1256		seglen = MIN(CTLBLK_MAX_SEG, len_left);
1257		if (pb > be_lun->blocksize) {
1258			adj = ((lbalen->lba + lba) * be_lun->blocksize +
1259			    seglen - pbo) % pb;
1260			if (seglen > adj)
1261				seglen -= adj;
1262			else
1263				seglen -= seglen % be_lun->blocksize;
1264		} else
1265			seglen -= seglen % be_lun->blocksize;
1266		beio->sg_segs[i].len = seglen;
1267		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
1268
1269		DPRINTF("segment %d addr %p len %zd\n", i,
1270			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1271
1272		beio->num_segs++;
1273		len_left -= seglen;
1274
1275		buf = beio->sg_segs[i].addr;
1276		end = buf + seglen;
1277		for (; buf < end; buf += be_lun->blocksize) {
1278			memcpy(buf, io->scsiio.kern_data_ptr, be_lun->blocksize);
1279			if (lbalen->flags & SWS_LBDATA)
1280				scsi_ulto4b(lbalen->lba + lba, buf);
1281			lba++;
1282		}
1283	}
1284
1285	beio->io_offset = lbalen->lba * be_lun->blocksize;
1286	beio->io_len = lba * be_lun->blocksize;
1287
1288	/* We can not do all in one run. Correct and schedule rerun. */
1289	if (len_left > 0) {
1290		lbalen->lba += lba;
1291		lbalen->len -= lba;
1292		beio->beio_cont = ctl_be_block_cw_done_ws;
1293	}
1294
1295	be_lun->dispatch(be_lun, beio);
1296}
1297
1298static void
1299ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
1300			    union ctl_io *io)
1301{
1302	struct ctl_be_block_io *beio;
1303	struct ctl_be_block_softc *softc;
1304	struct ctl_ptr_len_flags *ptrlen;
1305
1306	DPRINTF("entered\n");
1307
1308	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1309	softc = be_lun->softc;
1310	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1311
1312	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
1313		ctl_free_beio(beio);
1314		ctl_set_invalid_field(&io->scsiio,
1315				      /*sks_valid*/ 0,
1316				      /*command*/ 1,
1317				      /*field*/ 0,
1318				      /*bit_valid*/ 0,
1319				      /*bit*/ 0);
1320		ctl_config_write_done(io);
1321		return;
1322	}
1323
1324	switch (io->scsiio.tag_type) {
1325	case CTL_TAG_ORDERED:
1326		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1327		break;
1328	case CTL_TAG_HEAD_OF_QUEUE:
1329		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1330		break;
1331	case CTL_TAG_UNTAGGED:
1332	case CTL_TAG_SIMPLE:
1333	case CTL_TAG_ACA:
1334	default:
1335		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1336		break;
1337	}
1338
1339	beio->io_len = 0;
1340	beio->io_offset = -1;
1341
1342	beio->bio_cmd = BIO_DELETE;
1343	beio->ds_trans_type = DEVSTAT_FREE;
1344
1345	DPRINTF("UNMAP\n");
1346
1347	be_lun->unmap(be_lun, beio);
1348}
1349
1350static void
1351ctl_be_block_cr_done(struct ctl_be_block_io *beio)
1352{
1353	union ctl_io *io;
1354
1355	io = beio->io;
1356	ctl_free_beio(beio);
1357	ctl_config_read_done(io);
1358}
1359
1360static void
1361ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
1362			 union ctl_io *io)
1363{
1364	struct ctl_be_block_io *beio;
1365	struct ctl_be_block_softc *softc;
1366
1367	DPRINTF("entered\n");
1368
1369	softc = be_lun->softc;
1370	beio = ctl_alloc_beio(softc);
1371	beio->io = io;
1372	beio->lun = be_lun;
1373	beio->beio_cont = ctl_be_block_cr_done;
1374	PRIV(io)->ptr = (void *)beio;
1375
1376	switch (io->scsiio.cdb[0]) {
1377	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
1378		beio->bio_cmd = -1;
1379		beio->ds_trans_type = DEVSTAT_NO_DATA;
1380		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1381		beio->io_len = 0;
1382		if (be_lun->get_lba_status)
1383			be_lun->get_lba_status(be_lun, beio);
1384		else
1385			ctl_be_block_cr_done(beio);
1386		break;
1387	default:
1388		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1389		break;
1390	}
1391}
1392
1393static void
1394ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1395{
1396	union ctl_io *io;
1397
1398	io = beio->io;
1399	ctl_free_beio(beio);
1400	ctl_config_write_done(io);
1401}
1402
1403static void
1404ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
1405			 union ctl_io *io)
1406{
1407	struct ctl_be_block_io *beio;
1408	struct ctl_be_block_softc *softc;
1409
1410	DPRINTF("entered\n");
1411
1412	softc = be_lun->softc;
1413	beio = ctl_alloc_beio(softc);
1414	beio->io = io;
1415	beio->lun = be_lun;
1416	beio->beio_cont = ctl_be_block_cw_done;
1417	PRIV(io)->ptr = (void *)beio;
1418
1419	switch (io->scsiio.cdb[0]) {
1420	case SYNCHRONIZE_CACHE:
1421	case SYNCHRONIZE_CACHE_16:
1422		beio->bio_cmd = BIO_FLUSH;
1423		beio->ds_trans_type = DEVSTAT_NO_DATA;
1424		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1425		beio->io_len = 0;
1426		be_lun->lun_flush(be_lun, beio);
1427		break;
1428	case WRITE_SAME_10:
1429	case WRITE_SAME_16:
1430		ctl_be_block_cw_dispatch_ws(be_lun, io);
1431		break;
1432	case UNMAP:
1433		ctl_be_block_cw_dispatch_unmap(be_lun, io);
1434		break;
1435	default:
1436		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1437		break;
1438	}
1439}
1440
/*
 * SDT probe definitions for the read/write data path, all under provider
 * "cbb" and module "kernel".  Each probe is declared with one uint64_t
 * argument.  ctl_be_block_dispatch() fires the "start" probe on entry and
 * the "alloc_done" probe once the S/G buffers have been set up, using the
 * read or write variant depending on the direction of the I/O.
 */
SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t");
1445
1446static void
1447ctl_be_block_next(struct ctl_be_block_io *beio)
1448{
1449	struct ctl_be_block_lun *be_lun;
1450	union ctl_io *io;
1451
1452	io = beio->io;
1453	be_lun = beio->lun;
1454	ctl_free_beio(beio);
1455	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1456	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1457	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1458		ctl_data_submit_done(io);
1459		return;
1460	}
1461
1462	io->io_hdr.status &= ~CTL_STATUS_MASK;
1463	io->io_hdr.status |= CTL_STATUS_NONE;
1464
1465	mtx_lock(&be_lun->queue_lock);
1466	/*
1467	 * XXX KDM make sure that links is okay to use at this point.
1468	 * Otherwise, we either need to add another field to ctl_io_hdr,
1469	 * or deal with resource allocation here.
1470	 */
1471	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1472	mtx_unlock(&be_lun->queue_lock);
1473
1474	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1475}
1476
1477static void
1478ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
1479			   union ctl_io *io)
1480{
1481	struct ctl_be_block_io *beio;
1482	struct ctl_be_block_softc *softc;
1483	struct ctl_lba_len_flags *lbalen;
1484	struct ctl_ptr_len_flags *bptrlen;
1485	uint64_t len_left, lbas;
1486	int i;
1487
1488	softc = be_lun->softc;
1489
1490	DPRINTF("entered\n");
1491
1492	lbalen = ARGS(io);
1493	if (lbalen->flags & CTL_LLF_WRITE) {
1494		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
1495	} else {
1496		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
1497	}
1498
1499	beio = ctl_alloc_beio(softc);
1500	beio->io = io;
1501	beio->lun = be_lun;
1502	bptrlen = PRIV(io);
1503	bptrlen->ptr = (void *)beio;
1504
1505	switch (io->scsiio.tag_type) {
1506	case CTL_TAG_ORDERED:
1507		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1508		break;
1509	case CTL_TAG_HEAD_OF_QUEUE:
1510		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1511		break;
1512	case CTL_TAG_UNTAGGED:
1513	case CTL_TAG_SIMPLE:
1514	case CTL_TAG_ACA:
1515	default:
1516		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1517		break;
1518	}
1519
1520	if (lbalen->flags & CTL_LLF_WRITE) {
1521		beio->bio_cmd = BIO_WRITE;
1522		beio->ds_trans_type = DEVSTAT_WRITE;
1523	} else {
1524		beio->bio_cmd = BIO_READ;
1525		beio->ds_trans_type = DEVSTAT_READ;
1526	}
1527
1528	DPRINTF("%s at LBA %jx len %u @%ju\n",
1529	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
1530	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
1531	if (lbalen->flags & CTL_LLF_COMPARE)
1532		lbas = CTLBLK_HALF_IO_SIZE;
1533	else
1534		lbas = CTLBLK_MAX_IO_SIZE;
1535	lbas = MIN(lbalen->len - bptrlen->len, lbas / be_lun->blocksize);
1536	beio->io_offset = (lbalen->lba + bptrlen->len) * be_lun->blocksize;
1537	beio->io_len = lbas * be_lun->blocksize;
1538	bptrlen->len += lbas;
1539
1540	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
1541		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
1542		    i, CTLBLK_MAX_SEGS));
1543
1544		/*
1545		 * Setup the S/G entry for this chunk.
1546		 */
1547		beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left);
1548		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
1549
1550		DPRINTF("segment %d addr %p len %zd\n", i,
1551			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1552
1553		/* Set up second segment for compare operation. */
1554		if (lbalen->flags & CTL_LLF_COMPARE) {
1555			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
1556			    beio->sg_segs[i].len;
1557			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
1558			    uma_zalloc(be_lun->lun_zone, M_WAITOK);
1559		}
1560
1561		beio->num_segs++;
1562		len_left -= beio->sg_segs[i].len;
1563	}
1564	if (bptrlen->len < lbalen->len)
1565		beio->beio_cont = ctl_be_block_next;
1566	io->scsiio.be_move_done = ctl_be_block_move_done;
1567	/* For compare we have separate S/G lists for read and datamove. */
1568	if (lbalen->flags & CTL_LLF_COMPARE)
1569		io->scsiio.kern_data_ptr = (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
1570	else
1571		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
1572	io->scsiio.kern_data_len = beio->io_len;
1573	io->scsiio.kern_data_resid = 0;
1574	io->scsiio.kern_sg_entries = beio->num_segs;
1575	io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
1576
1577	/*
1578	 * For the read case, we need to read the data into our buffers and
1579	 * then we can send it back to the user.  For the write case, we
1580	 * need to get the data from the user first.
1581	 */
1582	if (beio->bio_cmd == BIO_READ) {
1583		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
1584		be_lun->dispatch(be_lun, beio);
1585	} else {
1586		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
1587#ifdef CTL_TIME_IO
1588        	getbintime(&io->io_hdr.dma_start_bt);
1589#endif
1590		ctl_datamove(io);
1591	}
1592}
1593
1594static void
1595ctl_be_block_worker(void *context, int pending)
1596{
1597	struct ctl_be_block_lun *be_lun;
1598	struct ctl_be_block_softc *softc;
1599	union ctl_io *io;
1600
1601	be_lun = (struct ctl_be_block_lun *)context;
1602	softc = be_lun->softc;
1603
1604	DPRINTF("entered\n");
1605
1606	mtx_lock(&be_lun->queue_lock);
1607	for (;;) {
1608		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
1609		if (io != NULL) {
1610			struct ctl_be_block_io *beio;
1611
1612			DPRINTF("datamove queue\n");
1613
1614			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
1615				      ctl_io_hdr, links);
1616
1617			mtx_unlock(&be_lun->queue_lock);
1618
1619			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1620
1621			be_lun->dispatch(be_lun, beio);
1622
1623			mtx_lock(&be_lun->queue_lock);
1624			continue;
1625		}
1626		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
1627		if (io != NULL) {
1628			DPRINTF("config write queue\n");
1629			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
1630				      ctl_io_hdr, links);
1631			mtx_unlock(&be_lun->queue_lock);
1632			ctl_be_block_cw_dispatch(be_lun, io);
1633			mtx_lock(&be_lun->queue_lock);
1634			continue;
1635		}
1636		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
1637		if (io != NULL) {
1638			DPRINTF("config read queue\n");
1639			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
1640				      ctl_io_hdr, links);
1641			mtx_unlock(&be_lun->queue_lock);
1642			ctl_be_block_cr_dispatch(be_lun, io);
1643			mtx_lock(&be_lun->queue_lock);
1644			continue;
1645		}
1646		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
1647		if (io != NULL) {
1648			DPRINTF("input queue\n");
1649
1650			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
1651				      ctl_io_hdr, links);
1652			mtx_unlock(&be_lun->queue_lock);
1653
1654			/*
1655			 * We must drop the lock, since this routine and
1656			 * its children may sleep.
1657			 */
1658			ctl_be_block_dispatch(be_lun, io);
1659
1660			mtx_lock(&be_lun->queue_lock);
1661			continue;
1662		}
1663
1664		/*
1665		 * If we get here, there is no work left in the queues, so
1666		 * just break out and let the task queue go to sleep.
1667		 */
1668		break;
1669	}
1670	mtx_unlock(&be_lun->queue_lock);
1671}
1672
1673/*
1674 * Entry point from CTL to the backend for I/O.  We queue everything to a
1675 * work thread, so this just puts the I/O on a queue and wakes up the
1676 * thread.
1677 */
1678static int
1679ctl_be_block_submit(union ctl_io *io)
1680{
1681	struct ctl_be_block_lun *be_lun;
1682	struct ctl_be_lun *ctl_be_lun;
1683
1684	DPRINTF("entered\n");
1685
1686	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
1687		CTL_PRIV_BACKEND_LUN].ptr;
1688	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
1689
1690	/*
1691	 * Make sure we only get SCSI I/O.
1692	 */
1693	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
1694		"%#x) encountered", io->io_hdr.io_type));
1695
1696	PRIV(io)->len = 0;
1697
1698	mtx_lock(&be_lun->queue_lock);
1699	/*
1700	 * XXX KDM make sure that links is okay to use at this point.
1701	 * Otherwise, we either need to add another field to ctl_io_hdr,
1702	 * or deal with resource allocation here.
1703	 */
1704	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1705	mtx_unlock(&be_lun->queue_lock);
1706	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1707
1708	return (CTL_RETVAL_COMPLETE);
1709}
1710
1711static int
1712ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
1713			int flag, struct thread *td)
1714{
1715	struct ctl_be_block_softc *softc;
1716	int error;
1717
1718	softc = &backend_block_softc;
1719
1720	error = 0;
1721
1722	switch (cmd) {
1723	case CTL_LUN_REQ: {
1724		struct ctl_lun_req *lun_req;
1725
1726		lun_req = (struct ctl_lun_req *)addr;
1727
1728		switch (lun_req->reqtype) {
1729		case CTL_LUNREQ_CREATE:
1730			error = ctl_be_block_create(softc, lun_req);
1731			break;
1732		case CTL_LUNREQ_RM:
1733			error = ctl_be_block_rm(softc, lun_req);
1734			break;
1735		case CTL_LUNREQ_MODIFY:
1736			error = ctl_be_block_modify(softc, lun_req);
1737			break;
1738		default:
1739			lun_req->status = CTL_LUN_ERROR;
1740			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
1741				 "invalid LUN request type %d",
1742				 lun_req->reqtype);
1743			break;
1744		}
1745		break;
1746	}
1747	default:
1748		error = ENOTTY;
1749		break;
1750	}
1751
1752	return (error);
1753}
1754
1755static int
1756ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1757{
1758	struct ctl_be_block_filedata *file_data;
1759	struct ctl_lun_create_params *params;
1760	char			     *value;
1761	struct vattr		      vattr;
1762	off_t			      ps, pss, po, pos, us, uss, uo, uos;
1763	int			      error;
1764
1765	error = 0;
1766	file_data = &be_lun->backend.file;
1767	params = &be_lun->params;
1768
1769	be_lun->dev_type = CTL_BE_BLOCK_FILE;
1770	be_lun->dispatch = ctl_be_block_dispatch_file;
1771	be_lun->lun_flush = ctl_be_block_flush_file;
1772	be_lun->get_lba_status = ctl_be_block_gls_file;
1773	be_lun->getattr = ctl_be_block_getattr_file;
1774
1775	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
1776	if (error != 0) {
1777		snprintf(req->error_str, sizeof(req->error_str),
1778			 "error calling VOP_GETATTR() for file %s",
1779			 be_lun->dev_path);
1780		return (error);
1781	}
1782
1783	/*
1784	 * Verify that we have the ability to upgrade to exclusive
1785	 * access on this file so we can trap errors at open instead
1786	 * of reporting them during first access.
1787	 */
1788	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
1789		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
1790		if (be_lun->vn->v_iflag & VI_DOOMED) {
1791			error = EBADF;
1792			snprintf(req->error_str, sizeof(req->error_str),
1793				 "error locking file %s", be_lun->dev_path);
1794			return (error);
1795		}
1796	}
1797
1798
1799	file_data->cred = crhold(curthread->td_ucred);
1800	if (params->lun_size_bytes != 0)
1801		be_lun->size_bytes = params->lun_size_bytes;
1802	else
1803		be_lun->size_bytes = vattr.va_size;
1804	/*
1805	 * We set the multi thread flag for file operations because all
1806	 * filesystems (in theory) are capable of allowing multiple readers
1807	 * of a file at once.  So we want to get the maximum possible
1808	 * concurrency.
1809	 */
1810	be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;
1811
1812	/*
1813	 * For files we can use any logical block size.  Prefer 512 bytes
1814	 * for compatibility reasons.  If file's vattr.va_blocksize
1815	 * (preferred I/O block size) is bigger and multiple to chosen
1816	 * logical block size -- report it as physical block size.
1817	 */
1818	if (params->blocksize_bytes != 0)
1819		be_lun->blocksize = params->blocksize_bytes;
1820	else
1821		be_lun->blocksize = 512;
1822
1823	us = ps = vattr.va_blocksize;
1824	uo = po = 0;
1825
1826	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblocksize");
1827	if (value != NULL)
1828		ctl_expand_number(value, &ps);
1829	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblockoffset");
1830	if (value != NULL)
1831		ctl_expand_number(value, &po);
1832	pss = ps / be_lun->blocksize;
1833	pos = po / be_lun->blocksize;
1834	if ((pss > 0) && (pss * be_lun->blocksize == ps) && (pss >= pos) &&
1835	    ((pss & (pss - 1)) == 0) && (pos * be_lun->blocksize == po)) {
1836		be_lun->pblockexp = fls(pss) - 1;
1837		be_lun->pblockoff = (pss - pos) % pss;
1838	}
1839
1840	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublocksize");
1841	if (value != NULL)
1842		ctl_expand_number(value, &us);
1843	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublockoffset");
1844	if (value != NULL)
1845		ctl_expand_number(value, &uo);
1846	uss = us / be_lun->blocksize;
1847	uos = uo / be_lun->blocksize;
1848	if ((uss > 0) && (uss * be_lun->blocksize == us) && (uss >= uos) &&
1849	    ((uss & (uss - 1)) == 0) && (uos * be_lun->blocksize == uo)) {
1850		be_lun->ublockexp = fls(uss) - 1;
1851		be_lun->ublockoff = (uss - uos) % uss;
1852	}
1853
1854	/*
1855	 * Sanity check.  The media size has to be at least one
1856	 * sector long.
1857	 */
1858	if (be_lun->size_bytes < be_lun->blocksize) {
1859		error = EINVAL;
1860		snprintf(req->error_str, sizeof(req->error_str),
1861			 "file %s size %ju < block size %u", be_lun->dev_path,
1862			 (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
1863	}
1864
1865	be_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / be_lun->blocksize;
1866	return (error);
1867}
1868
1869static int
1870ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
1871{
1872	struct ctl_lun_create_params *params;
1873	struct vattr		      vattr;
1874	struct cdev		     *dev;
1875	struct cdevsw		     *devsw;
1876	char			     *value;
1877	int			      error, atomic, maxio, unmap;
1878	off_t			      ps, pss, po, pos, us, uss, uo, uos;
1879
1880	params = &be_lun->params;
1881
1882	be_lun->dev_type = CTL_BE_BLOCK_DEV;
1883	be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
1884	be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
1885					     &be_lun->backend.dev.dev_ref);
1886	if (be_lun->backend.dev.csw == NULL)
1887		panic("Unable to retrieve device switch");
1888	if (strcmp(be_lun->backend.dev.csw->d_name, "zvol") == 0) {
1889		be_lun->dispatch = ctl_be_block_dispatch_zvol;
1890		be_lun->get_lba_status = ctl_be_block_gls_zvol;
1891		atomic = maxio = CTLBLK_MAX_IO_SIZE;
1892	} else {
1893		be_lun->dispatch = ctl_be_block_dispatch_dev;
1894		atomic = 0;
1895		maxio = be_lun->backend.dev.cdev->si_iosize_max;
1896		if (maxio <= 0)
1897			maxio = DFLTPHYS;
1898		if (maxio > CTLBLK_MAX_IO_SIZE)
1899			maxio = CTLBLK_MAX_IO_SIZE;
1900	}
1901	be_lun->lun_flush = ctl_be_block_flush_dev;
1902	be_lun->getattr = ctl_be_block_getattr_dev;
1903
1904	error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
1905	if (error) {
1906		snprintf(req->error_str, sizeof(req->error_str),
1907			 "error getting vnode attributes for device %s",
1908			 be_lun->dev_path);
1909		return (error);
1910	}
1911
1912	dev = be_lun->vn->v_rdev;
1913	devsw = dev->si_devsw;
1914	if (!devsw->d_ioctl) {
1915		snprintf(req->error_str, sizeof(req->error_str),
1916			 "no d_ioctl for device %s!",
1917			 be_lun->dev_path);
1918		return (ENODEV);
1919	}
1920
1921	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
1922			       (caddr_t)&be_lun->blocksize, FREAD,
1923			       curthread);
1924	if (error) {
1925		snprintf(req->error_str, sizeof(req->error_str),
1926			 "error %d returned for DIOCGSECTORSIZE ioctl "
1927			 "on %s!", error, be_lun->dev_path);
1928		return (error);
1929	}
1930
1931	/*
1932	 * If the user has asked for a blocksize that is greater than the
1933	 * backing device's blocksize, we can do it only if the blocksize
1934	 * the user is asking for is an even multiple of the underlying
1935	 * device's blocksize.
1936	 */
1937	if ((params->blocksize_bytes != 0)
1938	 && (params->blocksize_bytes > be_lun->blocksize)) {
1939		uint32_t bs_multiple, tmp_blocksize;
1940
1941		bs_multiple = params->blocksize_bytes / be_lun->blocksize;
1942
1943		tmp_blocksize = bs_multiple * be_lun->blocksize;
1944
1945		if (tmp_blocksize == params->blocksize_bytes) {
1946			be_lun->blocksize = params->blocksize_bytes;
1947		} else {
1948			snprintf(req->error_str, sizeof(req->error_str),
1949				 "requested blocksize %u is not an even "
1950				 "multiple of backing device blocksize %u",
1951				 params->blocksize_bytes,
1952				 be_lun->blocksize);
1953			return (EINVAL);
1954
1955		}
1956	} else if ((params->blocksize_bytes != 0)
1957		&& (params->blocksize_bytes != be_lun->blocksize)) {
1958		snprintf(req->error_str, sizeof(req->error_str),
1959			 "requested blocksize %u < backing device "
1960			 "blocksize %u", params->blocksize_bytes,
1961			 be_lun->blocksize);
1962		return (EINVAL);
1963	}
1964
1965	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
1966			       (caddr_t)&be_lun->size_bytes, FREAD,
1967			       curthread);
1968	if (error) {
1969		snprintf(req->error_str, sizeof(req->error_str),
1970			 "error %d returned for DIOCGMEDIASIZE "
1971			 " ioctl on %s!", error,
1972			 be_lun->dev_path);
1973		return (error);
1974	}
1975
1976	if (params->lun_size_bytes != 0) {
1977		if (params->lun_size_bytes > be_lun->size_bytes) {
1978			snprintf(req->error_str, sizeof(req->error_str),
1979				 "requested LUN size %ju > backing device "
1980				 "size %ju",
1981				 (uintmax_t)params->lun_size_bytes,
1982				 (uintmax_t)be_lun->size_bytes);
1983			return (EINVAL);
1984		}
1985
1986		be_lun->size_bytes = params->lun_size_bytes;
1987	}
1988
1989	error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE,
1990			       (caddr_t)&ps, FREAD, curthread);
1991	if (error)
1992		ps = po = 0;
1993	else {
1994		error = devsw->d_ioctl(dev, DIOCGSTRIPEOFFSET,
1995				       (caddr_t)&po, FREAD, curthread);
1996		if (error)
1997			po = 0;
1998	}
1999	us = ps;
2000	uo = po;
2001
2002	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblocksize");
2003	if (value != NULL)
2004		ctl_expand_number(value, &ps);
2005	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "pblockoffset");
2006	if (value != NULL)
2007		ctl_expand_number(value, &po);
2008	pss = ps / be_lun->blocksize;
2009	pos = po / be_lun->blocksize;
2010	if ((pss > 0) && (pss * be_lun->blocksize == ps) && (pss >= pos) &&
2011	    ((pss & (pss - 1)) == 0) && (pos * be_lun->blocksize == po)) {
2012		be_lun->pblockexp = fls(pss) - 1;
2013		be_lun->pblockoff = (pss - pos) % pss;
2014	}
2015
2016	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublocksize");
2017	if (value != NULL)
2018		ctl_expand_number(value, &us);
2019	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "ublockoffset");
2020	if (value != NULL)
2021		ctl_expand_number(value, &uo);
2022	uss = us / be_lun->blocksize;
2023	uos = uo / be_lun->blocksize;
2024	if ((uss > 0) && (uss * be_lun->blocksize == us) && (uss >= uos) &&
2025	    ((uss & (uss - 1)) == 0) && (uos * be_lun->blocksize == uo)) {
2026		be_lun->ublockexp = fls(uss) - 1;
2027		be_lun->ublockoff = (uss - uos) % uss;
2028	}
2029
2030	be_lun->atomicblock = atomic / be_lun->blocksize;
2031	be_lun->opttxferlen = maxio / be_lun->blocksize;
2032
2033	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
2034		unmap = 1;
2035	} else {
2036		struct diocgattr_arg	arg;
2037
2038		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
2039		arg.len = sizeof(arg.value.i);
2040		error = devsw->d_ioctl(dev, DIOCGATTR,
2041		    (caddr_t)&arg, FREAD, curthread);
2042		unmap = (error == 0) ? arg.value.i : 0;
2043	}
2044	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "unmap");
2045	if (value != NULL)
2046		unmap = (strcmp(value, "on") == 0);
2047	if (unmap)
2048		be_lun->unmap = ctl_be_block_unmap_dev;
2049
2050	return (0);
2051}
2052
2053static int
2054ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2055{
2056	DROP_GIANT();
2057	if (be_lun->vn) {
2058		int flags = FREAD | FWRITE;
2059
2060		switch (be_lun->dev_type) {
2061		case CTL_BE_BLOCK_DEV:
2062			if (be_lun->backend.dev.csw) {
2063				dev_relthread(be_lun->backend.dev.cdev,
2064					      be_lun->backend.dev.dev_ref);
2065				be_lun->backend.dev.csw  = NULL;
2066				be_lun->backend.dev.cdev = NULL;
2067			}
2068			break;
2069		case CTL_BE_BLOCK_FILE:
2070			break;
2071		case CTL_BE_BLOCK_NONE:
2072			break;
2073		default:
2074			panic("Unexpected backend type.");
2075			break;
2076		}
2077
2078		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2079		be_lun->vn = NULL;
2080
2081		switch (be_lun->dev_type) {
2082		case CTL_BE_BLOCK_DEV:
2083			break;
2084		case CTL_BE_BLOCK_FILE:
2085			if (be_lun->backend.file.cred != NULL) {
2086				crfree(be_lun->backend.file.cred);
2087				be_lun->backend.file.cred = NULL;
2088			}
2089			break;
2090		case CTL_BE_BLOCK_NONE:
2091			break;
2092		default:
2093			panic("Unexpected backend type.");
2094			break;
2095		}
2096		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2097	}
2098	PICKUP_GIANT();
2099
2100	return (0);
2101}
2102
2103static int
2104ctl_be_block_open(struct ctl_be_block_softc *softc,
2105		       struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
2106{
2107	struct nameidata nd;
2108	int		 flags;
2109	int		 error;
2110
2111	/*
2112	 * XXX KDM allow a read-only option?
2113	 */
2114	flags = FREAD | FWRITE;
2115	error = 0;
2116
2117	if (rootvnode == NULL) {
2118		snprintf(req->error_str, sizeof(req->error_str),
2119			 "Root filesystem is not mounted");
2120		return (1);
2121	}
2122
2123	if (!curthread->td_proc->p_fd->fd_cdir) {
2124		curthread->td_proc->p_fd->fd_cdir = rootvnode;
2125		VREF(rootvnode);
2126	}
2127	if (!curthread->td_proc->p_fd->fd_rdir) {
2128		curthread->td_proc->p_fd->fd_rdir = rootvnode;
2129		VREF(rootvnode);
2130	}
2131	if (!curthread->td_proc->p_fd->fd_jdir) {
2132		curthread->td_proc->p_fd->fd_jdir = rootvnode;
2133		VREF(rootvnode);
2134	}
2135
2136 again:
2137	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
2138	error = vn_open(&nd, &flags, 0, NULL);
2139	if (error) {
2140		/*
2141		 * This is the only reasonable guess we can make as far as
2142		 * path if the user doesn't give us a fully qualified path.
2143		 * If they want to specify a file, they need to specify the
2144		 * full path.
2145		 */
2146		if (be_lun->dev_path[0] != '/') {
2147			char *dev_path = "/dev/";
2148			char *dev_name;
2149
2150			/* Try adding device path at beginning of name */
2151			dev_name = malloc(strlen(be_lun->dev_path)
2152					+ strlen(dev_path) + 1,
2153					  M_CTLBLK, M_WAITOK);
2154			if (dev_name) {
2155				sprintf(dev_name, "%s%s", dev_path,
2156					be_lun->dev_path);
2157				free(be_lun->dev_path, M_CTLBLK);
2158				be_lun->dev_path = dev_name;
2159				goto again;
2160			}
2161		}
2162		snprintf(req->error_str, sizeof(req->error_str),
2163		    "error opening %s: %d", be_lun->dev_path, error);
2164		return (error);
2165	}
2166
2167	NDFREE(&nd, NDF_ONLY_PNBUF);
2168
2169	be_lun->vn = nd.ni_vp;
2170
2171	/* We only support disks and files. */
2172	if (vn_isdisk(be_lun->vn, &error)) {
2173		error = ctl_be_block_open_dev(be_lun, req);
2174	} else if (be_lun->vn->v_type == VREG) {
2175		error = ctl_be_block_open_file(be_lun, req);
2176	} else {
2177		error = EINVAL;
2178		snprintf(req->error_str, sizeof(req->error_str),
2179			 "%s is not a disk or plain file", be_lun->dev_path);
2180	}
2181	VOP_UNLOCK(be_lun->vn, 0);
2182
2183	if (error != 0) {
2184		ctl_be_block_close(be_lun);
2185		return (error);
2186	}
2187
2188	be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
2189	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
2190
2191	return (0);
2192}
2193
2194static int
2195ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2196{
2197	struct ctl_be_block_lun *be_lun;
2198	struct ctl_lun_create_params *params;
2199	char num_thread_str[16];
2200	char tmpstr[32];
2201	char *value;
2202	int retval, num_threads;
2203	int tmp_num_threads;
2204
2205	params = &req->reqdata.create;
2206	retval = 0;
2207	req->status = CTL_LUN_OK;
2208
2209	num_threads = cbb_num_threads;
2210
2211	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2212
2213	be_lun->params = req->reqdata.create;
2214	be_lun->softc = softc;
2215	STAILQ_INIT(&be_lun->input_queue);
2216	STAILQ_INIT(&be_lun->config_read_queue);
2217	STAILQ_INIT(&be_lun->config_write_queue);
2218	STAILQ_INIT(&be_lun->datamove_queue);
2219	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
2220	mtx_init(&be_lun->io_lock, "cblk io lock", NULL, MTX_DEF);
2221	mtx_init(&be_lun->queue_lock, "cblk queue lock", NULL, MTX_DEF);
2222	ctl_init_opts(&be_lun->ctl_be_lun.options,
2223	    req->num_be_args, req->kern_be_args);
2224
2225	be_lun->lun_zone = uma_zcreate(be_lun->lunname, CTLBLK_MAX_SEG,
2226	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
2227
2228	if (be_lun->lun_zone == NULL) {
2229		snprintf(req->error_str, sizeof(req->error_str),
2230			 "error allocating UMA zone");
2231		goto bailout_error;
2232	}
2233
2234	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2235		be_lun->ctl_be_lun.lun_type = params->device_type;
2236	else
2237		be_lun->ctl_be_lun.lun_type = T_DIRECT;
2238
2239	if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
2240		value = ctl_get_opt(&be_lun->ctl_be_lun.options, "file");
2241		if (value == NULL) {
2242			snprintf(req->error_str, sizeof(req->error_str),
2243				 "no file argument specified");
2244			goto bailout_error;
2245		}
2246		be_lun->dev_path = strdup(value, M_CTLBLK);
2247		be_lun->blocksize = 512;
2248		be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
2249
2250		retval = ctl_be_block_open(softc, be_lun, req);
2251		if (retval != 0) {
2252			retval = 0;
2253			req->status = CTL_LUN_WARNING;
2254		}
2255	} else {
2256		/*
2257		 * For processor devices, we don't have any size.
2258		 */
2259		be_lun->blocksize = 0;
2260		be_lun->pblockexp = 0;
2261		be_lun->pblockoff = 0;
2262		be_lun->ublockexp = 0;
2263		be_lun->ublockoff = 0;
2264		be_lun->size_blocks = 0;
2265		be_lun->size_bytes = 0;
2266		be_lun->ctl_be_lun.maxlba = 0;
2267
2268		/*
2269		 * Default to just 1 thread for processor devices.
2270		 */
2271		num_threads = 1;
2272	}
2273
2274	/*
2275	 * XXX This searching loop might be refactored to be combined with
2276	 * the loop above,
2277	 */
2278	value = ctl_get_opt(&be_lun->ctl_be_lun.options, "num_threads");
2279	if (value != NULL) {
2280		tmp_num_threads = strtol(value, NULL, 0);
2281
2282		/*
2283		 * We don't let the user specify less than one
2284		 * thread, but hope he's clueful enough not to
2285		 * specify 1000 threads.
2286		 */
2287		if (tmp_num_threads < 1) {
2288			snprintf(req->error_str, sizeof(req->error_str),
2289				 "invalid number of threads %s",
2290				 num_thread_str);
2291			goto bailout_error;
2292		}
2293		num_threads = tmp_num_threads;
2294	}
2295
2296	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
2297	be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
2298	if (be_lun->vn == NULL)
2299		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_OFFLINE;
2300	if (be_lun->unmap != NULL)
2301		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_UNMAP;
2302	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
2303		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_SERSEQ_READ;
2304	be_lun->ctl_be_lun.be_lun = be_lun;
2305	be_lun->ctl_be_lun.maxlba = (be_lun->size_blocks == 0) ?
2306	    0 : (be_lun->size_blocks - 1);
2307	be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
2308	be_lun->ctl_be_lun.pblockexp = be_lun->pblockexp;
2309	be_lun->ctl_be_lun.pblockoff = be_lun->pblockoff;
2310	be_lun->ctl_be_lun.ublockexp = be_lun->ublockexp;
2311	be_lun->ctl_be_lun.ublockoff = be_lun->ublockoff;
2312	be_lun->ctl_be_lun.atomicblock = be_lun->atomicblock;
2313	be_lun->ctl_be_lun.opttxferlen = be_lun->opttxferlen;
2314	/* Tell the user the blocksize we ended up using */
2315	params->lun_size_bytes = be_lun->size_bytes;
2316	params->blocksize_bytes = be_lun->blocksize;
2317	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2318		be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
2319		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
2320	} else
2321		be_lun->ctl_be_lun.req_lun_id = 0;
2322
2323	be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
2324	be_lun->ctl_be_lun.lun_config_status =
2325		ctl_be_block_lun_config_status;
2326	be_lun->ctl_be_lun.be = &ctl_be_block_driver;
2327
2328	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2329		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
2330			 softc->num_luns);
2331		strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
2332			MIN(sizeof(be_lun->ctl_be_lun.serial_num),
2333			sizeof(tmpstr)));
2334
2335		/* Tell the user what we used for a serial number */
2336		strncpy((char *)params->serial_num, tmpstr,
2337			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2338	} else {
2339		strncpy((char *)be_lun->ctl_be_lun.serial_num,
2340			params->serial_num,
2341			MIN(sizeof(be_lun->ctl_be_lun.serial_num),
2342			sizeof(params->serial_num)));
2343	}
2344	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2345		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
2346		strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
2347			MIN(sizeof(be_lun->ctl_be_lun.device_id),
2348			sizeof(tmpstr)));
2349
2350		/* Tell the user what we used for a device ID */
2351		strncpy((char *)params->device_id, tmpstr,
2352			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2353	} else {
2354		strncpy((char *)be_lun->ctl_be_lun.device_id,
2355			params->device_id,
2356			MIN(sizeof(be_lun->ctl_be_lun.device_id),
2357			    sizeof(params->device_id)));
2358	}
2359
2360	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2361
2362	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
2363	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2364
2365	if (be_lun->io_taskqueue == NULL) {
2366		snprintf(req->error_str, sizeof(req->error_str),
2367			 "unable to create taskqueue");
2368		goto bailout_error;
2369	}
2370
2371	/*
2372	 * Note that we start the same number of threads by default for
2373	 * both the file case and the block device case.  For the file
2374	 * case, we need multiple threads to allow concurrency, because the
2375	 * vnode interface is designed to be a blocking interface.  For the
2376	 * block device case, ZFS zvols at least will block the caller's
2377	 * context in many instances, and so we need multiple threads to
2378	 * overcome that problem.  Other block devices don't need as many
2379	 * threads, but they shouldn't cause too many problems.
2380	 *
2381	 * If the user wants to just have a single thread for a block
2382	 * device, he can specify that when the LUN is created, or change
2383	 * the tunable/sysctl to alter the default number of threads.
2384	 */
2385	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
2386					 /*num threads*/num_threads,
2387					 /*priority*/PWAIT,
2388					 /*thread name*/
2389					 "%s taskq", be_lun->lunname);
2390
2391	if (retval != 0)
2392		goto bailout_error;
2393
2394	be_lun->num_threads = num_threads;
2395
2396	mtx_lock(&softc->lock);
2397	softc->num_luns++;
2398	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);
2399
2400	mtx_unlock(&softc->lock);
2401
2402	retval = ctl_add_lun(&be_lun->ctl_be_lun);
2403	if (retval != 0) {
2404		mtx_lock(&softc->lock);
2405		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2406			      links);
2407		softc->num_luns--;
2408		mtx_unlock(&softc->lock);
2409		snprintf(req->error_str, sizeof(req->error_str),
2410			 "ctl_add_lun() returned error %d, see dmesg for "
2411			 "details", retval);
2412		retval = 0;
2413		goto bailout_error;
2414	}
2415
2416	mtx_lock(&softc->lock);
2417
2418	/*
2419	 * Tell the config_status routine that we're waiting so it won't
2420	 * clean up the LUN in the event of an error.
2421	 */
2422	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2423
2424	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2425		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2426		if (retval == EINTR)
2427			break;
2428	}
2429	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2430
2431	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
2432		snprintf(req->error_str, sizeof(req->error_str),
2433			 "LUN configuration error, see dmesg for details");
2434		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
2435			      links);
2436		softc->num_luns--;
2437		mtx_unlock(&softc->lock);
2438		goto bailout_error;
2439	} else {
2440		params->req_lun_id = be_lun->ctl_be_lun.lun_id;
2441	}
2442
2443	mtx_unlock(&softc->lock);
2444
2445	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
2446					       be_lun->blocksize,
2447					       DEVSTAT_ALL_SUPPORTED,
2448					       be_lun->ctl_be_lun.lun_type
2449					       | DEVSTAT_TYPE_IF_OTHER,
2450					       DEVSTAT_PRIORITY_OTHER);
2451
2452	return (retval);
2453
2454bailout_error:
2455	req->status = CTL_LUN_ERROR;
2456
2457	if (be_lun->io_taskqueue != NULL)
2458		taskqueue_free(be_lun->io_taskqueue);
2459	ctl_be_block_close(be_lun);
2460	if (be_lun->dev_path != NULL)
2461		free(be_lun->dev_path, M_CTLBLK);
2462	if (be_lun->lun_zone != NULL)
2463		uma_zdestroy(be_lun->lun_zone);
2464	ctl_free_opts(&be_lun->ctl_be_lun.options);
2465	mtx_destroy(&be_lun->queue_lock);
2466	mtx_destroy(&be_lun->io_lock);
2467	free(be_lun, M_CTLBLK);
2468
2469	return (retval);
2470}
2471
2472static int
2473ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2474{
2475	struct ctl_lun_rm_params *params;
2476	struct ctl_be_block_lun *be_lun;
2477	int retval;
2478
2479	params = &req->reqdata.rm;
2480
2481	mtx_lock(&softc->lock);
2482
2483	be_lun = NULL;
2484
2485	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2486		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
2487			break;
2488	}
2489	mtx_unlock(&softc->lock);
2490
2491	if (be_lun == NULL) {
2492		snprintf(req->error_str, sizeof(req->error_str),
2493			 "LUN %u is not managed by the block backend",
2494			 params->lun_id);
2495		goto bailout_error;
2496	}
2497
2498	retval = ctl_disable_lun(&be_lun->ctl_be_lun);
2499
2500	if (retval != 0) {
2501		snprintf(req->error_str, sizeof(req->error_str),
2502			 "error %d returned from ctl_disable_lun() for "
2503			 "LUN %d", retval, params->lun_id);
2504		goto bailout_error;
2505
2506	}
2507
2508	retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
2509	if (retval != 0) {
2510		snprintf(req->error_str, sizeof(req->error_str),
2511			 "error %d returned from ctl_invalidate_lun() for "
2512			 "LUN %d", retval, params->lun_id);
2513		goto bailout_error;
2514	}
2515
2516	mtx_lock(&softc->lock);
2517
2518	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2519
2520	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2521                retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
2522                if (retval == EINTR)
2523                        break;
2524        }
2525
2526	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2527
2528	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2529		snprintf(req->error_str, sizeof(req->error_str),
2530			 "interrupted waiting for LUN to be freed");
2531		mtx_unlock(&softc->lock);
2532		goto bailout_error;
2533	}
2534
2535	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);
2536
2537	softc->num_luns--;
2538	mtx_unlock(&softc->lock);
2539
2540	taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);
2541
2542	taskqueue_free(be_lun->io_taskqueue);
2543
2544	ctl_be_block_close(be_lun);
2545
2546	if (be_lun->disk_stats != NULL)
2547		devstat_remove_entry(be_lun->disk_stats);
2548
2549	uma_zdestroy(be_lun->lun_zone);
2550
2551	ctl_free_opts(&be_lun->ctl_be_lun.options);
2552	free(be_lun->dev_path, M_CTLBLK);
2553	mtx_destroy(&be_lun->queue_lock);
2554	mtx_destroy(&be_lun->io_lock);
2555	free(be_lun, M_CTLBLK);
2556
2557	req->status = CTL_LUN_OK;
2558
2559	return (0);
2560
2561bailout_error:
2562
2563	req->status = CTL_LUN_ERROR;
2564
2565	return (0);
2566}
2567
2568static int
2569ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
2570			 struct ctl_lun_req *req)
2571{
2572	struct vattr vattr;
2573	int error;
2574	struct ctl_lun_create_params *params = &be_lun->params;
2575
2576	if (params->lun_size_bytes != 0) {
2577		be_lun->size_bytes = params->lun_size_bytes;
2578	} else  {
2579		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
2580		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
2581		VOP_UNLOCK(be_lun->vn, 0);
2582		if (error != 0) {
2583			snprintf(req->error_str, sizeof(req->error_str),
2584				 "error calling VOP_GETATTR() for file %s",
2585				 be_lun->dev_path);
2586			return (error);
2587		}
2588
2589		be_lun->size_bytes = vattr.va_size;
2590	}
2591
2592	return (0);
2593}
2594
2595static int
2596ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
2597			struct ctl_lun_req *req)
2598{
2599	struct ctl_be_block_devdata *dev_data;
2600	int error;
2601	struct ctl_lun_create_params *params = &be_lun->params;
2602	uint64_t size_bytes;
2603
2604	dev_data = &be_lun->backend.dev;
2605	if (!dev_data->csw->d_ioctl) {
2606		snprintf(req->error_str, sizeof(req->error_str),
2607			 "no d_ioctl for device %s!", be_lun->dev_path);
2608		return (ENODEV);
2609	}
2610
2611	error = dev_data->csw->d_ioctl(dev_data->cdev, DIOCGMEDIASIZE,
2612			       (caddr_t)&size_bytes, FREAD,
2613			       curthread);
2614	if (error) {
2615		snprintf(req->error_str, sizeof(req->error_str),
2616			 "error %d returned for DIOCGMEDIASIZE ioctl "
2617			 "on %s!", error, be_lun->dev_path);
2618		return (error);
2619	}
2620
2621	if (params->lun_size_bytes != 0) {
2622		if (params->lun_size_bytes > size_bytes) {
2623			snprintf(req->error_str, sizeof(req->error_str),
2624				 "requested LUN size %ju > backing device "
2625				 "size %ju",
2626				 (uintmax_t)params->lun_size_bytes,
2627				 (uintmax_t)size_bytes);
2628			return (EINVAL);
2629		}
2630
2631		be_lun->size_bytes = params->lun_size_bytes;
2632	} else {
2633		be_lun->size_bytes = size_bytes;
2634	}
2635
2636	return (0);
2637}
2638
2639static int
2640ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2641{
2642	struct ctl_lun_modify_params *params;
2643	struct ctl_be_block_lun *be_lun;
2644	uint64_t oldsize;
2645	int error;
2646
2647	params = &req->reqdata.modify;
2648
2649	mtx_lock(&softc->lock);
2650	be_lun = NULL;
2651	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
2652		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
2653			break;
2654	}
2655	mtx_unlock(&softc->lock);
2656
2657	if (be_lun == NULL) {
2658		snprintf(req->error_str, sizeof(req->error_str),
2659			 "LUN %u is not managed by the block backend",
2660			 params->lun_id);
2661		goto bailout_error;
2662	}
2663
2664	be_lun->params.lun_size_bytes = params->lun_size_bytes;
2665
2666	oldsize = be_lun->size_bytes;
2667	if (be_lun->vn == NULL)
2668		error = ctl_be_block_open(softc, be_lun, req);
2669	else if (be_lun->vn->v_type == VREG)
2670		error = ctl_be_block_modify_file(be_lun, req);
2671	else
2672		error = ctl_be_block_modify_dev(be_lun, req);
2673
2674	if (error == 0 && be_lun->size_bytes != oldsize) {
2675		be_lun->size_blocks = be_lun->size_bytes >>
2676		    be_lun->blocksize_shift;
2677
2678		/*
2679		 * The maximum LBA is the size - 1.
2680		 *
2681		 * XXX: Note that this field is being updated without locking,
2682		 * 	which might cause problems on 32-bit architectures.
2683		 */
2684		if (be_lun->unmap != NULL)
2685			be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_UNMAP;
2686		be_lun->ctl_be_lun.maxlba = (be_lun->size_blocks == 0) ?
2687		    0 : (be_lun->size_blocks - 1);
2688		be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
2689		be_lun->ctl_be_lun.pblockexp = be_lun->pblockexp;
2690		be_lun->ctl_be_lun.pblockoff = be_lun->pblockoff;
2691		be_lun->ctl_be_lun.ublockexp = be_lun->ublockexp;
2692		be_lun->ctl_be_lun.ublockoff = be_lun->ublockoff;
2693		be_lun->ctl_be_lun.atomicblock = be_lun->atomicblock;
2694		be_lun->ctl_be_lun.opttxferlen = be_lun->opttxferlen;
2695		ctl_lun_capacity_changed(&be_lun->ctl_be_lun);
2696		if (oldsize == 0 && be_lun->size_blocks != 0)
2697			ctl_lun_online(&be_lun->ctl_be_lun);
2698	}
2699
2700	/* Tell the user the exact size we ended up using */
2701	params->lun_size_bytes = be_lun->size_bytes;
2702
2703	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
2704
2705	return (0);
2706
2707bailout_error:
2708	req->status = CTL_LUN_ERROR;
2709
2710	return (0);
2711}
2712
2713static void
2714ctl_be_block_lun_shutdown(void *be_lun)
2715{
2716	struct ctl_be_block_lun *lun;
2717	struct ctl_be_block_softc *softc;
2718
2719	lun = (struct ctl_be_block_lun *)be_lun;
2720
2721	softc = lun->softc;
2722
2723	mtx_lock(&softc->lock);
2724	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
2725	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2726		wakeup(lun);
2727	mtx_unlock(&softc->lock);
2728
2729}
2730
2731static void
2732ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
2733{
2734	struct ctl_be_block_lun *lun;
2735	struct ctl_be_block_softc *softc;
2736
2737	lun = (struct ctl_be_block_lun *)be_lun;
2738	softc = lun->softc;
2739
2740	if (status == CTL_LUN_CONFIG_OK) {
2741		mtx_lock(&softc->lock);
2742		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2743		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
2744			wakeup(lun);
2745		mtx_unlock(&softc->lock);
2746
2747		/*
2748		 * We successfully added the LUN, attempt to enable it.
2749		 */
2750		if (ctl_enable_lun(&lun->ctl_be_lun) != 0) {
2751			printf("%s: ctl_enable_lun() failed!\n", __func__);
2752			if (ctl_invalidate_lun(&lun->ctl_be_lun) != 0) {
2753				printf("%s: ctl_invalidate_lun() failed!\n",
2754				       __func__);
2755			}
2756		}
2757
2758		return;
2759	}
2760
2761
2762	mtx_lock(&softc->lock);
2763	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
2764	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
2765	wakeup(lun);
2766	mtx_unlock(&softc->lock);
2767}
2768
2769
2770static int
2771ctl_be_block_config_write(union ctl_io *io)
2772{
2773	struct ctl_be_block_lun *be_lun;
2774	struct ctl_be_lun *ctl_be_lun;
2775	int retval;
2776
2777	retval = 0;
2778
2779	DPRINTF("entered\n");
2780
2781	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2782		CTL_PRIV_BACKEND_LUN].ptr;
2783	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
2784
2785	switch (io->scsiio.cdb[0]) {
2786	case SYNCHRONIZE_CACHE:
2787	case SYNCHRONIZE_CACHE_16:
2788	case WRITE_SAME_10:
2789	case WRITE_SAME_16:
2790	case UNMAP:
2791		/*
2792		 * The upper level CTL code will filter out any CDBs with
2793		 * the immediate bit set and return the proper error.
2794		 *
2795		 * We don't really need to worry about what LBA range the
2796		 * user asked to be synced out.  When they issue a sync
2797		 * cache command, we'll sync out the whole thing.
2798		 */
2799		mtx_lock(&be_lun->queue_lock);
2800		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
2801				   links);
2802		mtx_unlock(&be_lun->queue_lock);
2803		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
2804		break;
2805	case START_STOP_UNIT: {
2806		struct scsi_start_stop_unit *cdb;
2807
2808		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
2809
2810		if (cdb->how & SSS_START)
2811			retval = ctl_start_lun(ctl_be_lun);
2812		else {
2813			retval = ctl_stop_lun(ctl_be_lun);
2814			/*
2815			 * XXX KDM Copan-specific offline behavior.
2816			 * Figure out a reasonable way to port this?
2817			 */
2818#ifdef NEEDTOPORT
2819			if ((retval == 0)
2820			 && (cdb->byte2 & SSS_ONOFFLINE))
2821				retval = ctl_lun_offline(ctl_be_lun);
2822#endif
2823		}
2824
2825		/*
2826		 * In general, the above routines should not fail.  They
2827		 * just set state for the LUN.  So we've got something
2828		 * pretty wrong here if we can't start or stop the LUN.
2829		 */
2830		if (retval != 0) {
2831			ctl_set_internal_failure(&io->scsiio,
2832						 /*sks_valid*/ 1,
2833						 /*retry_count*/ 0xf051);
2834			retval = CTL_RETVAL_COMPLETE;
2835		} else {
2836			ctl_set_success(&io->scsiio);
2837		}
2838		ctl_config_write_done(io);
2839		break;
2840	}
2841	default:
2842		ctl_set_invalid_opcode(&io->scsiio);
2843		ctl_config_write_done(io);
2844		retval = CTL_RETVAL_COMPLETE;
2845		break;
2846	}
2847
2848	return (retval);
2849}
2850
2851static int
2852ctl_be_block_config_read(union ctl_io *io)
2853{
2854	struct ctl_be_block_lun *be_lun;
2855	struct ctl_be_lun *ctl_be_lun;
2856	int retval = 0;
2857
2858	DPRINTF("entered\n");
2859
2860	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
2861		CTL_PRIV_BACKEND_LUN].ptr;
2862	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;
2863
2864	switch (io->scsiio.cdb[0]) {
2865	case SERVICE_ACTION_IN:
2866		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
2867			mtx_lock(&be_lun->queue_lock);
2868			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
2869			    &io->io_hdr, links);
2870			mtx_unlock(&be_lun->queue_lock);
2871			taskqueue_enqueue(be_lun->io_taskqueue,
2872			    &be_lun->io_task);
2873			retval = CTL_RETVAL_QUEUED;
2874			break;
2875		}
2876		ctl_set_invalid_field(&io->scsiio,
2877				      /*sks_valid*/ 1,
2878				      /*command*/ 1,
2879				      /*field*/ 1,
2880				      /*bit_valid*/ 1,
2881				      /*bit*/ 4);
2882		ctl_config_read_done(io);
2883		retval = CTL_RETVAL_COMPLETE;
2884		break;
2885	default:
2886		ctl_set_invalid_opcode(&io->scsiio);
2887		ctl_config_read_done(io);
2888		retval = CTL_RETVAL_COMPLETE;
2889		break;
2890	}
2891
2892	return (retval);
2893}
2894
2895static int
2896ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
2897{
2898	struct ctl_be_block_lun *lun;
2899	int retval;
2900
2901	lun = (struct ctl_be_block_lun *)be_lun;
2902	retval = 0;
2903
2904	retval = sbuf_printf(sb, "\t<num_threads>");
2905
2906	if (retval != 0)
2907		goto bailout;
2908
2909	retval = sbuf_printf(sb, "%d", lun->num_threads);
2910
2911	if (retval != 0)
2912		goto bailout;
2913
2914	retval = sbuf_printf(sb, "</num_threads>\n");
2915
2916bailout:
2917
2918	return (retval);
2919}
2920
2921static uint64_t
2922ctl_be_block_lun_attr(void *be_lun, const char *attrname)
2923{
2924	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)be_lun;
2925
2926	if (lun->getattr == NULL)
2927		return (UINT64_MAX);
2928	return (lun->getattr(lun, attrname));
2929}
2930
2931int
2932ctl_be_block_init(void)
2933{
2934	struct ctl_be_block_softc *softc;
2935	int retval;
2936
2937	softc = &backend_block_softc;
2938	retval = 0;
2939
2940	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
2941	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
2942	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2943	STAILQ_INIT(&softc->disk_list);
2944	STAILQ_INIT(&softc->lun_list);
2945
2946	return (retval);
2947}
2948