ctl_backend_block.c revision 264191
/*-
 * Copyright (c) 2003 Silicon Graphics International Corp.
 * Copyright (c) 2009-2011 Spectra Logic Corporation
 * Copyright (c) 2012 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Edward Tomasz Napierala
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
 */
/*
 * CAM Target Layer driver backend for block devices.
 *
 * Author: Ken Merry <ken@FreeBSD.org>
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/cam/ctl/ctl_backend_block.c 264191 2014-04-06 10:13:14Z mav $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/kthread.h>
#include <sys/bio.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/disk.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/pcpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>

#include <geom/geom.h>

#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>

/*
 * The idea here is that we'll allocate enough S/G space to hold a 16MB
 * I/O.  If we get an I/O larger than that, we'll reject it.
 */
#define	CTLBLK_MAX_IO_SIZE	(16 * 1024 * 1024)
#define	CTLBLK_MAX_SEGS		((CTLBLK_MAX_IO_SIZE / MAXPHYS) + 1)

#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif

SDT_PROVIDER_DEFINE(cbb);

typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
	CTL_BE_BLOCK_LUN_MULTI_THREAD	= 0x08
} ctl_be_block_lun_flags;

typedef enum {
	CTL_BE_BLOCK_NONE,
	CTL_BE_BLOCK_DEV,
	CTL_BE_BLOCK_FILE
} ctl_be_block_type;

struct ctl_be_block_devdata {
	struct cdev *cdev;
	struct cdevsw *csw;
	int dev_ref;
};

struct ctl_be_block_filedata {
	struct ucred *cred;
};

union ctl_be_block_bedata {
	struct ctl_be_block_devdata dev;
	struct ctl_be_block_filedata file;
};

struct ctl_be_block_io;
struct ctl_be_block_lun;

typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);

/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 */
struct ctl_be_block_lun {
	struct ctl_block_disk *disk;
	char lunname[32];
	char *dev_path;
	ctl_be_block_type dev_type;
	struct vnode *vn;
	union ctl_be_block_bedata backend;
	cbb_dispatch_t dispatch;
	cbb_dispatch_t lun_flush;
	struct mtx lock;
	uma_zone_t lun_zone;
	uint64_t size_blocks;
	uint64_t size_bytes;
	uint32_t blocksize;
	int blocksize_shift;
	uint16_t pblockexp;
	uint16_t pblockoff;
	struct ctl_be_block_softc *softc;
	struct devstat *disk_stats;
	ctl_be_block_lun_flags flags;
	STAILQ_ENTRY(ctl_be_block_lun) links;
	struct ctl_be_lun ctl_be_lun;
	struct taskqueue *io_taskqueue;
	struct task io_task;
	int num_threads;
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
};

/*
 * Overall softc structure for the block backend module.
 */
struct ctl_be_block_softc {
	struct mtx			 lock;
	int				 num_disks;
	STAILQ_HEAD(, ctl_block_disk)	 disk_list;
	int				 num_luns;
	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
};

static struct ctl_be_block_softc backend_block_softc;

/*
 * Per-I/O information.
 */
struct ctl_be_block_io {
	union ctl_io			*io;
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
	int				bio_cmd;
	int				bio_flags;
	int				num_segs;
	int				num_bios_sent;
	int				num_bios_done;
	int				send_complete;
	int				num_errors;
	struct bintime			ds_t0;
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;
	uint64_t			io_offset;
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
};

static int cbb_num_threads = 14;
TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
	    "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
           &cbb_num_threads, 0, "Number of threads per backing file");

static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
static void ctl_free_beio(struct ctl_be_block_io *beio);
static void ctl_complete_beio(struct ctl_be_block_io *beio);
static int ctl_be_block_move_done(union ctl_io *io);
static void ctl_be_block_biodone(struct bio *bio);
static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
				    struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
				       struct ctl_be_block_io *beio);
static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
				   struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
				      struct ctl_be_block_io *beio);
static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
				    union ctl_io *io);
static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
				  union ctl_io *io);
static void ctl_be_block_worker(void *context, int pending);
static int ctl_be_block_submit(union ctl_io *io);
static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
				   int flag, struct thread *td);
static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_lun_req *req);
static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
				 struct ctl_lun_req *req);
static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
static int ctl_be_block_open(struct ctl_be_block_softc *softc,
			     struct ctl_be_block_lun *be_lun,
			     struct ctl_lun_req *req);
static int ctl_be_block_create(struct ctl_be_block_softc *softc,
			       struct ctl_lun_req *req);
static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_lun_req *req);
static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
				 struct ctl_lun_req *req);
static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static void ctl_be_block_lun_shutdown(void *be_lun);
static void ctl_be_block_lun_config_status(void *be_lun,
					   ctl_lun_config_status status);
static int ctl_be_block_config_write(union ctl_io *io);
static int ctl_be_block_config_read(union ctl_io *io);
static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
int ctl_be_block_init(void);

static struct ctl_backend_driver ctl_be_block_driver =
{
	.name = "block",
	.flags = CTL_BE_FLAG_HAS_CONFIG,
	.init = ctl_be_block_init,
	.data_submit = ctl_be_block_submit,
	.data_move_done = ctl_be_block_move_done,
	.config_read = ctl_be_block_config_read,
	.config_write = ctl_be_block_config_write,
	.ioctl = ctl_be_block_ioctl,
	.lun_info = ctl_be_block_lun_info
};

MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);

static uma_zone_t beio_zone;

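/*
 * Allocate a per-I/O tracking structure from the beio zone.  The
 * M_WAITOK flag means this may sleep, but it never returns NULL.
 */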
static struct ctl_be_block_io *
ctl_alloc_beio(struct ctl_be_block_softc *softc)
{
	struct ctl_be_block_io *beio;

	beio = uma_zalloc(beio_zone, M_WAITOK | M_ZERO);
	beio->softc = softc;
	return (beio);
}

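/*
 * Release a beio and all of its S/G segment buffers.  A NULL segment
 * address means that segment was already freed once, which is counted
 * and reported as a duplicate free.
 */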
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
	int duplicate_free;
	int i;

	duplicate_free = 0;

	for (i = 0; i < beio->num_segs; i++) {
		if (beio->sg_segs[i].addr == NULL)
			duplicate_free++;

		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
		beio->sg_segs[i].addr = NULL;
	}

	if (duplicate_free > 0) {
		printf("%s: %d duplicate frees out of %d segments\n", __func__,
		       duplicate_free, beio->num_segs);
	}

	uma_zfree(beio_zone, beio);
}

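/*
 * Finish devstat accounting for this I/O, free the beio, and complete
 * the CTL I/O back to the initiator.  Failed commands are accounted as
 * zero-length transfers.
 */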
static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	int io_len;

	io = beio->io;

	if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
		io_len = beio->io_len;
	else
		io_len = 0;

	devstat_end_transaction(beio->lun->disk_stats,
				/*bytes*/ io_len,
				beio->ds_tag_type,
				beio->ds_trans_type,
				/*now*/ NULL,
				/*then*/&beio->ds_t0);

	ctl_free_beio(beio);
	ctl_done(io);
}

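/*
 * Called by CTL when a datamove (DMA to/from the initiator) finishes.
 * Reads are completed here; writes whose DMA succeeded are queued to
 * the task queue so the backend I/O can run in a context that is
 * allowed to sleep.
 */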
static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)
		io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	getbintime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
	io->io_hdr.num_dmas++;
#endif

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 && (io->io_hdr.port_status == 0)
	 && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE))
		ctl_set_success(&io->scsiio);
	else if ((io->io_hdr.port_status != 0)
	      && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
	      && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		/*
		 * For hardware error sense keys, the sense key
		 * specific value is defined to be a retry count,
		 * but we use it to pass back an internal FETD
		 * error code.  XXX KDM  Hopefully the FETD is only
		 * using 16 bits for an error code, since that's
		 * all the space we have in the sks field.
		 */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/
					 io->io_hdr.port_status);
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}

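/*
 * Completion callback for bios issued to a device-backed LUN.  Each
 * completion is counted under the LUN lock; the last completed bio
 * either reports status (writes and flushes) or starts the datamove
 * back to the initiator (reads).
 */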
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->lock);
	if (error != 0)
		beio->num_errors++;

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	mtx_unlock(&be_lun->lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	if (beio->num_errors > 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else
			ctl_set_medium_error(&io->scsiio);
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a flush, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

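/*
 * Handle SYNCHRONIZE CACHE for a file-backed LUN by calling VOP_FSYNC()
 * on the backing vnode.
 */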
static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	io = beio->io;

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	if (MNT_SHARED_WRITES(mountpoint)
	 || ((mountpoint == NULL)
	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_lock(be_lun->vn, lock_flags | LK_RETRY);

	binuptime(&beio->ds_t0);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

	error = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}

SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, file_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, "uint64_t");

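/*
 * Read/write dispatch routine for file-backed LUNs.  The S/G segments
 * are wrapped in a kernel uio and handed to VOP_READ()/VOP_WRITE().
 */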
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	int flags;
	int error, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = beio->bio_flags;

	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
	}

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ)
		xuio.uio_rw = UIO_READ;
	else
		xuio.uio_rw = UIO_WRITE;

	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		binuptime(&beio->ds_t0);
		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 *
		 * So, to attempt to provide some barrier semantics in the
		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
		 */
		error = VOP_READ(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		if (MNT_SHARED_WRITES(mountpoint)
		 || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;

		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		binuptime(&beio->ds_t0);
		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into the cache.)
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 *
		 * So if we've got the BIO_ORDERED flag set, we want
		 * IO_SYNC in either the UFS or ZFS case.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
				  IO_SYNC : 0, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
	}

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		char path_str[32];

		ctl_scsi_path_string(io, path_str, sizeof(path_str));
		/*
		 * XXX KDM ZFS returns ENOSPC when the underlying
		 * filesystem fills up.  What kind of SCSI error should we
		 * return for that?
		 */
		printf("%s%s command returned errno %d\n", path_str,
		       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
		ctl_set_medium_error(&io->scsiio);
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if (beio->bio_cmd == BIO_WRITE) {
		ctl_set_success(&io->scsiio);
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
		ctl_complete_beio(beio);
	} else {
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

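/*
 * Handle SYNCHRONIZE CACHE for a device-backed LUN by issuing a single
 * ordered BIO_FLUSH to the backing device.
 */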
static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	union ctl_io *io;
	struct ctl_be_block_devdata *dev_data;

	dev_data = &be_lun->backend.dev;
	io = beio->io;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_flags	   |= BIO_ORDERED;
	bio->bio_dev	    = dev_data->cdev;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	binuptime(&beio->ds_t0);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

	(*dev_data->csw->d_strategy)(bio);
}

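/*
 * Read/write dispatch routine for device-backed LUNs.  Each S/G segment
 * is carved into bios no larger than the backing device's maximum I/O
 * size and sent down via the device's strategy routine.
 */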
static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	int i;
	struct bio *bio;
	struct ctl_be_block_devdata *dev_data;
	off_t cur_offset;
	int max_iosize;

	DPRINTF("entered\n");

	dev_data = &be_lun->backend.dev;

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	max_iosize = dev_data->cdev->si_iosize_max;
	if (max_iosize < PAGE_SIZE)
		max_iosize = DFLTPHYS;

	cur_offset = beio->io_offset;

	/*
	 * XXX KDM need to accurately reflect the number of I/Os outstanding
	 * to a device.
	 */
	binuptime(&beio->ds_t0);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_flags |= beio->bio_flags;
			bio->bio_dev = dev_data->cdev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			/*
			 * Make sure we set the complete bit just before we
			 * issue the last bio so we don't wind up with a
			 * race.
			 *
			 * Use the LUN mutex here instead of a combination
			 * of atomic variables for simplicity.
			 *
			 * XXX KDM we could have a per-IO lock, but that
			 * would cause additional per-IO setup and teardown
			 * overhead.  Hopefully there won't be too much
			 * contention on the LUN lock.
			 */
			mtx_lock(&be_lun->lock);

			beio->num_bios_sent++;

			if ((i == beio->num_segs - 1)
			 && (cur_size == 0))
				beio->send_complete = 1;

			mtx_unlock(&be_lun->lock);

			(*dev_data->csw->d_strategy)(bio);
		}
	}
}

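/*
 * Dispatch routine for config-write (non-data) commands; currently only
 * SYNCHRONIZE CACHE(10/16), which is handed to the LUN's flush routine.
 */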
static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	KASSERT(beio != NULL, ("ctl_alloc_beio() failed"));

	beio->io = io;
	beio->softc = softc;
	beio->lun = be_lun;
	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		beio->bio_cmd = BIO_FLUSH;
		beio->ds_trans_type = DEVSTAT_NO_DATA;
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		beio->io_len = 0;
		be_lun->lun_flush(be_lun, beio);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}

SDT_PROBE_DEFINE1(cbb, kernel, read, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, "uint64_t");

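/*
 * Main read/write dispatch routine.  Validates the I/O size, allocates
 * a beio and its S/G segments, and then either issues the backend read
 * or starts the datamove to fetch write data from the initiator.
 */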
static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len lbalen;
	uint64_t len_left, io_size_bytes;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
	}

	memcpy(&lbalen, io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
	       sizeof(lbalen));

	io_size_bytes = lbalen.len * be_lun->blocksize;

	/*
	 * XXX KDM this is temporary, until we implement chaining of beio
	 * structures and multiple datamove calls to move all the data in
	 * or out.
	 */
	if (io_size_bytes > CTLBLK_MAX_IO_SIZE) {
		printf("%s: IO length %ju > max io size %u\n", __func__,
		       (uintmax_t)io_size_bytes, CTLBLK_MAX_IO_SIZE);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 0,
				      /*command*/ 1,
				      /*field*/ 0,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_done(io);
		return;
	}

	beio = ctl_alloc_beio(softc);
	KASSERT(beio != NULL, ("ctl_alloc_beio() failed"));

	beio->io = io;
	beio->softc = softc;
	beio->lun = be_lun;
	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

	/*
	 * If the I/O came down with an ordered or head of queue tag, set
	 * the BIO_ORDERED attribute.  For head of queue tags, that's
	 * pretty much the best we can do.
	 *
	 * XXX KDM we don't have a great way to easily know about the FUA
	 * bit right now (it is decoded in ctl_read_write(), but we don't
	 * pass that knowledge to the backend), and in any case we would
	 * need to determine how to handle it.
	 */
	if ((io->scsiio.tag_type == CTL_TAG_ORDERED)
	 || (io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE))
		beio->bio_flags = BIO_ORDERED;

	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	/*
	 * This path handles read and write only.  The config write path
	 * handles flush operations.
	 */
	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	} else {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	}

	beio->io_len = lbalen.len * be_lun->blocksize;
	beio->io_offset = lbalen.lba * be_lun->blocksize;

	DPRINTF("%s at LBA %jx len %u\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen.lba, lbalen.len);

	for (i = 0, len_left = io_size_bytes; i < CTLBLK_MAX_SEGS &&
	     len_left > 0; i++) {
		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(MAXPHYS, len_left);
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

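/*
 * Taskqueue worker for a LUN.  Drains the per-LUN queues in priority
 * order: datamove completions first, then config writes, then new I/O
 * from the input queue.  The LUN lock is dropped around each dispatch
 * because the dispatch routines may sleep.
 */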
static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_block_softc *softc;
	union ctl_io *io;

	be_lun = (struct ctl_be_block_lun *)context;
	softc = be_lun->softc;

	DPRINTF("entered\n");

	mtx_lock(&be_lun->lock);
	for (;;) {
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			struct ctl_be_block_io *beio;

			DPRINTF("datamove queue\n");

			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);

			mtx_unlock(&be_lun->lock);

			beio = (struct ctl_be_block_io *)
			    io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

			be_lun->dispatch(be_lun, beio);

			mtx_lock(&be_lun->lock);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");

			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);

			mtx_unlock(&be_lun->lock);

			ctl_be_block_cw_dispatch(be_lun, io);

			mtx_lock(&be_lun->lock);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");

			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->lock);

			/*
			 * We must drop the lock, since this routine and
			 * its children may sleep.
			 */
			ctl_be_block_dispatch(be_lun, io);

			mtx_lock(&be_lun->lock);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		break;
	}
	mtx_unlock(&be_lun->lock);
}

/*
 * Entry point from CTL to the backend for I/O.  We queue everything to a
 * work thread, so this just puts the I/O on a queue and wakes up the
 * thread.
 */
static int
ctl_be_block_submit(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *ctl_be_lun;
	int retval;

	DPRINTF("entered\n");

	retval = CTL_RETVAL_COMPLETE;

	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;

	/*
	 * Make sure we only get SCSI I/O.
	 */
	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
		"%#x) encountered", io->io_hdr.io_type));

	mtx_lock(&be_lun->lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (retval);
}

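/*
 * ioctl handler for the backend's character device.  CTL_LUN_REQ is the
 * only supported command; it multiplexes LUN create, remove, and modify
 * requests from userland.
 */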
static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
			int flag, struct thread *td)
{
	struct ctl_be_block_softc *softc;
	int error;

	softc = &backend_block_softc;

	error = 0;

	switch (cmd) {
	case CTL_LUN_REQ: {
		struct ctl_lun_req *lun_req;

		lun_req = (struct ctl_lun_req *)addr;

		switch (lun_req->reqtype) {
		case CTL_LUNREQ_CREATE:
			error = ctl_be_block_create(softc, lun_req);
			break;
		case CTL_LUNREQ_RM:
			error = ctl_be_block_rm(softc, lun_req);
			break;
		case CTL_LUNREQ_MODIFY:
			error = ctl_be_block_modify(softc, lun_req);
			break;
		default:
			lun_req->status = CTL_LUN_ERROR;
			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
				 "%s: invalid LUN request type %d", __func__,
				 lun_req->reqtype);
			break;
		}
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

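/*
 * Finish setting up a file-backed LUN: install the file dispatch
 * routines, determine the LUN size and block size, and hold a
 * reference on the opening thread's credentials for later I/O.
 */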
static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	struct vattr		      vattr;
	int			      error;

	error = 0;
	file_data = &be_lun->backend.file;
	params = &req->reqdata.create;

	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	/*
	 * Verify that we have the ability to upgrade to exclusive
	 * access on this file so we can trap errors at open instead
	 * of reporting them during first access.
	 */
	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
		if (be_lun->vn->v_iflag & VI_DOOMED) {
			error = EBADF;
			snprintf(req->error_str, sizeof(req->error_str),
				 "error locking file %s", be_lun->dev_path);
			return (error);
		}
	}

	file_data->cred = crhold(curthread->td_ucred);
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;
	/*
	 * We set the multi thread flag for file operations because all
	 * filesystems (in theory) are capable of allowing multiple readers
	 * of a file at once.  So we want to get the maximum possible
	 * concurrency.
	 */
	be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;

	/*
	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
	 * may not work with other OSes as well.  So just export a sector
	 * size of 512 bytes, which should work with any OS or
	 * application.  Since our backing is a file, any block size will
	 * work fine for the backing store.
	 */
#if 0
	be_lun->blocksize = vattr.va_blocksize;
#endif
	if (params->blocksize_bytes != 0)
		be_lun->blocksize = params->blocksize_bytes;
	else
		be_lun->blocksize = 512;

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < be_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
	}
	return (error);
}

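/*
 * Finish setting up a device-backed LUN: install the device dispatch
 * routines, query the device's sector and media sizes, validate any
 * user-requested block size, and derive the physical block exponent
 * and offset from the device's stripe size and offset.
 */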
static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_lun_create_params *params;
	struct vattr		      vattr;
	struct cdev		     *dev;
	struct cdevsw		     *devsw;
	int			      error;
	off_t			      ps, pss, po, pos;

	params = &req->reqdata.create;

	be_lun->dev_type = CTL_BE_BLOCK_DEV;
	be_lun->dispatch = ctl_be_block_dispatch_dev;
	be_lun->lun_flush = ctl_be_block_flush_dev;
	be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
	be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
					     &be_lun->backend.dev.dev_ref);
	if (be_lun->backend.dev.csw == NULL)
		panic("Unable to retrieve device switch");

	error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error getting vnode attributes for device %s",
			 __func__, be_lun->dev_path);
		return (error);
	}

	dev = be_lun->vn->v_rdev;
	devsw = dev->si_devsw;
	if (!devsw->d_ioctl) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: no d_ioctl for device %s!", __func__,
			 be_lun->dev_path);
		return (ENODEV);
	}

	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
			       (caddr_t)&be_lun->blocksize, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGSECTORSIZE ioctl "
			 "on %s!", __func__, error, be_lun->dev_path);
		return (error);
	}

	/*
	 * If the user has asked for a blocksize that is greater than the
	 * backing device's blocksize, we can do it only if the blocksize
	 * the user is asking for is an even multiple of the underlying
	 * device's blocksize.
	 */
	if ((params->blocksize_bytes != 0)
	 && (params->blocksize_bytes > be_lun->blocksize)) {
		uint32_t bs_multiple, tmp_blocksize;

		bs_multiple = params->blocksize_bytes / be_lun->blocksize;

		tmp_blocksize = bs_multiple * be_lun->blocksize;

		if (tmp_blocksize == params->blocksize_bytes) {
			be_lun->blocksize = params->blocksize_bytes;
		} else {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested blocksize %u is not an even "
				 "multiple of backing device blocksize %u",
				 __func__, params->blocksize_bytes,
				 be_lun->blocksize);
			return (EINVAL);
		}
	} else if ((params->blocksize_bytes != 0)
		&& (params->blocksize_bytes != be_lun->blocksize)) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: requested blocksize %u < backing device "
			 "blocksize %u", __func__, params->blocksize_bytes,
			 be_lun->blocksize);
		return (EINVAL);
	}

	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
			       (caddr_t)&be_lun->size_bytes, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGMEDIASIZE "
			 "ioctl on %s!", __func__, error,
			 be_lun->dev_path);
		return (error);
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes > be_lun->size_bytes) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested LUN size %ju > backing device "
				 "size %ju", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)be_lun->size_bytes);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	}

	error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE,
			       (caddr_t)&ps, FREAD, curthread);
	if (error)
		ps = po = 0;
	else {
		error = devsw->d_ioctl(dev, DIOCGSTRIPEOFFSET,
				       (caddr_t)&po, FREAD, curthread);
		if (error)
			po = 0;
	}
	pss = ps / be_lun->blocksize;
	pos = po / be_lun->blocksize;
	if ((pss > 0) && (pss * be_lun->blocksize == ps) && (pss >= pos) &&
	    ((pss & (pss - 1)) == 0) && (pos * be_lun->blocksize == po)) {
		be_lun->pblockexp = fls(pss) - 1;
		be_lun->pblockoff = (pss - pos) % pss;
	}

	return (0);
}

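/*
 * Close a LUN's backing store: drop the device thread reference (for
 * device backends) or the credential reference (for file backends),
 * and close the vnode.
 */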
static int
ctl_be_block_close(struct ctl_be_block_lun *be_lun)
{
	DROP_GIANT();
	if (be_lun->vn) {
		int flags = FREAD | FWRITE;

		switch (be_lun->dev_type) {
		case CTL_BE_BLOCK_DEV:
			if (be_lun->backend.dev.csw) {
				dev_relthread(be_lun->backend.dev.cdev,
					      be_lun->backend.dev.dev_ref);
				be_lun->backend.dev.csw  = NULL;
				be_lun->backend.dev.cdev = NULL;
			}
			break;
		case CTL_BE_BLOCK_FILE:
			break;
		case CTL_BE_BLOCK_NONE:
			break;
		default:
			panic("Unexpected backend type.");
			break;
		}

		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
		be_lun->vn = NULL;

		switch (be_lun->dev_type) {
		case CTL_BE_BLOCK_DEV:
			break;
		case CTL_BE_BLOCK_FILE:
			if (be_lun->backend.file.cred != NULL) {
				crfree(be_lun->backend.file.cred);
				be_lun->backend.file.cred = NULL;
			}
			break;
		case CTL_BE_BLOCK_NONE:
			break;
		default:
			panic("Unexpected backend type.");
			break;
		}
	}
	PICKUP_GIANT();

	return (0);
}

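/*
 * Open the backing store named by dev_path and hand off to the
 * device- or file-specific setup routine.  Bare device names are
 * retried with a "/dev/" prefix.
 */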
static int
ctl_be_block_open(struct ctl_be_block_softc *softc,
		       struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct nameidata nd;
	int		 flags;
	int		 error;

	/*
	 * XXX KDM allow a read-only option?
	 */
	flags = FREAD | FWRITE;
	error = 0;

	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: Root filesystem is not mounted", __func__);
		return (1);
	}

	if (!curthread->td_proc->p_fd->fd_cdir) {
		curthread->td_proc->p_fd->fd_cdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_rdir) {
		curthread->td_proc->p_fd->fd_rdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_jdir) {
		curthread->td_proc->p_fd->fd_jdir = rootvnode;
		VREF(rootvnode);
	}

 again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error) {
		/*
		 * Prepending "/dev/" is the only reasonable guess we can
		 * make here if the user doesn't give us a fully qualified
		 * path.  If they want to specify a file, they need to
		 * specify the full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_path = "/dev/";
			char *dev_name;

			/* Try adding device path at beginning of name */
			dev_name = malloc(strlen(be_lun->dev_path)
					+ strlen(dev_path) + 1,
					  M_CTLBLK, M_WAITOK);
			if (dev_name) {
				sprintf(dev_name, "%s%s", dev_path,
					be_lun->dev_path);
				free(be_lun->dev_path, M_CTLBLK);
				be_lun->dev_path = dev_name;
				goto again;
			}
		}
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error opening %s", __func__, be_lun->dev_path);
		return (error);
	}

	NDFREE(&nd, NDF_ONLY_PNBUF);

	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn, 0);

	if (error != 0) {
		ctl_be_block_close(be_lun);
		return (error);
	}

	be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;

	return (0);
}

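/*
 * Handle a LUN creation request: allocate and initialize the backend
 * LUN, open the backing store, parse the remaining backend arguments,
 * start the per-LUN task queue threads, register the LUN with CTL, and
 * wait for configuration to complete.
 */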
static int
ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_lun_create_params *params;
	struct ctl_be_arg *file_arg;
	char tmpstr[32];
	int retval, num_threads;
	int i;

	params = &req->reqdata.create;
	retval = 0;

	num_threads = cbb_num_threads;

	file_arg = NULL;

	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);

	be_lun->softc = softc;
	STAILQ_INIT(&be_lun->input_queue);
	STAILQ_INIT(&be_lun->config_write_queue);
	STAILQ_INIT(&be_lun->datamove_queue);
	STAILQ_INIT(&be_lun->ctl_be_lun.options);
	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
	mtx_init(&be_lun->lock, be_lun->lunname, NULL, MTX_DEF);

	be_lun->lun_zone = uma_zcreate(be_lun->lunname, MAXPHYS,
	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);

	if (be_lun->lun_zone == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error allocating UMA zone", __func__);
		goto bailout_error;
	}

	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
		be_lun->ctl_be_lun.lun_type = params->device_type;
	else
		be_lun->ctl_be_lun.lun_type = T_DIRECT;

	if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
		for (i = 0; i < req->num_be_args; i++) {
			if (strcmp(req->kern_be_args[i].kname, "file") == 0) {
				file_arg = &req->kern_be_args[i];
				break;
			}
		}

		if (file_arg == NULL) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: no file argument specified", __func__);
			goto bailout_error;
		}

		be_lun->dev_path = malloc(file_arg->vallen, M_CTLBLK,
					  M_WAITOK | M_ZERO);

		strlcpy(be_lun->dev_path, (char *)file_arg->kvalue,
			file_arg->vallen);

		retval = ctl_be_block_open(softc, be_lun, req);
		if (retval != 0) {
			retval = 0;
			goto bailout_error;
		}

		/*
		 * Tell the user the size of the file/device.
		 */
		params->lun_size_bytes = be_lun->size_bytes;

		/*
		 * The maximum LBA is the size - 1.
		 */
		be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
	} else {
		/*
		 * For processor devices, we don't have any size.
		 */
		be_lun->blocksize = 0;
		be_lun->pblockexp = 0;
		be_lun->pblockoff = 0;
		be_lun->size_blocks = 0;
		be_lun->size_bytes = 0;
		be_lun->ctl_be_lun.maxlba = 0;
		params->lun_size_bytes = 0;

		/*
		 * Default to just 1 thread for processor devices.
		 */
		num_threads = 1;
	}

	/*
	 * XXX This searching loop might be refactored to be combined with
	 * the loop above.
	 */
	for (i = 0; i < req->num_be_args; i++) {
		if (strcmp(req->kern_be_args[i].kname, "num_threads") == 0) {
			struct ctl_be_arg *thread_arg;
			char num_thread_str[16];
			int tmp_num_threads;

			thread_arg = &req->kern_be_args[i];

			strlcpy(num_thread_str, (char *)thread_arg->kvalue,
				min(thread_arg->vallen,
				sizeof(num_thread_str)));

			tmp_num_threads = strtol(num_thread_str, NULL, 0);

			/*
			 * We don't let the user specify less than one
			 * thread, but hope he's clueful enough not to
			 * specify 1000 threads.
			 */
			if (tmp_num_threads < 1) {
				snprintf(req->error_str, sizeof(req->error_str),
					 "%s: invalid number of threads %s",
				         __func__, num_thread_str);
				goto bailout_error;
			}

			num_threads = tmp_num_threads;
		} else if (strcmp(req->kern_be_args[i].kname, "file") != 0 &&
		    strcmp(req->kern_be_args[i].kname, "dev") != 0) {
			struct ctl_be_lun_option *opt;

			opt = malloc(sizeof(*opt), M_CTLBLK, M_WAITOK);
			opt->name = malloc(strlen(req->kern_be_args[i].kname) + 1, M_CTLBLK, M_WAITOK);
			strcpy(opt->name, req->kern_be_args[i].kname);
			opt->value = malloc(strlen(req->kern_be_args[i].kvalue) + 1, M_CTLBLK, M_WAITOK);
			strcpy(opt->value, req->kern_be_args[i].kvalue);
			STAILQ_INSERT_TAIL(&be_lun->ctl_be_lun.options, opt, links);
		}
	}

	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
	be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
	be_lun->ctl_be_lun.be_lun = be_lun;
	be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
	be_lun->ctl_be_lun.pblockexp = be_lun->pblockexp;
	be_lun->ctl_be_lun.pblockoff = be_lun->pblockoff;
	/* Tell the user the blocksize we ended up using */
	params->blocksize_bytes = be_lun->blocksize;
	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
		be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
	} else
		be_lun->ctl_be_lun.req_lun_id = 0;

	be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
	be_lun->ctl_be_lun.lun_config_status =
		ctl_be_block_lun_config_status;
	be_lun->ctl_be_lun.be = &ctl_be_block_driver;

	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
			 softc->num_luns);
		strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
			sizeof(tmpstr)));

		/* Tell the user what we used for a serial number */
		strncpy((char *)params->serial_num, tmpstr,
			ctl_min(sizeof(params->serial_num), sizeof(tmpstr)));
	} else {
		strncpy((char *)be_lun->ctl_be_lun.serial_num,
			params->serial_num,
			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
			sizeof(params->serial_num)));
	}
	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
		strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
			sizeof(tmpstr)));

		/* Tell the user what we used for a device ID */
		strncpy((char *)params->device_id, tmpstr,
			ctl_min(sizeof(params->device_id), sizeof(tmpstr)));
	} else {
		strncpy((char *)be_lun->ctl_be_lun.device_id,
			params->device_id,
			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
				sizeof(params->device_id)));
	}

	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);

	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);

	if (be_lun->io_taskqueue == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: Unable to create taskqueue", __func__);
		goto bailout_error;
	}

	/*
	 * Note that we start the same number of threads by default for
	 * both the file case and the block device case.  For the file
	 * case, we need multiple threads to allow concurrency, because the
	 * vnode interface is designed to be a blocking interface.  For the
	 * block device case, ZFS zvols at least will block the caller's
	 * context in many instances, and so we need multiple threads to
	 * overcome that problem.  Other block devices don't need as many
	 * threads, but they shouldn't cause too many problems.
	 *
	 * If the user wants to just have a single thread for a block
	 * device, he can specify that when the LUN is created, or change
	 * the tunable/sysctl to alter the default number of threads.
	 */
	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
					 /*num threads*/num_threads,
					 /*priority*/PWAIT,
					 /*thread name*/
					 "%s taskq", be_lun->lunname);

	if (retval != 0)
		goto bailout_error;

	be_lun->num_threads = num_threads;

	mtx_lock(&softc->lock);
	softc->num_luns++;
	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);

	mtx_unlock(&softc->lock);

	retval = ctl_add_lun(&be_lun->ctl_be_lun);
	if (retval != 0) {
		mtx_lock(&softc->lock);
		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
			      links);
		softc->num_luns--;
		mtx_unlock(&softc->lock);
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: ctl_add_lun() returned error %d, see dmesg for "
			"details", __func__, retval);
		retval = 0;
		goto bailout_error;
	}

	mtx_lock(&softc->lock);

	/*
	 * Tell the config_status routine that we're waiting so it won't
	 * clean up the LUN in the event of an error.
	 */
	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;

	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
		if (retval == EINTR)
			break;
	}
	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;

	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN configuration error, see dmesg for details",
			 __func__);
		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
			      links);
		softc->num_luns--;
		mtx_unlock(&softc->lock);
		goto bailout_error;
	} else {
		params->req_lun_id = be_lun->ctl_be_lun.lun_id;
	}

	mtx_unlock(&softc->lock);

	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
					       be_lun->blocksize,
					       DEVSTAT_ALL_SUPPORTED,
					       be_lun->ctl_be_lun.lun_type
					       | DEVSTAT_TYPE_IF_OTHER,
					       DEVSTAT_PRIORITY_OTHER);

	req->status = CTL_LUN_OK;

	return (retval);

bailout_error:
	req->status = CTL_LUN_ERROR;

	ctl_be_block_close(be_lun);

	free(be_lun->dev_path, M_CTLBLK);
	free(be_lun, M_CTLBLK);

	return (retval);
}

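/*
 * Handle a LUN removal request: disable and invalidate the LUN in CTL,
 * wait for it to become unconfigured, then tear down the task queue,
 * backing store, statistics, and memory.
 */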
static int
ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_rm_params *params;
	struct ctl_be_block_lun *be_lun;
	int retval;

	params = &req->reqdata.rm;

	mtx_lock(&softc->lock);

	be_lun = NULL;

	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN %u is not managed by the block backend",
			 __func__, params->lun_id);
		goto bailout_error;
	}

	retval = ctl_disable_lun(&be_lun->ctl_be_lun);

	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned from ctl_disable_lun() for "
			 "LUN %u", __func__, retval, params->lun_id);
		goto bailout_error;
	}

	retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned from ctl_invalidate_lun() for "
			 "LUN %u", __func__, retval, params->lun_id);
		goto bailout_error;
	}

	mtx_lock(&softc->lock);

	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;

	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
		if (retval == EINTR)
			break;
	}

	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;

	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: interrupted waiting for LUN to be freed",
			 __func__);
		mtx_unlock(&softc->lock);
		goto bailout_error;
	}

	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);

	softc->num_luns--;
	mtx_unlock(&softc->lock);

	taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);

	taskqueue_free(be_lun->io_taskqueue);

	ctl_be_block_close(be_lun);

	if (be_lun->disk_stats != NULL)
		devstat_remove_entry(be_lun->disk_stats);

	uma_zdestroy(be_lun->lun_zone);

	free(be_lun->dev_path, M_CTLBLK);

	free(be_lun, M_CTLBLK);

	req->status = CTL_LUN_OK;

	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;

	return (0);
}

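/*
 * Update the size of a file-backed LUN: honor an explicit size from
 * the request if one was given, otherwise re-read the size of the
 * backing file with VOP_GETATTR().
 */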
static int
ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
			 struct ctl_lun_req *req)
{
	struct vattr vattr;
	int error;
	struct ctl_lun_modify_params *params;

	params = &req->reqdata.modify;

	if (params->lun_size_bytes != 0) {
		be_lun->size_bytes = params->lun_size_bytes;
	} else {
		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
		if (error != 0) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "error calling VOP_GETATTR() for file %s",
				 be_lun->dev_path);
			return (error);
		}

		be_lun->size_bytes = vattr.va_size;
	}

	return (0);
}

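/*
 * Update the size of a device-backed LUN.  The size of the backing
 * device is queried with the DIOCGMEDIASIZE ioctl; an explicitly
 * requested size may be smaller than the media, but never larger.
 */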
static int
ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
			struct ctl_lun_req *req)
{
	struct cdev *dev;
	struct cdevsw *devsw;
	int error;
	struct ctl_lun_modify_params *params;
	uint64_t size_bytes;

	params = &req->reqdata.modify;

	dev = be_lun->vn->v_rdev;
	devsw = dev->si_devsw;
	if (!devsw->d_ioctl) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: no d_ioctl for device %s!", __func__,
			 be_lun->dev_path);
		return (ENODEV);
	}

	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
			       (caddr_t)&size_bytes, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGMEDIASIZE ioctl "
			 "on %s!", __func__, error, be_lun->dev_path);
		return (error);
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes > size_bytes) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested LUN size %ju > backing device "
				 "size %ju", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)size_bytes);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	} else {
		be_lun->size_bytes = size_bytes;
	}

	return (0);
}

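/*
 * Handler for LUN modify requests; only resizing is supported.  A size
 * of zero in the request means "use the current size of the backing
 * file or device".  From userland this corresponds to something like
 * (illustrative; exact ctladm syntax may differ):
 *
 *	ctladm modify -b block -l 0 -s 20G
 */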
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	int error;

	params = &req->reqdata.modify;

	mtx_lock(&softc->lock);

	be_lun = NULL;

	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN %u is not managed by the block backend",
			 __func__, params->lun_id);
		goto bailout_error;
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes < be_lun->blocksize) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: LUN size %ju < blocksize %u", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 be_lun->blocksize);
			goto bailout_error;
		}
	}

	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

	if (be_lun->vn->v_type == VREG)
		error = ctl_be_block_modify_file(be_lun, req);
	else
		error = ctl_be_block_modify_dev(be_lun, req);

	VOP_UNLOCK(be_lun->vn, 0);

	if (error != 0)
		goto bailout_error;

	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;

	/*
	 * The maximum LBA is the size - 1.
	 *
	 * XXX: Note that this field is being updated without locking,
	 *	which might cause problems on 32-bit architectures.
	 */
	be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
	ctl_lun_capacity_changed(&be_lun->ctl_be_lun);

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	req->status = CTL_LUN_OK;

	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;

	return (0);
}

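/*
 * Called by CTL when the LUN has gone away; mark it unconfigured and
 * wake up anyone sleeping in the create or remove paths above.
 */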
static void
ctl_be_block_lun_shutdown(void *be_lun)
{
	struct ctl_be_block_lun *lun;
	struct ctl_be_block_softc *softc;

	lun = (struct ctl_be_block_lun *)be_lun;

	softc = lun->softc;

	mtx_lock(&softc->lock);
	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
		wakeup(lun);
	mtx_unlock(&softc->lock);
}

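/*
 * Called by CTL once LUN addition has completed.  On success, clear
 * the unconfigured flag, wake the creating thread, and try to enable
 * the LUN; on failure, record CTL_BE_BLOCK_LUN_CONFIG_ERR so the
 * creating thread can report the error.
 */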
static void
ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
{
	struct ctl_be_block_lun *lun;
	struct ctl_be_block_softc *softc;

	lun = (struct ctl_be_block_lun *)be_lun;
	softc = lun->softc;

	if (status == CTL_LUN_CONFIG_OK) {
		mtx_lock(&softc->lock);
		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
			wakeup(lun);
		mtx_unlock(&softc->lock);

		/*
		 * We successfully added the LUN, attempt to enable it.
		 */
		if (ctl_enable_lun(&lun->ctl_be_lun) != 0) {
			printf("%s: ctl_enable_lun() failed!\n", __func__);
			if (ctl_invalidate_lun(&lun->ctl_be_lun) != 0) {
				printf("%s: ctl_invalidate_lun() failed!\n",
				       __func__);
			}
		}

		return;
	}

	mtx_lock(&softc->lock);
	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
	wakeup(lun);
	mtx_unlock(&softc->lock);
}

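/*
 * Handle configuration-type write commands.  SYNCHRONIZE CACHE is
 * queued to the worker thread, since flushing the backing store may
 * sleep; START STOP UNIT only toggles LUN state, so it is completed
 * inline.
 */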
static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *ctl_be_lun;
	int retval;

	retval = 0;

	DPRINTF("entered\n");

	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		mtx_lock(&be_lun->lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;

		if (cdb->how & SSS_START)
			retval = ctl_start_lun(ctl_be_lun);
		else {
			retval = ctl_stop_lun(ctl_be_lun);
			/*
			 * XXX KDM Copan-specific offline behavior.
			 * Figure out a reasonable way to port this?
			 */
#ifdef NEEDTOPORT
			if ((retval == 0)
			 && (cdb->byte2 & SSS_ONOFFLINE))
				retval = ctl_lun_offline(ctl_be_lun);
#endif
		}

		/*
		 * In general, the above routines should not fail.  They
		 * just set state for the LUN.  So we've got something
		 * pretty wrong here if we can't start or stop the LUN.
		 */
		if (retval != 0) {
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xf051);
			retval = CTL_RETVAL_COMPLETE;
		} else {
			ctl_set_success(&io->scsiio);
		}
		ctl_config_write_done(io);
		break;
	}
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}

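/*
 * No configuration-type read commands are handled by this backend, so
 * there is nothing to do here.
 */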
static int
ctl_be_block_config_read(union ctl_io *io)
{
	return (0);
}

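/*
 * Emit backend-specific LUN information as XML, as seen in the output
 * of "ctladm devlist -v".  For a LUN backed by /dev/ada0 this would
 * produce something like (illustrative; the thread count varies):
 *
 *	<num_threads>14</num_threads><file>/dev/ada0</file>
 *
 * Processor-type LUNs have no backing path, so the <file> element is
 * omitted for them.
 */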
static int
ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
{
	struct ctl_be_block_lun *lun;
	int retval;

	lun = (struct ctl_be_block_lun *)be_lun;
	retval = 0;

	retval = sbuf_printf(sb, "<num_threads>");

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "%d", lun->num_threads);

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "</num_threads>");

	/*
	 * For processor devices, we don't have a path variable.
	 */
	if ((retval != 0)
	 || (lun->dev_path == NULL))
		goto bailout;

	retval = sbuf_printf(sb, "<file>");

	if (retval != 0)
		goto bailout;

	retval = ctl_sbuf_printf_esc(sb, lun->dev_path);

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "</file>\n");

bailout:
	return (retval);
}

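/*
 * One-time backend initialization: set up the softc mutex, create the
 * UMA zone used to allocate per-I/O ctl_be_block_io structures, and
 * initialize the LUN list.
 */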
int
ctl_be_block_init(void)
{
	struct ctl_be_block_softc *softc;
	int retval;

	softc = &backend_block_softc;
	retval = 0;

	mtx_init(&softc->lock, "ctlblk", NULL, MTX_DEF);
	beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	STAILQ_INIT(&softc->disk_list);
	STAILQ_INIT(&softc->lun_list);

	return (retval);
}