ctl_backend_block.c revision 237821
/*-
 * Copyright (c) 2003 Silicon Graphics International Corp.
 * Copyright (c) 2009-2011 Spectra Logic Corporation
 * Copyright (c) 2012 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Edward Tomasz Napierala
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
 */
/*
 * CAM Target Layer driver backend for block devices.
 *
 * Author: Ken Merry <ken@FreeBSD.org>
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/9/sys/cam/ctl/ctl_backend_block.c 237821 2012-06-29 21:19:24Z ken $");

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/kthread.h>
#include <sys/bio.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/endian.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/pcpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>

#include <geom/geom.h>

#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>

/*
 * The idea here is that we'll allocate enough S/G space to hold a 16MB
 * I/O.  If we get an I/O larger than that, we'll reject it.
 */
#define	CTLBLK_MAX_IO_SIZE	(16 * 1024 * 1024)
#define	CTLBLK_MAX_SEGS		((CTLBLK_MAX_IO_SIZE / MAXPHYS) + 1)
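/*
 * With the stock MAXPHYS of 128 KB, for example, this works out to
 * (16 MB / 128 KB) + 1 = 129 S/G segments per I/O.
 */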

#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif
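/*
 * CTLBLK_DEBUG is a compile-time option; building with -DCTLBLK_DEBUG
 * turns the DPRINTF() calls throughout this file into console output.
 */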

SDT_PROVIDER_DEFINE(cbb);

typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,
	CTL_BE_BLOCK_LUN_CONFIG_ERR	= 0x02,
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,
	CTL_BE_BLOCK_LUN_MULTI_THREAD	= 0x08
} ctl_be_block_lun_flags;

typedef enum {
	CTL_BE_BLOCK_NONE,
	CTL_BE_BLOCK_DEV,
	CTL_BE_BLOCK_FILE
} ctl_be_block_type;

struct ctl_be_block_devdata {
	struct cdev *cdev;
	struct cdevsw *csw;
	int dev_ref;
};

struct ctl_be_block_filedata {
	struct ucred *cred;
};

union ctl_be_block_bedata {
	struct ctl_be_block_devdata dev;
	struct ctl_be_block_filedata file;
};

struct ctl_be_block_io;
struct ctl_be_block_lun;

typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);

/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 */
struct ctl_be_block_lun {
	struct ctl_block_disk *disk;
	char lunname[32];
	char *dev_path;
	ctl_be_block_type dev_type;
	struct vnode *vn;
	union ctl_be_block_bedata backend;
	cbb_dispatch_t dispatch;
	cbb_dispatch_t lun_flush;
	struct mtx lock;
	uma_zone_t lun_zone;
	uint64_t size_blocks;
	uint64_t size_bytes;
	uint32_t blocksize;
	int blocksize_shift;
	struct ctl_be_block_softc *softc;
	struct devstat *disk_stats;
	ctl_be_block_lun_flags flags;
	STAILQ_ENTRY(ctl_be_block_lun) links;
	struct ctl_be_lun ctl_be_lun;
	struct taskqueue *io_taskqueue;
	struct task io_task;
	int num_threads;
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue;
};

/*
 * Overall softc structure for the block backend module.
 */
struct ctl_be_block_softc {
	STAILQ_HEAD(, ctl_be_block_io)   beio_free_queue;
	struct mtx			 lock;
	int				 prealloc_beio;
	int				 num_disks;
	STAILQ_HEAD(, ctl_block_disk)	 disk_list;
	int				 num_luns;
	STAILQ_HEAD(, ctl_be_block_lun)	 lun_list;
};

static struct ctl_be_block_softc backend_block_softc;

/*
 * Per-I/O information.
 */
struct ctl_be_block_io {
	union ctl_io			*io;
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
	int				bio_cmd;
	int				bio_flags;
	int				num_segs;
	int				num_bios_sent;
	int				num_bios_done;
	int				send_complete;
	int				num_errors;
	struct bintime			ds_t0;
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;
	uint64_t			io_offset;
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
	STAILQ_ENTRY(ctl_be_block_io)	links;
};
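/*
 * A beio is pulled off the softc's free list by ctl_alloc_beio(), stashed
 * in io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr for the life of the request,
 * and returned to the free list by ctl_free_beio() on completion.
 */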

static int cbb_num_threads = 14;
TUNABLE_INT("kern.cam.ctl.block.num_threads", &cbb_num_threads);
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD, 0,
	    "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RW,
           &cbb_num_threads, 0, "Number of threads per backing file");
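/*
 * Example (sketch): the default of 14 worker threads can be overridden
 * at boot via loader.conf:
 *	kern.cam.ctl.block.num_threads="8"
 * or, since the OID is CTLFLAG_RW, adjusted at runtime:
 *	sysctl kern.cam.ctl.block.num_threads=8
 */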

static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
static void ctl_free_beio(struct ctl_be_block_io *beio);
static int ctl_grow_beio(struct ctl_be_block_softc *softc, int count);
#if 0
static void ctl_shrink_beio(struct ctl_be_block_softc *softc);
#endif
static void ctl_complete_beio(struct ctl_be_block_io *beio);
static int ctl_be_block_move_done(union ctl_io *io);
static void ctl_be_block_biodone(struct bio *bio);
static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
				    struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
				       struct ctl_be_block_io *beio);
static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
				   struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
				      struct ctl_be_block_io *beio);
static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
				    union ctl_io *io);
static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
				  union ctl_io *io);
static void ctl_be_block_worker(void *context, int pending);
static int ctl_be_block_submit(union ctl_io *io);
static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
				   int flag, struct thread *td);
static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_lun_req *req);
static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
				 struct ctl_lun_req *req);
static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
static int ctl_be_block_open(struct ctl_be_block_softc *softc,
			     struct ctl_be_block_lun *be_lun,
			     struct ctl_lun_req *req);
static int ctl_be_block_create(struct ctl_be_block_softc *softc,
			       struct ctl_lun_req *req);
static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static int ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_lun_req *req);
static int ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
				 struct ctl_lun_req *req);
static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static void ctl_be_block_lun_shutdown(void *be_lun);
static void ctl_be_block_lun_config_status(void *be_lun,
					   ctl_lun_config_status status);
static int ctl_be_block_config_write(union ctl_io *io);
static int ctl_be_block_config_read(union ctl_io *io);
static int ctl_be_block_lun_info(void *be_lun, struct sbuf *sb);
int ctl_be_block_init(void);

static struct ctl_backend_driver ctl_be_block_driver =
{
	.name = "block",
	.flags = CTL_BE_FLAG_HAS_CONFIG,
	.init = ctl_be_block_init,
	.data_submit = ctl_be_block_submit,
	.data_move_done = ctl_be_block_move_done,
	.config_read = ctl_be_block_config_read,
	.config_write = ctl_be_block_config_write,
	.ioctl = ctl_be_block_ioctl,
	.lun_info = ctl_be_block_lun_info
};

MALLOC_DEFINE(M_CTLBLK, "ctlblk", "Memory used for CTL block backend");
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);

static struct ctl_be_block_io *
ctl_alloc_beio(struct ctl_be_block_softc *softc)
{
	struct ctl_be_block_io *beio;
	int count;

	mtx_lock(&softc->lock);

	beio = STAILQ_FIRST(&softc->beio_free_queue);
	if (beio != NULL) {
		STAILQ_REMOVE(&softc->beio_free_queue, beio,
			      ctl_be_block_io, links);
	}
	mtx_unlock(&softc->lock);

	if (beio != NULL) {
		bzero(beio, sizeof(*beio));
		beio->softc = softc;
		return (beio);
	}

	for (;;) {

		count = ctl_grow_beio(softc, /*count*/ 10);

		/*
		 * This shouldn't be possible, since ctl_grow_beio() uses a
		 * blocking malloc.
		 */
		if (count == 0)
			return (NULL);

		/*
		 * Since we have to drop the lock when we're allocating beio
		 * structures, it's possible someone else can come along and
		 * allocate the beio's we've just allocated.
		 */
		mtx_lock(&softc->lock);
		beio = STAILQ_FIRST(&softc->beio_free_queue);
		if (beio != NULL) {
			STAILQ_REMOVE(&softc->beio_free_queue, beio,
				      ctl_be_block_io, links);
		}
		mtx_unlock(&softc->lock);

		if (beio != NULL) {
			bzero(beio, sizeof(*beio));
			beio->softc = softc;
			break;
		}
	}
	return (beio);
}

static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
	struct ctl_be_block_softc *softc;
	int duplicate_free;
	int i;

	softc = beio->softc;
	duplicate_free = 0;

	for (i = 0; i < beio->num_segs; i++) {
		if (beio->sg_segs[i].addr == NULL)
			duplicate_free++;

		uma_zfree(beio->lun->lun_zone, beio->sg_segs[i].addr);
		beio->sg_segs[i].addr = NULL;
	}

	if (duplicate_free > 0) {
		printf("%s: %d duplicate frees out of %d segments\n", __func__,
		       duplicate_free, beio->num_segs);
	}
	mtx_lock(&softc->lock);
	STAILQ_INSERT_TAIL(&softc->beio_free_queue, beio, links);
	mtx_unlock(&softc->lock);
}

static int
ctl_grow_beio(struct ctl_be_block_softc *softc, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		struct ctl_be_block_io *beio;

		beio = (struct ctl_be_block_io *)malloc(sizeof(*beio),
							   M_CTLBLK,
							   M_WAITOK | M_ZERO);
		if (beio == NULL)
			break;

		bzero(beio, sizeof(*beio));
		beio->softc = softc;
		mtx_lock(&softc->lock);
		STAILQ_INSERT_TAIL(&softc->beio_free_queue, beio, links);
		mtx_unlock(&softc->lock);
	}

	return (i);
}

#if 0
static void
ctl_shrink_beio(struct ctl_be_block_softc *softc)
{
	struct ctl_be_block_io *beio, *beio_tmp;

	mtx_lock(&softc->lock);
	STAILQ_FOREACH_SAFE(beio, &softc->beio_free_queue, links, beio_tmp) {
		STAILQ_REMOVE(&softc->beio_free_queue, beio,
			      ctl_be_block_io, links);
		free(beio, M_CTLBLK);
	}
	mtx_unlock(&softc->lock);
}
#endif

static void
ctl_complete_beio(struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	int io_len;

	io = beio->io;

	if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
		io_len = beio->io_len;
	else
		io_len = 0;

	devstat_end_transaction(beio->lun->disk_stats,
				/*bytes*/ io_len,
				beio->ds_tag_type,
				beio->ds_trans_type,
				/*now*/ NULL,
				/*then*/&beio->ds_t0);

	ctl_free_beio(beio);
	ctl_done(io);
}

static int
ctl_be_block_move_done(union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
#ifdef CTL_TIME_IO
	struct bintime cur_bt;
#endif

	beio = (struct ctl_be_block_io *)
		io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

	be_lun = beio->lun;

	DPRINTF("entered\n");

#ifdef CTL_TIME_IO
	getbintime(&cur_bt);
	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
	io->io_hdr.num_dmas++;
#endif

	/*
	 * We set status at this point for read commands, and write
	 * commands with errors.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 && (io->io_hdr.port_status == 0)
	 && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE))
		ctl_set_success(&io->scsiio);
	else if ((io->io_hdr.port_status != 0)
	      && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
	      && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		/*
		 * For hardware error sense keys, the sense key
		 * specific value is defined to be a retry count,
		 * but we use it to pass back an internal FETD
		 * error code.  XXX KDM  Hopefully the FETD is only
		 * using 16 bits for an error code, since that's
		 * all the space we have in the sks field.
		 */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/
					 io->io_hdr.port_status);
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}

static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	mtx_lock(&be_lun->lock);
	if (bio->bio_error != 0)
		beio->num_errors++;

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	mtx_unlock(&be_lun->lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	if (beio->num_errors > 0) {
		if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there a better error here? */
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xbad2);
		} else
			ctl_set_medium_error(&io->scsiio);
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a flush, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
			struct ctl_be_block_io *beio)
{
	union ctl_io *io;
	struct mount *mountpoint;
	int vfs_is_locked, error, lock_flags;

	DPRINTF("entered\n");

	io = beio->io;

	vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	if (MNT_SHARED_WRITES(mountpoint)
	 || ((mountpoint == NULL)
	  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	vn_lock(be_lun->vn, lock_flags | LK_RETRY);

	binuptime(&beio->ds_t0);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

	error = VOP_FSYNC(be_lun->vn, MNT_WAIT, curthread);
	VOP_UNLOCK(be_lun->vn, 0);

	vn_finished_write(mountpoint);

	VFS_UNLOCK_GIANT(vfs_is_locked);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there a better error here? */
		ctl_set_internal_failure(&io->scsiio,
					 /*sks_valid*/ 1,
					 /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}

SDT_PROBE_DEFINE1(cbb, kernel, read, file_start, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_start, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, file_done, file_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, file_done, file_done, "uint64_t");
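/*
 * These static probes can be watched from userland with DTrace, e.g.
 * (sketch): dtrace -n 'cbb:kernel:read:file_start { @reads = count(); }'
 */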

static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	int vfs_is_locked, flags;
	int error, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = beio->bio_flags;

	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, file_start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, write, file_start, 0, 0, 0, 0, 0);
	}

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ)
		xuio.uio_rw = UIO_READ;
	else
		xuio.uio_rw = UIO_WRITE;

	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		binuptime(&beio->ds_t0);
		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 *
		 * So, to attempt to provide some barrier semantics in the
		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
		 */
		error = VOP_READ(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);

		VOP_UNLOCK(be_lun->vn, 0);
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		if (MNT_SHARED_WRITES(mountpoint)
		 || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;

		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		binuptime(&beio->ds_t0);
		devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into the cache.)
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 *
		 * So if we've got the BIO_ORDERED flag set, we want
		 * IO_SYNC in either the UFS or ZFS case.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, (flags & BIO_ORDERED) ?
				  IO_SYNC : 0, file_data->cred);
		VOP_UNLOCK(be_lun->vn, 0);

		vn_finished_write(mountpoint);
	}
	VFS_UNLOCK_GIANT(vfs_is_locked);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		char path_str[32];

		ctl_scsi_path_string(io, path_str, sizeof(path_str));
		/*
		 * XXX KDM ZFS returns ENOSPC when the underlying
		 * filesystem fills up.  What kind of SCSI error should we
		 * return for that?
		 */
		printf("%s%s command returned errno %d\n", path_str,
		       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", error);
		ctl_set_medium_error(&io->scsiio);
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if (beio->bio_cmd == BIO_WRITE) {
		ctl_set_success(&io->scsiio);
		SDT_PROBE(cbb, kernel, write, file_done, 0, 0, 0, 0, 0);
		ctl_complete_beio(beio);
	} else {
		SDT_PROBE(cbb, kernel, read, file_done, 0, 0, 0, 0, 0);
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

static void
ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
		       struct ctl_be_block_io *beio)
{
	struct bio *bio;
	union ctl_io *io;
	struct ctl_be_block_devdata *dev_data;

	dev_data = &be_lun->backend.dev;
	io = beio->io;

	DPRINTF("entered\n");

	/* This can't fail, it's a blocking allocation. */
	bio = g_alloc_bio();

	bio->bio_cmd	    = BIO_FLUSH;
	bio->bio_flags	   |= BIO_ORDERED;
	bio->bio_dev	    = dev_data->cdev;
	bio->bio_offset	    = 0;
	bio->bio_data	    = 0;
	bio->bio_done	    = ctl_be_block_biodone;
	bio->bio_caller1    = beio;
	bio->bio_pblkno	    = 0;

	/*
	 * We don't need to acquire the LUN lock here, because we are only
	 * sending one bio, and so there is no other context to synchronize
	 * with.
	 */
	beio->num_bios_sent = 1;
	beio->send_complete = 1;

	binuptime(&beio->ds_t0);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

	(*dev_data->csw->d_strategy)(bio);
}

static void
ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
			  struct ctl_be_block_io *beio)
{
	int i;
	struct bio *bio;
	struct ctl_be_block_devdata *dev_data;
	off_t cur_offset;
	int max_iosize;

	DPRINTF("entered\n");

	dev_data = &be_lun->backend.dev;

	/*
	 * We have to limit our I/O size to the maximum supported by the
	 * backend device.  Hopefully it is MAXPHYS.  If the driver doesn't
	 * set it properly, use DFLTPHYS.
	 */
	max_iosize = dev_data->cdev->si_iosize_max;
	if (max_iosize < PAGE_SIZE)
		max_iosize = DFLTPHYS;
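	/*
	 * Each S/G segment below is carved into bios of at most max_iosize
	 * bytes; since segments are themselves at most MAXPHYS bytes, a
	 * driver that sets si_iosize_max to MAXPHYS gets one bio per segment.
	 */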

	cur_offset = beio->io_offset;

	/*
	 * XXX KDM need to accurately reflect the number of I/Os outstanding
	 * to a device.
	 */
	binuptime(&beio->ds_t0);
	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);

	for (i = 0; i < beio->num_segs; i++) {
		size_t cur_size;
		uint8_t *cur_ptr;

		cur_size = beio->sg_segs[i].len;
		cur_ptr = beio->sg_segs[i].addr;

		while (cur_size > 0) {
			/* This can't fail, it's a blocking allocation. */
			bio = g_alloc_bio();

			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));

			bio->bio_cmd = beio->bio_cmd;
			bio->bio_flags |= beio->bio_flags;
			bio->bio_dev = dev_data->cdev;
			bio->bio_caller1 = beio;
			bio->bio_length = min(cur_size, max_iosize);
			bio->bio_offset = cur_offset;
			bio->bio_data = cur_ptr;
			bio->bio_done = ctl_be_block_biodone;
			bio->bio_pblkno = cur_offset / be_lun->blocksize;

			cur_offset += bio->bio_length;
			cur_ptr += bio->bio_length;
			cur_size -= bio->bio_length;

			/*
			 * Make sure we set the complete bit just before we
			 * issue the last bio so we don't wind up with a
			 * race.
			 *
			 * Use the LUN mutex here instead of a combination
			 * of atomic variables for simplicity.
			 *
			 * XXX KDM we could have a per-IO lock, but that
			 * would cause additional per-IO setup and teardown
			 * overhead.  Hopefully there won't be too much
			 * contention on the LUN lock.
			 */
			mtx_lock(&be_lun->lock);

			beio->num_bios_sent++;

			if ((i == beio->num_segs - 1)
			 && (cur_size == 0))
				beio->send_complete = 1;

			mtx_unlock(&be_lun->lock);

			(*dev_data->csw->d_strategy)(bio);
		}
	}
}

static void
ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
			 union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;

	DPRINTF("entered\n");

	softc = be_lun->softc;
	beio = ctl_alloc_beio(softc);
	if (beio == NULL) {
		/*
		 * This should not happen.  ctl_alloc_beio() will call
		 * ctl_grow_beio() with a blocking malloc as needed.
		 * A malloc with M_WAITOK should not fail.
		 */
		ctl_set_busy(&io->scsiio);
		ctl_done(io);
		return;
	}

	beio->io = io;
	beio->softc = softc;
	beio->lun = be_lun;
	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		beio->ds_trans_type = DEVSTAT_NO_DATA;
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		beio->io_len = 0;
		be_lun->lun_flush(be_lun, beio);
		break;
	default:
		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
		break;
	}
}

SDT_PROBE_DEFINE1(cbb, kernel, read, start, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, start, start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, read, alloc_done, alloc_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, kernel, write, alloc_done, alloc_done, "uint64_t");

static void
ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
			   union ctl_io *io)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_softc *softc;
	struct ctl_lba_len lbalen;
	uint64_t len_left, io_size_bytes;
	int i;

	softc = be_lun->softc;

	DPRINTF("entered\n");

	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
		SDT_PROBE(cbb, kernel, read, start, 0, 0, 0, 0, 0);
	} else {
		SDT_PROBE(cbb, kernel, write, start, 0, 0, 0, 0, 0);
	}

	memcpy(&lbalen, io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
	       sizeof(lbalen));

	io_size_bytes = lbalen.len * be_lun->blocksize;

	/*
	 * XXX KDM this is temporary, until we implement chaining of beio
	 * structures and multiple datamove calls to move all the data in
	 * or out.
	 */
	if (io_size_bytes > CTLBLK_MAX_IO_SIZE) {
		printf("%s: IO length %ju > max io size %u\n", __func__,
		       (uintmax_t)io_size_bytes, CTLBLK_MAX_IO_SIZE);
		ctl_set_invalid_field(&io->scsiio,
				      /*sks_valid*/ 0,
				      /*command*/ 1,
				      /*field*/ 0,
				      /*bit_valid*/ 0,
				      /*bit*/ 0);
		ctl_done(io);
		return;
	}

	beio = ctl_alloc_beio(softc);
	if (beio == NULL) {
		/*
		 * This should not happen.  ctl_alloc_beio() will call
		 * ctl_grow_beio() with a blocking malloc as needed.
		 * A malloc with M_WAITOK should not fail.
		 */
		ctl_set_busy(&io->scsiio);
		ctl_done(io);
		return;
	}

	beio->io = io;
	beio->softc = softc;
	beio->lun = be_lun;
	io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr = beio;

	/*
	 * If the I/O came down with an ordered or head of queue tag, set
	 * the BIO_ORDERED attribute.  For head of queue tags, that's
	 * pretty much the best we can do.
	 *
	 * XXX KDM we don't have a great way to easily know about the FUA
	 * bit right now (it is decoded in ctl_read_write(), but we don't
	 * pass that knowledge to the backend), and in any case we would
	 * need to determine how to handle it.
	 */
	if ((io->scsiio.tag_type == CTL_TAG_ORDERED)
	 || (io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE))
		beio->bio_flags = BIO_ORDERED;

	switch (io->scsiio.tag_type) {
	case CTL_TAG_ORDERED:
		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
		break;
	case CTL_TAG_HEAD_OF_QUEUE:
		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
		break;
	case CTL_TAG_UNTAGGED:
	case CTL_TAG_SIMPLE:
	case CTL_TAG_ACA:
	default:
		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
		break;
	}

	/*
	 * This path handles read and write only.  The config write path
	 * handles flush operations.
	 */
	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
		beio->bio_cmd = BIO_READ;
		beio->ds_trans_type = DEVSTAT_READ;
	} else {
		beio->bio_cmd = BIO_WRITE;
		beio->ds_trans_type = DEVSTAT_WRITE;
	}

	beio->io_len = lbalen.len * be_lun->blocksize;
	beio->io_offset = lbalen.lba * be_lun->blocksize;

	DPRINTF("%s at LBA %jx len %u\n",
	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
	       (uintmax_t)lbalen.lba, lbalen.len);

	for (i = 0, len_left = io_size_bytes; i < CTLBLK_MAX_SEGS &&
	     len_left > 0; i++) {

		/*
		 * Setup the S/G entry for this chunk.
		 */
		beio->sg_segs[i].len = min(MAXPHYS, len_left);
		beio->sg_segs[i].addr = uma_zalloc(be_lun->lun_zone, M_WAITOK);
		/*
		 * uma_zalloc() can in theory return NULL even with M_WAITOK
		 * if it can't pull more memory into the zone.
		 */
		if (beio->sg_segs[i].addr == NULL) {
			ctl_set_busy(&io->scsiio);
			ctl_complete_beio(beio);
			return;
		}

		DPRINTF("segment %d addr %p len %zd\n", i,
			beio->sg_segs[i].addr, beio->sg_segs[i].len);

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE(cbb, kernel, read, alloc_done, 0, 0, 0, 0, 0);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE(cbb, kernel, write, alloc_done, 0, 0, 0, 0, 0);
		io->scsiio.be_move_done = ctl_be_block_move_done;
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
		io->scsiio.kern_data_len = beio->io_len;
		io->scsiio.kern_total_len = beio->io_len;
		io->scsiio.kern_rel_offset = 0;
		io->scsiio.kern_data_resid = 0;
		io->scsiio.kern_sg_entries = beio->num_segs;
		io->io_hdr.flags |= CTL_FLAG_ALLOCATED | CTL_FLAG_KDPTR_SGLIST;
#ifdef CTL_TIME_IO
		getbintime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_block_softc *softc;
	union ctl_io *io;

	be_lun = (struct ctl_be_block_lun *)context;
	softc = be_lun->softc;

	DPRINTF("entered\n");

	mtx_lock(&be_lun->lock);
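	/*
	 * Drain the queues in priority order: finished datamoves first,
	 * then config writes, then new I/O from the input queue.
	 */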
	for (;;) {
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			struct ctl_be_block_io *beio;

			DPRINTF("datamove queue\n");

			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
				      ctl_io_hdr, links);

			mtx_unlock(&be_lun->lock);

			beio = (struct ctl_be_block_io *)
			    io->io_hdr.ctl_private[CTL_PRIV_BACKEND].ptr;

			be_lun->dispatch(be_lun, beio);

			mtx_lock(&be_lun->lock);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {

			DPRINTF("config write queue\n");

			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
				      ctl_io_hdr, links);

			mtx_unlock(&be_lun->lock);

			ctl_be_block_cw_dispatch(be_lun, io);

			mtx_lock(&be_lun->lock);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");

			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
				      ctl_io_hdr, links);
			mtx_unlock(&be_lun->lock);

			/*
			 * We must drop the lock, since this routine and
			 * its children may sleep.
			 */
			ctl_be_block_dispatch(be_lun, io);

			mtx_lock(&be_lun->lock);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		break;
	}
	mtx_unlock(&be_lun->lock);
}

/*
 * Entry point from CTL to the backend for I/O.  We queue everything to a
 * work thread, so this just puts the I/O on a queue and wakes up the
 * thread.
 */
static int
ctl_be_block_submit(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *ctl_be_lun;
	int retval;

	DPRINTF("entered\n");

	retval = CTL_RETVAL_COMPLETE;

	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;

	/*
	 * Make sure we only get SCSI I/O.
	 */
	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
		"%#x) encountered", io->io_hdr.io_type));

	mtx_lock(&be_lun->lock);
	/*
	 * XXX KDM make sure that links is okay to use at this point.
	 * Otherwise, we either need to add another field to ctl_io_hdr,
	 * or deal with resource allocation here.
	 */
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->lock);

	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (retval);
}

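/*
 * Example (sketch): the LUN requests handled below normally originate
 * from ctladm(8); e.g. "ctladm create -b block -o file=/dev/md0" arrives
 * as a CTL_LUN_REQ ioctl with reqtype CTL_LUNREQ_CREATE.
 */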
static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
			int flag, struct thread *td)
{
	struct ctl_be_block_softc *softc;
	int error;

	softc = &backend_block_softc;

	error = 0;

	switch (cmd) {
	case CTL_LUN_REQ: {
		struct ctl_lun_req *lun_req;

		lun_req = (struct ctl_lun_req *)addr;

		switch (lun_req->reqtype) {
		case CTL_LUNREQ_CREATE:
			error = ctl_be_block_create(softc, lun_req);
			break;
		case CTL_LUNREQ_RM:
			error = ctl_be_block_rm(softc, lun_req);
			break;
		case CTL_LUNREQ_MODIFY:
			error = ctl_be_block_modify(softc, lun_req);
			break;
		default:
			lun_req->status = CTL_LUN_ERROR;
			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
				 "%s: invalid LUN request type %d", __func__,
				 lun_req->reqtype);
			break;
		}
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	struct vattr		      vattr;
	int			      error;

	error = 0;
	file_data = &be_lun->backend.file;
	params = &req->reqdata.create;

	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	/*
	 * Verify that we have the ability to upgrade to exclusive
	 * access on this file so we can trap errors at open instead
	 * of reporting them during first access.
	 */
	if (VOP_ISLOCKED(be_lun->vn) != LK_EXCLUSIVE) {
		vn_lock(be_lun->vn, LK_UPGRADE | LK_RETRY);
		if (be_lun->vn->v_iflag & VI_DOOMED) {
			error = EBADF;
			snprintf(req->error_str, sizeof(req->error_str),
				 "error locking file %s", be_lun->dev_path);
			return (error);
		}
	}

	file_data->cred = crhold(curthread->td_ucred);
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;
	/*
	 * We set the multi thread flag for file operations because all
	 * filesystems (in theory) are capable of allowing multiple readers
	 * of a file at once.  So we want to get the maximum possible
	 * concurrency.
	 */
	be_lun->flags |= CTL_BE_BLOCK_LUN_MULTI_THREAD;

	/*
	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
	 * may not work with other OSes as well.  So just export a sector
	 * size of 512 bytes, which should work with any OS or
	 * application.  Since our backing is a file, any block size will
	 * work fine for the backing store.
	 */
#if 0
	be_lun->blocksize= vattr.va_blocksize;
#endif
	if (params->blocksize_bytes != 0)
		be_lun->blocksize = params->blocksize_bytes;
	else
		be_lun->blocksize = 512;

	/*
	 * Sanity check.  The media size has to be at least one
	 * sector long.
	 */
	if (be_lun->size_bytes < be_lun->blocksize) {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "file %s size %ju < block size %u", be_lun->dev_path,
			 (uintmax_t)be_lun->size_bytes, be_lun->blocksize);
	}
	return (error);
}

static int
ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_lun_create_params *params;
	struct vattr		      vattr;
	struct cdev		     *dev;
	struct cdevsw		     *devsw;
	int			      error;

	params = &req->reqdata.create;

	be_lun->dev_type = CTL_BE_BLOCK_DEV;
	be_lun->dispatch = ctl_be_block_dispatch_dev;
	be_lun->lun_flush = ctl_be_block_flush_dev;
	be_lun->backend.dev.cdev = be_lun->vn->v_rdev;
	be_lun->backend.dev.csw = dev_refthread(be_lun->backend.dev.cdev,
					     &be_lun->backend.dev.dev_ref);
	if (be_lun->backend.dev.csw == NULL)
		panic("Unable to retrieve device switch");

	error = VOP_GETATTR(be_lun->vn, &vattr, NOCRED);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error getting vnode attributes for device %s",
			 __func__, be_lun->dev_path);
		return (error);
	}

	dev = be_lun->vn->v_rdev;
	devsw = dev->si_devsw;
	if (!devsw->d_ioctl) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: no d_ioctl for device %s!", __func__,
			 be_lun->dev_path);
		return (ENODEV);
	}

	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
			       (caddr_t)&be_lun->blocksize, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGSECTORSIZE ioctl "
			 "on %s!", __func__, error, be_lun->dev_path);
		return (error);
	}

	/*
	 * If the user has asked for a blocksize that is greater than the
	 * backing device's blocksize, we can do it only if the blocksize
	 * the user is asking for is an even multiple of the underlying
	 * device's blocksize.
	 */
	if ((params->blocksize_bytes != 0)
	 && (params->blocksize_bytes > be_lun->blocksize)) {
		uint32_t bs_multiple, tmp_blocksize;

		bs_multiple = params->blocksize_bytes / be_lun->blocksize;

		tmp_blocksize = bs_multiple * be_lun->blocksize;

		if (tmp_blocksize == params->blocksize_bytes) {
			be_lun->blocksize = params->blocksize_bytes;
		} else {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested blocksize %u is not an even "
				 "multiple of backing device blocksize %u",
				 __func__, params->blocksize_bytes,
				 be_lun->blocksize);
			return (EINVAL);

		}
	} else if ((params->blocksize_bytes != 0)
		&& (params->blocksize_bytes != be_lun->blocksize)) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: requested blocksize %u < backing device "
			 "blocksize %u", __func__, params->blocksize_bytes,
			 be_lun->blocksize);
		return (EINVAL);
	}
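	/*
	 * The checks above accept, e.g., a 4096-byte blocksize on a
	 * 512-byte sector device (4096 = 8 * 512), and reject 520 bytes
	 * as not an even multiple.
	 */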

	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
			       (caddr_t)&be_lun->size_bytes, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGMEDIASIZE "
			 "ioctl on %s!", __func__, error,
			 be_lun->dev_path);
		return (error);
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes > be_lun->size_bytes) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested LUN size %ju > backing device "
				 "size %ju", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)be_lun->size_bytes);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	}

	return (0);
}

static int
ctl_be_block_close(struct ctl_be_block_lun *be_lun)
{
	DROP_GIANT();
	if (be_lun->vn) {
		int flags = FREAD | FWRITE;
		int vfs_is_locked = 0;

		switch (be_lun->dev_type) {
		case CTL_BE_BLOCK_DEV:
			if (be_lun->backend.dev.csw) {
				dev_relthread(be_lun->backend.dev.cdev,
					      be_lun->backend.dev.dev_ref);
				be_lun->backend.dev.csw  = NULL;
				be_lun->backend.dev.cdev = NULL;
			}
			break;
		case CTL_BE_BLOCK_FILE:
			vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
			break;
		case CTL_BE_BLOCK_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}

		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
		be_lun->vn = NULL;

		switch (be_lun->dev_type) {
		case CTL_BE_BLOCK_DEV:
			break;
		case CTL_BE_BLOCK_FILE:
			VFS_UNLOCK_GIANT(vfs_is_locked);
			if (be_lun->backend.file.cred != NULL) {
				crfree(be_lun->backend.file.cred);
				be_lun->backend.file.cred = NULL;
			}
			break;
		case CTL_BE_BLOCK_NONE:
		default:
			panic("Unexpected backend type.");
			break;
		}
	}
	PICKUP_GIANT();

	return (0);
}

static int
ctl_be_block_open(struct ctl_be_block_softc *softc,
		       struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct nameidata nd;
	int		 flags;
	int		 error;
	int		 vfs_is_locked;

	/*
	 * XXX KDM allow a read-only option?
	 */
	flags = FREAD | FWRITE;
	error = 0;

	if (rootvnode == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: Root filesystem is not mounted", __func__);
		return (1);
	}

	if (!curthread->td_proc->p_fd->fd_cdir) {
		curthread->td_proc->p_fd->fd_cdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_rdir) {
		curthread->td_proc->p_fd->fd_rdir = rootvnode;
		VREF(rootvnode);
	}
	if (!curthread->td_proc->p_fd->fd_jdir) {
		curthread->td_proc->p_fd->fd_jdir = rootvnode;
		VREF(rootvnode);
	}

 again:
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error) {
		/*
		 * This is the only reasonable guess we can make as far as
		 * path if the user doesn't give us a fully qualified path.
		 * If they want to specify a file, they need to specify the
		 * full path.
		 */
		if (be_lun->dev_path[0] != '/') {
			char *dev_path = "/dev/";
			char *dev_name;

			/* Try adding device path at beginning of name */
			dev_name = malloc(strlen(be_lun->dev_path)
					+ strlen(dev_path) + 1,
					  M_CTLBLK, M_WAITOK);
			if (dev_name) {
				sprintf(dev_name, "%s%s", dev_path,
					be_lun->dev_path);
				free(be_lun->dev_path, M_CTLBLK);
				be_lun->dev_path = dev_name;
				goto again;
			}
		}
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error opening %s", __func__, be_lun->dev_path);
		return (error);
	}

	vfs_is_locked = NDHASGIANT(&nd);

	NDFREE(&nd, NDF_ONLY_PNBUF);

	be_lun->vn = nd.ni_vp;

	/* We only support disks and files. */
	if (vn_isdisk(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn, 0);
	VFS_UNLOCK_GIANT(vfs_is_locked);

	if (error != 0) {
		ctl_be_block_close(be_lun);
		return (error);
	}

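	/*
	 * Example: with the default 512-byte blocksize, fls(512) - 1 = 9,
	 * so the block count below is size_bytes >> 9.
	 */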
	be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;

	return (0);
}

static int
ctl_be_block_mem_ctor(void *mem, int size, void *arg, int flags)
{
	return (0);
}

static void
ctl_be_block_mem_dtor(void *mem, int size, void *arg)
{
	bzero(mem, size);
}

static int
ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_lun_create_params *params;
	struct ctl_be_arg *file_arg;
	char tmpstr[32];
	int retval, num_threads;
	int i;

	params = &req->reqdata.create;
	retval = 0;

	num_threads = cbb_num_threads;

	file_arg = NULL;

	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error allocating %zd bytes", __func__,
			 sizeof(*be_lun));
		goto bailout_error;
	}

	be_lun->softc = softc;
	STAILQ_INIT(&be_lun->input_queue);
	STAILQ_INIT(&be_lun->config_write_queue);
	STAILQ_INIT(&be_lun->datamove_queue);
	sprintf(be_lun->lunname, "cblk%d", softc->num_luns);
	mtx_init(&be_lun->lock, be_lun->lunname, NULL, MTX_DEF);

	be_lun->lun_zone = uma_zcreate(be_lun->lunname, MAXPHYS,
	    ctl_be_block_mem_ctor, ctl_be_block_mem_dtor, NULL, NULL,
	    /*align*/ 0, /*flags*/0);

	if (be_lun->lun_zone == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error allocating UMA zone", __func__);
		goto bailout_error;
	}

	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
		be_lun->ctl_be_lun.lun_type = params->device_type;
	else
		be_lun->ctl_be_lun.lun_type = T_DIRECT;

	if (be_lun->ctl_be_lun.lun_type == T_DIRECT) {
		for (i = 0; i < req->num_be_args; i++) {
			if (strcmp(req->kern_be_args[i].name, "file") == 0) {
				file_arg = &req->kern_be_args[i];
				break;
			}
		}

		if (file_arg == NULL) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: no file argument specified", __func__);
			goto bailout_error;
		}

		be_lun->dev_path = malloc(file_arg->vallen, M_CTLBLK,
					  M_WAITOK | M_ZERO);
		if (be_lun->dev_path == NULL) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: error allocating %d bytes", __func__,
				 file_arg->vallen);
			goto bailout_error;
		}

		strlcpy(be_lun->dev_path, (char *)file_arg->value,
			file_arg->vallen);

		retval = ctl_be_block_open(softc, be_lun, req);
		if (retval != 0) {
			retval = 0;
			goto bailout_error;
		}

		/*
		 * Tell the user the size of the file/device.
		 */
		params->lun_size_bytes = be_lun->size_bytes;

		/*
		 * The maximum LBA is the size - 1.
		 */
		be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
	} else {
		/*
		 * For processor devices, we don't have any size.
		 */
		be_lun->blocksize = 0;
		be_lun->size_blocks = 0;
		be_lun->size_bytes = 0;
		be_lun->ctl_be_lun.maxlba = 0;
		params->lun_size_bytes = 0;

		/*
		 * Default to just 1 thread for processor devices.
		 */
		num_threads = 1;
	}

	/*
	 * XXX This searching loop might be refactored to be combined with
	 * the loop above.
1752	 */
1753	for (i = 0; i < req->num_be_args; i++) {
1754		if (strcmp(req->kern_be_args[i].name, "num_threads") == 0) {
1755			struct ctl_be_arg *thread_arg;
1756			char num_thread_str[16];
1757			int tmp_num_threads;
1758
1759
1760			thread_arg = &req->kern_be_args[i];
1761
1762			strlcpy(num_thread_str, (char *)thread_arg->value,
1763				min(thread_arg->vallen,
1764				sizeof(num_thread_str)));
1765
1766			tmp_num_threads = strtol(num_thread_str, NULL, 0);
1767
1768			/*
1769			 * We don't let the user specify less than one
1770			 * thread, but hope he's clueful enough not to
1771			 * specify 1000 threads.
1772			 */
1773			if (tmp_num_threads < 1) {
1774				snprintf(req->error_str, sizeof(req->error_str),
1775					 "%s: invalid number of threads %s",
1776				         __func__, num_thread_str);
1777				goto bailout_error;
1778			}
1779
1780			num_threads = tmp_num_threads;
1781		}
1782	}

	be_lun->flags = CTL_BE_BLOCK_LUN_UNCONFIGURED;
	be_lun->ctl_be_lun.flags = CTL_LUN_FLAG_PRIMARY;
	be_lun->ctl_be_lun.be_lun = be_lun;
	be_lun->ctl_be_lun.blocksize = be_lun->blocksize;
	/* Tell the user the blocksize we ended up using */
	params->blocksize_bytes = be_lun->blocksize;
	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
		be_lun->ctl_be_lun.req_lun_id = params->req_lun_id;
		be_lun->ctl_be_lun.flags |= CTL_LUN_FLAG_ID_REQ;
	} else
		be_lun->ctl_be_lun.req_lun_id = 0;

	be_lun->ctl_be_lun.lun_shutdown = ctl_be_block_lun_shutdown;
	be_lun->ctl_be_lun.lun_config_status =
		ctl_be_block_lun_config_status;
	be_lun->ctl_be_lun.be = &ctl_be_block_driver;

	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%4d",
			 softc->num_luns);
		strncpy((char *)be_lun->ctl_be_lun.serial_num, tmpstr,
			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
			sizeof(tmpstr)));

		/* Tell the user what we used for a serial number */
		strncpy((char *)params->serial_num, tmpstr,
			ctl_min(sizeof(params->serial_num), sizeof(tmpstr)));
	} else {
		strncpy((char *)be_lun->ctl_be_lun.serial_num,
			params->serial_num,
			ctl_min(sizeof(be_lun->ctl_be_lun.serial_num),
			sizeof(params->serial_num)));
	}
	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%4d", softc->num_luns);
		strncpy((char *)be_lun->ctl_be_lun.device_id, tmpstr,
			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
			sizeof(tmpstr)));

		/* Tell the user what we used for a device ID */
		strncpy((char *)params->device_id, tmpstr,
			ctl_min(sizeof(params->device_id), sizeof(tmpstr)));
	} else {
		strncpy((char *)be_lun->ctl_be_lun.device_id,
			params->device_id,
			ctl_min(sizeof(be_lun->ctl_be_lun.device_id),
				sizeof(params->device_id)));
	}

	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);

	be_lun->io_taskqueue = taskqueue_create(be_lun->lunname, M_WAITOK,
	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);

	if (be_lun->io_taskqueue == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: Unable to create taskqueue", __func__);
		goto bailout_error;
	}

	/*
	 * Note that we start the same number of threads by default for
	 * both the file case and the block device case.  For the file
	 * case, we need multiple threads to allow concurrency, because the
	 * vnode interface is designed to be a blocking interface.  For the
	 * block device case, ZFS zvols at least will block the caller's
	 * context in many instances, and so we need multiple threads to
	 * overcome that problem.  Other block devices don't need as many
	 * threads, but they shouldn't cause too many problems.
	 *
	 * If a single thread is desired for a block device, that can be
	 * specified when the LUN is created, or the tunable/sysctl can be
	 * changed to alter the default number of threads.
	 */
	retval = taskqueue_start_threads(&be_lun->io_taskqueue,
					 /*num threads*/num_threads,
					 /*priority*/PWAIT,
					 /*thread name*/
					 "%s taskq", be_lun->lunname);

	if (retval != 0)
		goto bailout_error;
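	/*
	 * Aside (added for exposition; not in the original source): the
	 * calls above are the standard taskqueue(9) recipe for giving a
	 * driver its own pool of worker threads.  The generic shape is
	 * roughly:
	 *
	 *	TASK_INIT(&task, 0, handler_fn, arg);
	 *	tq = taskqueue_create("name", M_WAITOK,
	 *	    taskqueue_thread_enqueue, &tq);
	 *	taskqueue_start_threads(&tq, nthreads, PWAIT, "name taskq");
	 *	...
	 *	taskqueue_enqueue(tq, &task);	(* runs handler_fn(arg) *)
	 *
	 * taskqueue_thread_enqueue is the stock enqueue hook that wakes
	 * the queue's own threads; passing &tq as the creation context is
	 * what ties that hook back to the queue being created.
	 */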

	be_lun->num_threads = num_threads;

	mtx_lock(&softc->lock);
	softc->num_luns++;
	STAILQ_INSERT_TAIL(&softc->lun_list, be_lun, links);

	mtx_unlock(&softc->lock);

	retval = ctl_add_lun(&be_lun->ctl_be_lun);
	if (retval != 0) {
		mtx_lock(&softc->lock);
		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
			      links);
		softc->num_luns--;
		mtx_unlock(&softc->lock);
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: ctl_add_lun() returned error %d, see dmesg for "
			 "details", __func__, retval);
		retval = 0;
		goto bailout_error;
	}

	mtx_lock(&softc->lock);

	/*
	 * Tell the config_status routine that we're waiting so it won't
	 * clean up the LUN in the event of an error.
	 */
	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;

	while (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
		if (retval == EINTR)
			break;
	}
	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;

	if (be_lun->flags & CTL_BE_BLOCK_LUN_CONFIG_ERR) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN configuration error, see dmesg for details",
			 __func__);
		STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun,
			      links);
		softc->num_luns--;
		mtx_unlock(&softc->lock);
		goto bailout_error;
	} else {
		params->req_lun_id = be_lun->ctl_be_lun.lun_id;
	}

	mtx_unlock(&softc->lock);

	be_lun->disk_stats = devstat_new_entry("cbb", params->req_lun_id,
					       be_lun->blocksize,
					       DEVSTAT_ALL_SUPPORTED,
					       be_lun->ctl_be_lun.lun_type
					       | DEVSTAT_TYPE_IF_OTHER,
					       DEVSTAT_PRIORITY_OTHER);
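	/*
	 * Exposition (added; the tool behavior is an assumption): the
	 * devstat_new_entry() call above registers the LUN with the
	 * devstat(9) framework under the "cbb" name, with the LUN ID as
	 * the unit number, so per-LUN I/O statistics should appear to
	 * devstat consumers such as iostat(8) as "cbb0", "cbb1", etc.
	 */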

	req->status = CTL_LUN_OK;

	return (retval);

bailout_error:
	req->status = CTL_LUN_ERROR;

	ctl_be_block_close(be_lun);

	free(be_lun->dev_path, M_CTLBLK);
	free(be_lun, M_CTLBLK);

	return (retval);
}

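/*
 * Remove a LUN previously created by this backend.  Illustrative note
 * (added; the exact invocation is an assumption): a request of this
 * shape is typically generated from userland with something like
 * "ctladm remove -b block -l <lun_id>".
 */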
static int
ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_rm_params *params;
	struct ctl_be_block_lun *be_lun;
	int retval;

	params = &req->reqdata.rm;

	mtx_lock(&softc->lock);

	be_lun = NULL;

	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN %u is not managed by the block backend",
			 __func__, params->lun_id);
		goto bailout_error;
	}

	retval = ctl_disable_lun(&be_lun->ctl_be_lun);

	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned from ctl_disable_lun() for "
			 "LUN %u", __func__, retval, params->lun_id);
		goto bailout_error;
	}

	retval = ctl_invalidate_lun(&be_lun->ctl_be_lun);
	if (retval != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned from ctl_invalidate_lun() for "
			 "LUN %u", __func__, retval, params->lun_id);
		goto bailout_error;
	}

	mtx_lock(&softc->lock);

	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;

	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblk", 0);
		if (retval == EINTR)
			break;
	}

	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;

	if ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: interrupted waiting for LUN to be freed",
			 __func__);
		mtx_unlock(&softc->lock);
		goto bailout_error;
	}

	STAILQ_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links);

	softc->num_luns--;
	mtx_unlock(&softc->lock);

	taskqueue_drain(be_lun->io_taskqueue, &be_lun->io_task);

	taskqueue_free(be_lun->io_taskqueue);

	ctl_be_block_close(be_lun);

	if (be_lun->disk_stats != NULL)
		devstat_remove_entry(be_lun->disk_stats);

	uma_zdestroy(be_lun->lun_zone);

	free(be_lun->dev_path, M_CTLBLK);

	free(be_lun, M_CTLBLK);

	req->status = CTL_LUN_OK;

	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;

	return (0);
}

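/*
 * Determine the new size of a file-backed LUN.  Added exposition: if the
 * user did not specify a size, the current file size is used, obtained
 * via VOP_GETATTR(9), the in-kernel analogue of stat(2).  The caller,
 * ctl_be_block_modify() below, takes a shared vnode lock around this
 * call.
 */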
static int
ctl_be_block_modify_file(struct ctl_be_block_lun *be_lun,
			 struct ctl_lun_req *req)
{
	struct vattr vattr;
	int error;
	struct ctl_lun_modify_params *params;

	params = &req->reqdata.modify;

	if (params->lun_size_bytes != 0) {
		be_lun->size_bytes = params->lun_size_bytes;
	} else {
		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
		if (error != 0) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: error calling VOP_GETATTR() for file %s",
				 __func__, be_lun->dev_path);
			return (error);
		}

		be_lun->size_bytes = vattr.va_size;
	}

	return (0);
}

static int
ctl_be_block_modify_dev(struct ctl_be_block_lun *be_lun,
			struct ctl_lun_req *req)
{
	struct cdev *dev;
	struct cdevsw *devsw;
	int error;
	struct ctl_lun_modify_params *params;
	uint64_t size_bytes;

	params = &req->reqdata.modify;

	dev = be_lun->vn->v_rdev;
	devsw = dev->si_devsw;
	if (!devsw->d_ioctl) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: no d_ioctl for device %s!", __func__,
			 be_lun->dev_path);
		return (ENODEV);
	}

	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
			       (caddr_t)&size_bytes, FREAD,
			       curthread);
	if (error) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: error %d returned for DIOCGMEDIASIZE ioctl "
			 "on %s!", __func__, error, be_lun->dev_path);
		return (error);
	}
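	/*
	 * Exposition (added): DIOCGMEDIASIZE is the same disk(4) ioctl
	 * that userland uses to ask a device for its size in bytes,
	 * roughly:
	 *
	 *	off_t mediasize;
	 *	int fd = open("/dev/da0", O_RDONLY);	(* hypothetical *)
	 *	ioctl(fd, DIOCGMEDIASIZE, &mediasize);
	 *
	 * Here we invoke the driver's d_ioctl entry point directly, since
	 * we already hold a reference to the cdev.
	 */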

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes > size_bytes) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: requested LUN size %ju > backing device "
				 "size %ju", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 (uintmax_t)size_bytes);
			return (EINVAL);
		}

		be_lun->size_bytes = params->lun_size_bytes;
	} else {
		be_lun->size_bytes = size_bytes;
	}

	return (0);
}

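/*
 * Handle a LUN modification (resize) request.  Illustrative note (added;
 * the exact invocation is an assumption): such a request is typically
 * generated from userland with something like
 * "ctladm modify -b block -l <lun_id> -s <size_in_bytes>".
 */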
static int
ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_lun_modify_params *params;
	struct ctl_be_block_lun *be_lun;
	int vfs_is_locked, error;

	params = &req->reqdata.modify;

	mtx_lock(&softc->lock);

	be_lun = NULL;

	STAILQ_FOREACH(be_lun, &softc->lun_list, links) {
		if (be_lun->ctl_be_lun.lun_id == params->lun_id)
			break;
	}
	mtx_unlock(&softc->lock);

	if (be_lun == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s: LUN %u is not managed by the block backend",
			 __func__, params->lun_id);
		goto bailout_error;
	}

	if (params->lun_size_bytes != 0) {
		if (params->lun_size_bytes < be_lun->blocksize) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "%s: LUN size %ju < blocksize %u", __func__,
				 (uintmax_t)params->lun_size_bytes,
				 be_lun->blocksize);
			goto bailout_error;
		}
	}

	vfs_is_locked = VFS_LOCK_GIANT(be_lun->vn->v_mount);
	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

	if (be_lun->vn->v_type == VREG)
		error = ctl_be_block_modify_file(be_lun, req);
	else
		error = ctl_be_block_modify_dev(be_lun, req);

	VOP_UNLOCK(be_lun->vn, 0);
	VFS_UNLOCK_GIANT(vfs_is_locked);

	if (error != 0)
		goto bailout_error;

	be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;

	/*
	 * The maximum LBA is the size - 1.
	 *
	 * XXX: Note that this field is being updated without locking,
	 *	which might cause problems on 32-bit architectures.
	 */
	be_lun->ctl_be_lun.maxlba = be_lun->size_blocks - 1;
	ctl_lun_capacity_changed(&be_lun->ctl_be_lun);
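	/*
	 * Worked example (added for exposition): with a 512-byte
	 * blocksize, blocksize_shift is 9, so a 1 GiB (1073741824-byte)
	 * LUN yields size_blocks = 1073741824 >> 9 = 2097152, and
	 * therefore maxlba = 2097151.
	 */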

	/* Tell the user the exact size we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;

	req->status = CTL_LUN_OK;

	return (0);

bailout_error:
	req->status = CTL_LUN_ERROR;

	return (0);
}

static void
ctl_be_block_lun_shutdown(void *be_lun)
{
	struct ctl_be_block_lun *lun;
	struct ctl_be_block_softc *softc;

	lun = (struct ctl_be_block_lun *)be_lun;

	softc = lun->softc;

	mtx_lock(&softc->lock);
	lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
	if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
		wakeup(lun);
	mtx_unlock(&softc->lock);
}

static void
ctl_be_block_lun_config_status(void *be_lun, ctl_lun_config_status status)
{
	struct ctl_be_block_lun *lun;
	struct ctl_be_block_softc *softc;

	lun = (struct ctl_be_block_lun *)be_lun;
	softc = lun->softc;

	if (status == CTL_LUN_CONFIG_OK) {
		mtx_lock(&softc->lock);
		lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
		if (lun->flags & CTL_BE_BLOCK_LUN_WAITING)
			wakeup(lun);
		mtx_unlock(&softc->lock);

		/*
		 * We successfully added the LUN, attempt to enable it.
		 */
		if (ctl_enable_lun(&lun->ctl_be_lun) != 0) {
			printf("%s: ctl_enable_lun() failed!\n", __func__);
			if (ctl_invalidate_lun(&lun->ctl_be_lun) != 0) {
				printf("%s: ctl_invalidate_lun() failed!\n",
				       __func__);
			}
		}

		return;
	}

	mtx_lock(&softc->lock);
	lun->flags &= ~CTL_BE_BLOCK_LUN_UNCONFIGURED;
	lun->flags |= CTL_BE_BLOCK_LUN_CONFIG_ERR;
	wakeup(lun);
	mtx_unlock(&softc->lock);
}
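/*
 * Summary of the wakeup protocol above (added for exposition): the
 * creating/removing thread sets CTL_BE_BLOCK_LUN_WAITING and msleep()s
 * on the be_lun pointer until CTL_BE_BLOCK_LUN_UNCONFIGURED reaches the
 * state it wants; lun_config_status() and lun_shutdown() flip that flag
 * (or set CTL_BE_BLOCK_LUN_CONFIG_ERR) under softc->lock and wake the
 * sleeper.  In sketch form:
 *
 *	waiter:		mtx_lock(); flags |= WAITING;
 *			while (<not ready>)
 *				msleep(lun, &softc->lock, PCATCH, ...);
 *			flags &= ~WAITING; mtx_unlock();
 *	notifier:	mtx_lock(); <update UNCONFIGURED/CONFIG_ERR>;
 *			wakeup(lun); mtx_unlock();
 */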

static int
ctl_be_block_config_write(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;
	struct ctl_be_lun *ctl_be_lun;
	int retval;

	retval = 0;

	DPRINTF("entered\n");

	ctl_be_lun = (struct ctl_be_lun *)io->io_hdr.ctl_private[
		CTL_PRIV_BACKEND_LUN].ptr;
	be_lun = (struct ctl_be_block_lun *)ctl_be_lun->be_lun;

	switch (io->scsiio.cdb[0]) {
	case SYNCHRONIZE_CACHE:
	case SYNCHRONIZE_CACHE_16:
		/*
		 * The upper level CTL code will filter out any CDBs with
		 * the immediate bit set and return the proper error.
		 *
		 * We don't really need to worry about what LBA range the
		 * user asked to be synced out.  When they issue a sync
		 * cache command, we'll sync out the whole thing.
		 */
		mtx_lock(&be_lun->lock);
		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
				   links);
		mtx_unlock(&be_lun->lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
		break;
	case START_STOP_UNIT: {
		struct scsi_start_stop_unit *cdb;

		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;

		if (cdb->how & SSS_START)
			retval = ctl_start_lun(ctl_be_lun);
		else {
			retval = ctl_stop_lun(ctl_be_lun);
			/*
			 * XXX KDM Copan-specific offline behavior.
			 * Figure out a reasonable way to port this?
			 */
#ifdef NEEDTOPORT
			if ((retval == 0)
			 && (cdb->byte2 & SSS_ONOFFLINE))
				retval = ctl_lun_offline(ctl_be_lun);
#endif
		}

		/*
		 * In general, the above routines should not fail.  They
		 * just set state for the LUN.  So we've got something
		 * pretty wrong here if we can't start or stop the LUN.
		 */
		if (retval != 0) {
			ctl_set_internal_failure(&io->scsiio,
						 /*sks_valid*/ 1,
						 /*retry_count*/ 0xf051);
			retval = CTL_RETVAL_COMPLETE;
		} else {
			ctl_set_success(&io->scsiio);
		}
		ctl_config_write_done(io);
		break;
	}
	default:
		ctl_set_invalid_opcode(&io->scsiio);
		ctl_config_write_done(io);
		retval = CTL_RETVAL_COMPLETE;
		break;
	}

	return (retval);
}

static int
ctl_be_block_config_read(union ctl_io *io)
{
	/* We currently have no configuration-type reads to handle. */
	return (0);
}

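/*
 * Emit backend-specific LUN information as an XML fragment.  Example of
 * the output (added for exposition; the values shown are hypothetical):
 *
 *	<num_threads>14</num_threads><file>/dev/zvol/tank/vol0</file>
 *
 * The backing path is escaped via ctl_sbuf_printf_esc(); processor-type
 * LUNs have no backing path, so the <file> element is omitted for them.
 */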
static int
ctl_be_block_lun_info(void *be_lun, struct sbuf *sb)
{
	struct ctl_be_block_lun *lun;
	int retval;

	lun = (struct ctl_be_block_lun *)be_lun;
	retval = 0;

	retval = sbuf_printf(sb, "<num_threads>");

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "%d", lun->num_threads);

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "</num_threads>");

	/*
	 * For processor devices, we don't have a path variable.
	 */
	if ((retval != 0)
	 || (lun->dev_path == NULL))
		goto bailout;

	retval = sbuf_printf(sb, "<file>");

	if (retval != 0)
		goto bailout;

	retval = ctl_sbuf_printf_esc(sb, lun->dev_path);

	if (retval != 0)
		goto bailout;

	retval = sbuf_printf(sb, "</file>\n");

bailout:
	return (retval);
}

int
ctl_be_block_init(void)
{
	struct ctl_be_block_softc *softc;
	int retval;

	softc = &backend_block_softc;
	retval = 0;

	mtx_init(&softc->lock, "ctlblk", NULL, MTX_DEF);
	STAILQ_INIT(&softc->beio_free_queue);
	STAILQ_INIT(&softc->disk_list);
	STAILQ_INIT(&softc->lun_list);
	/* Seed the free pool with 200 beio structures up front. */
	ctl_grow_beio(softc, 200);

	return (retval);
}