block_if.c revision 276429
/*-
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 276429 2014-12-30 22:22:46Z neel $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 276429 2014-12-30 22:22:46Z neel $");

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

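/*
 * Size of the per-context request pool.  Note that blockif_queuesz() below
 * advertises BLOCKIF_MAXREQ - 1 entries to callers, so the pool holds one
 * element more than the queue depth exposed to the device models.
 */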
#define BLOCKIF_MAXREQ	33

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH
};

enum blockstat {
	BST_FREE,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

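/*
 * Request element lifecycle: an element starts on the free queue in
 * BST_FREE, moves to BST_PEND on the pending queue when a request is
 * enqueued, to BST_BUSY on the busy queue when the worker thread picks it
 * up, is marked BST_DONE by blockif_proc() once the I/O has been performed,
 * and is finally returned to the free queue by blockif_complete().
 */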
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	pthread_t		bc_btid;
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	int			bc_closing;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	u_int			bc_req_count;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
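/*
 * The three queues and bc_req_count are protected by bc_mtx.  bc_cond is
 * signalled when a request is enqueued and when the context is being
 * closed, and is waited on by the single worker thread (bc_btid) once the
 * pending queue has been drained.
 */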

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

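/*
 * Elements of this type are stack-allocated by blockif_cancel() and pushed
 * onto a lock-free singly-linked list headed by blockif_bse_head; the
 * SIGCONT handler below pops them off and wakes each waiter.
 */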
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be;

	assert(bc->bc_req_count < BLOCKIF_MAXREQ);

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);

	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_status = BST_PEND;
	be->be_req = breq;
	be->be_op = op;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);

	bc->bc_req_count++;

	return (0);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	if (bc->bc_req_count == 0)
		return (ENOENT);

	be = TAILQ_FIRST(&bc->bc_pendq);
	assert(be != NULL);
	assert(be->be_status == BST_PEND);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = bc->bc_btid;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);

	*bep = be;

	return (0);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	assert(be->be_status == BST_DONE);

	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);

	bc->bc_req_count--;
}

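/*
 * Execute a single request with preadv(2)/pwritev(2) and report completion
 * through the caller-supplied callback.  A write on a read-only context
 * fails with EROFS.  Note that BOP_FLUSH is a no-op in this revision: it
 * completes immediately with no error and does not force data to stable
 * storage.
 */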
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_req *br;
	int err;

	br = be->be_req;
	err = 0;

	switch (be->be_op) {
	case BOP_READ:
		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			   br->br_offset) < 0)
			err = errno;
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly)
			err = EROFS;
		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			     br->br_offset) < 0)
			err = errno;
		break;
	case BOP_FLUSH:
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

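/*
 * Worker thread: drain the pending queue, dropping bc_mtx around the
 * (possibly blocking) I/O so that new requests can be enqueued and
 * cancelled in the meantime, then sleep on bc_cond until more work arrives
 * or the context is closed.
 */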
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;

	bc = arg;

	for (;;) {
		pthread_mutex_lock(&bc->bc_mtx);
		while (!blockif_dequeue(bc, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
		pthread_mutex_unlock(&bc->bc_mtx);

		/*
		 * Check the ctxt status here to see if an exit was requested
		 */
		if (bc->bc_closing)
			pthread_exit(NULL);
	}

	/* Not reached */
	return (NULL);
}

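/*
 * SIGCONT is used to interrupt the worker thread when a request is being
 * cancelled: blockif_cancel() pushes a blockif_sig_elem onto the global
 * list and pthread_kill()s the worker, and this handler (dispatched via
 * the mevent signal watcher registered in blockif_init()) walks the list
 * and wakes every waiter.
 */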
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

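/*
 * Open a backing store and start its worker thread.  A minimal usage
 * sketch, assuming the struct blockif_req layout in block_if.h (where
 * br_iov is an embedded iovec array); "buf" and "my_done" are
 * hypothetical names:
 *
 *	struct blockif_ctxt *bc;
 *	struct blockif_req br;
 *
 *	bc = blockif_open("/some/disk.img,nocache,ro", "vtbd0");
 *	br.br_iov[0].iov_base = buf;
 *	br.br_iov[0].iov_len = blockif_sectsz(bc);
 *	br.br_iovcnt = 1;
 *	br.br_offset = 0;
 *	br.br_callback = my_done;	(invoked from the worker thread)
 *	blockif_read(bc, &br);
 */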
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char *nopt, *xopts;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	off_t size;
	int extra, fd, i, sectsz;
	int nocache, sync, ro;

	pthread_once(&blockif_once, blockif_init);

	nocache = 0;
	sync = 0;
	ro = 0;

	/*
	 * The first element in the optstring is always a pathname.
	 * Optional elements follow.
	 */
	nopt = strdup(optstr);
	for (xopts = strtok(nopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {
		if (!strcmp(xopts, "nocache"))
			nocache = 1;
		else if (!strcmp(xopts, "sync"))
			sync = 1;
		else if (!strcmp(xopts, "ro"))
			ro = 1;
	}

	extra = 0;
	if (nocache)
		extra |= O_DIRECT;
	if (sync)
		extra |= O_SYNC;

	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The r/w open failed: retry the open read-only */
		fd = open(nopt, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		perror("Could not open backing file");
		return (NULL);
	}

	if (fstat(fd, &sbuf) < 0) {
		perror("Could not stat backing file");
		close(fd);
		return (NULL);
	}

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			close(fd);
			return (NULL);
		}
		assert(size != 0);
		assert(sectsz != 0);
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		close(fd);
		return (NULL);
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	bc->bc_req_count = 0;
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);

	snprintf(tname, sizeof(tname), "blk-%s", ident);
	pthread_set_name_np(bc->bc_btid, tname);

	return (bc);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		blockif_enqueue(bc, breq, op);
		pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

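/*
 * Attempt to cancel a previously submitted request.  A request that is
 * still on the pending queue is removed and its element freed; the caller
 * gets 0 and the callback will never run.  A request the worker thread has
 * already picked up cannot be revoked: the worker is signalled with SIGCONT
 * until the element leaves BST_BUSY, and EBUSY is returned since the
 * callback may or may not have run by then.
 */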
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
		be->be_status = BST_FREE;
		be->be_req = NULL;
		TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
		bc->bc_req_count--;
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int err;

	err = 0;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	bc->bc_closing = 1;
	pthread_cond_signal(&bc->bc_cond);
	pthread_join(bc->bc_btid, &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block device.  Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
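/*
 * Worked example (illustrative): a 4 GiB image has 8388608 512-byte
 * sectors, which is below the 65536*16*63 threshold.  The 17 sectors/track
 * pass yields more than 16 heads, the 31 sectors/track pass still exceeds
 * heads * 1024 cylinders, and the final 63 sectors/track pass gives
 * hcyl = 133152, i.e. C/H/S = 8322/16/63.
 */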
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}