/* geom_io.c revision 112027 */
1/*-
2 * Copyright (c) 2002 Poul-Henning Kamp
3 * Copyright (c) 2002 Networks Associates Technology, Inc.
4 * All rights reserved.
5 *
6 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
7 * and NAI Labs, the Security Research Division of Network Associates, Inc.
8 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
9 * DARPA CHATS research program.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. The names of the authors may not be used to endorse or promote
20 *    products derived from this software without specific prior written
21 *    permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * $FreeBSD: head/sys/geom/geom_io.c 112027 2003-03-09 09:59:48Z phk $
36 */
37
38
39#include <sys/param.h>
40#include <sys/stdint.h>
41#ifndef _KERNEL
42#include <stdio.h>
43#include <string.h>
44#include <stdlib.h>
45#include <signal.h>
46#include <err.h>
47#include <sched.h>
48#else
49#include <sys/systm.h>
50#include <sys/kernel.h>
51#include <sys/malloc.h>
52#include <sys/bio.h>
53#endif
54
55#include <sys/errno.h>
56#include <geom/geom.h>
57#include <geom/geom_int.h>
58#include <geom/geom_stats.h>
59
60static struct g_bioq g_bio_run_down;
61static struct g_bioq g_bio_run_up;
62static struct g_bioq g_bio_run_task;
63static struct g_bioq g_bio_idle;
64
65static u_int pace;
66
67#include <machine/atomic.h>
68
/* Acquire the mutex protecting a bio queue. */
static void
g_bioq_lock(struct g_bioq *bq)
{

	mtx_lock(&bq->bio_queue_lock);
}
75
/* Release the mutex protecting a bio queue. */
static void
g_bioq_unlock(struct g_bioq *bq)
{

	mtx_unlock(&bq->bio_queue_lock);
}
82
#if 0
/* Tear down a bio queue's mutex.  Dead code: compiled out via #if 0. */
static void
g_bioq_destroy(struct g_bioq *bq)
{

	mtx_destroy(&bq->bio_queue_lock);
}
#endif
91
/* Initialize a bio queue: empty tail queue plus its protecting mutex. */
static void
g_bioq_init(struct g_bioq *bq)
{

	TAILQ_INIT(&bq->bio_queue);
	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
}
99
100static struct bio *
101g_bioq_first(struct g_bioq *bq)
102{
103	struct bio *bp;
104
105	bp = TAILQ_FIRST(&bq->bio_queue);
106	if (bp != NULL) {
107		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
108		bq->bio_queue_length--;
109	}
110	return (bp);
111}
112
113static void
114g_bioq_enqueue_tail(struct bio *bp, struct g_bioq *rq)
115{
116
117	g_bioq_lock(rq);
118	TAILQ_INSERT_TAIL(&rq->bio_queue, bp, bio_queue);
119	rq->bio_queue_length++;
120	g_bioq_unlock(rq);
121}
122
123struct bio *
124g_new_bio(void)
125{
126	struct bio *bp;
127
128	g_bioq_lock(&g_bio_idle);
129	bp = g_bioq_first(&g_bio_idle);
130	g_bioq_unlock(&g_bio_idle);
131	if (bp == NULL)
132		bp = g_malloc(sizeof *bp, M_NOWAIT | M_ZERO);
133	/* g_trace(G_T_BIO, "g_new_bio() = %p", bp); */
134	return (bp);
135}
136
137void
138g_destroy_bio(struct bio *bp)
139{
140
141	/* g_trace(G_T_BIO, "g_destroy_bio(%p)", bp); */
142	bzero(bp, sizeof *bp);
143	g_bioq_enqueue_tail(bp, &g_bio_idle);
144}
145
146struct bio *
147g_clone_bio(struct bio *bp)
148{
149	struct bio *bp2;
150
151	bp2 = g_new_bio();
152	if (bp2 != NULL) {
153		bp2->bio_parent = bp;
154		bp2->bio_cmd = bp->bio_cmd;
155		bp2->bio_length = bp->bio_length;
156		bp2->bio_offset = bp->bio_offset;
157		bp2->bio_data = bp->bio_data;
158		bp2->bio_attribute = bp->bio_attribute;
159		bp->bio_children++;
160	}
161	/* g_trace(G_T_BIO, "g_clone_bio(%p) = %p", bp, bp2); */
162	return(bp2);
163}
164
165void
166g_io_init()
167{
168
169	g_bioq_init(&g_bio_run_down);
170	g_bioq_init(&g_bio_run_up);
171	g_bioq_init(&g_bio_run_task);
172	g_bioq_init(&g_bio_idle);
173}
174
175int
176g_io_setattr(const char *attr, struct g_consumer *cp, int len, void *ptr)
177{
178	struct bio *bp;
179	int error;
180
181	g_trace(G_T_BIO, "bio_setattr(%s)", attr);
182	bp = g_new_bio();
183	bp->bio_cmd = BIO_SETATTR;
184	bp->bio_done = NULL;
185	bp->bio_attribute = attr;
186	bp->bio_length = len;
187	bp->bio_data = ptr;
188	g_io_request(bp, cp);
189	error = biowait(bp, "gsetattr");
190	g_destroy_bio(bp);
191	return (error);
192}
193
194
195int
196g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
197{
198	struct bio *bp;
199	int error;
200
201	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
202	bp = g_new_bio();
203	bp->bio_cmd = BIO_GETATTR;
204	bp->bio_done = NULL;
205	bp->bio_attribute = attr;
206	bp->bio_length = *len;
207	bp->bio_data = ptr;
208	g_io_request(bp, cp);
209	error = biowait(bp, "ggetattr");
210	*len = bp->bio_completed;
211	g_destroy_bio(bp);
212	return (error);
213}
214
215static int
216g_io_check(struct bio *bp)
217{
218	struct g_consumer *cp;
219	struct g_provider *pp;
220
221	cp = bp->bio_from;
222	pp = bp->bio_to;
223
224	/* Fail if access counters dont allow the operation */
225	switch(bp->bio_cmd) {
226	case BIO_READ:
227	case BIO_GETATTR:
228		if (cp->acr == 0)
229			return (EPERM);
230		break;
231	case BIO_WRITE:
232	case BIO_DELETE:
233	case BIO_SETATTR:
234		if (cp->acw == 0)
235			return (EPERM);
236		break;
237	default:
238		return (EPERM);
239	}
240	/* if provider is marked for error, don't disturb. */
241	if (pp->error)
242		return (pp->error);
243
244	switch(bp->bio_cmd) {
245	case BIO_READ:
246	case BIO_WRITE:
247	case BIO_DELETE:
248		/* Reject I/O not on sector boundary */
249		if (bp->bio_offset % pp->sectorsize)
250			return (EINVAL);
251		/* Reject I/O not integral sector long */
252		if (bp->bio_length % pp->sectorsize)
253			return (EINVAL);
254		/* Reject requests past the end of media. */
255		if (bp->bio_offset > pp->mediasize)
256			return (EIO);
257		break;
258	default:
259		break;
260	}
261	return (0);
262}
263
264void
265g_io_request(struct bio *bp, struct g_consumer *cp)
266{
267	struct g_provider *pp;
268	struct bintime bt;
269
270	pp = cp->provider;
271	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
272	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
273	KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request"));
274	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));
275
276	bp->bio_from = cp;
277	bp->bio_to = pp;
278	bp->bio_error = 0;
279	bp->bio_completed = 0;
280
281	if (g_collectstats) {
282		binuptime(&bt);
283		bp->bio_t0 = bt;
284		if (cp->nstart == cp->nend)
285			cp->stat->wentbusy = bt; /* Consumer is idle */
286		if (pp->nstart == pp->nend)
287			pp->stat->wentbusy = bt; /* Provider is idle */
288		cp->stat->nop++;
289		pp->stat->nop++;
290	}
291	cp->nstart++;
292	pp->nstart++;
293
294	/* Pass it on down. */
295	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
296	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
297	g_bioq_enqueue_tail(bp, &g_bio_run_down);
298	wakeup(&g_wait_down);
299}
300
301void
302g_io_deliver(struct bio *bp, int error)
303{
304	struct g_consumer *cp;
305	struct g_provider *pp;
306	struct bintime t1, dt;
307	int idx;
308
309	cp = bp->bio_from;
310	pp = bp->bio_to;
311	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
312	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
313	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
314	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));
315
316	g_trace(G_T_BIO,
317"g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
318	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
319	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);
320
321	if (g_collectstats) {
322		switch (bp->bio_cmd) {
323		case BIO_READ:    idx =  G_STAT_IDX_READ;    break;
324		case BIO_WRITE:   idx =  G_STAT_IDX_WRITE;   break;
325		case BIO_DELETE:  idx =  G_STAT_IDX_DELETE;  break;
326		case BIO_GETATTR: idx =  -1; break;
327		case BIO_SETATTR: idx =  -1; break;
328		default:
329			panic("unknown bio_cmd in g_io_deliver");
330			break;
331		}
332		binuptime(&t1);
333		/* Raise the "inconsistent" flag for userland */
334		atomic_add_acq_int(&cp->stat->seq0, 1);
335		atomic_add_acq_int(&pp->stat->seq0, 1);
336		if (idx >= 0) {
337			/* Account the service time */
338			dt = t1;
339			bintime_sub(&dt, &bp->bio_t0);
340			bintime_add(&cp->stat->ops[idx].dt, &dt);
341			bintime_add(&pp->stat->ops[idx].dt, &dt);
342			/* ... and the metrics */
343			pp->stat->ops[idx].nbyte += bp->bio_completed;
344			cp->stat->ops[idx].nbyte += bp->bio_completed;
345			pp->stat->ops[idx].nop++;
346			cp->stat->ops[idx].nop++;
347			/* ... and any errors */
348			if (error == ENOMEM) {
349				cp->stat->ops[idx].nmem++;
350				pp->stat->ops[idx].nmem++;
351			} else if (error != 0) {
352				cp->stat->ops[idx].nerr++;
353				pp->stat->ops[idx].nerr++;
354			}
355		}
356		/* Account for busy time on the consumer */
357		dt = t1;
358		bintime_sub(&dt, &cp->stat->wentbusy);
359		bintime_add(&cp->stat->bt, &dt);
360		cp->stat->wentbusy = t1;
361		/* Account for busy time on the provider */
362		dt = t1;
363		bintime_sub(&dt, &pp->stat->wentbusy);
364		bintime_add(&pp->stat->bt, &dt);
365		pp->stat->wentbusy = t1;
366		/* Mark the structures as consistent again */
367		atomic_add_acq_int(&cp->stat->seq1, 1);
368		atomic_add_acq_int(&pp->stat->seq1, 1);
369		cp->stat->nend++;
370		pp->stat->nend++;
371	}
372	cp->nend++;
373	pp->nend++;
374
375	if (error == ENOMEM) {
376		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
377		g_io_request(bp, cp);
378		pace++;
379		return;
380	}
381	bp->bio_error = error;
382	g_bioq_enqueue_tail(bp, &g_bio_run_up);
383	wakeup(&g_wait_up);
384}
385
386void
387g_io_schedule_down(struct thread *tp __unused)
388{
389	struct bio *bp;
390	off_t excess;
391	int error;
392	struct mtx mymutex;
393
394	bzero(&mymutex, sizeof mymutex);
395	mtx_init(&mymutex, "g_xdown", MTX_DEF, 0);
396
397	for(;;) {
398		g_bioq_lock(&g_bio_run_down);
399		bp = g_bioq_first(&g_bio_run_down);
400		if (bp == NULL) {
401			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
402			    PRIBIO | PDROP, "g_down", hz/10);
403			continue;
404		}
405		g_bioq_unlock(&g_bio_run_down);
406		error = g_io_check(bp);
407		if (error) {
408			g_io_deliver(bp, error);
409			continue;
410		}
411		switch (bp->bio_cmd) {
412		case BIO_READ:
413		case BIO_WRITE:
414		case BIO_DELETE:
415			/* Truncate requests to the end of providers media. */
416			excess = bp->bio_offset + bp->bio_length;
417			if (excess > bp->bio_to->mediasize) {
418				excess -= bp->bio_to->mediasize;
419				bp->bio_length -= excess;
420			}
421			/* Deliver zero length transfers right here. */
422			if (bp->bio_length == 0) {
423				g_io_deliver(bp, 0);
424				continue;
425			}
426			break;
427		default:
428			break;
429		}
430		mtx_lock(&mymutex);
431		bp->bio_to->geom->start(bp);
432		mtx_unlock(&mymutex);
433		if (pace) {
434			pace--;
435			break;
436		}
437	}
438}
439
/*
 * Queue a bio for the g_up thread to run its task callback
 * (bp->bio_task(bp, arg)) instead of the normal biodone() path.
 * Task bios are serviced before completed bios in g_io_schedule_up().
 */
void
bio_taskqueue(struct bio *bp, bio_task_t *func, void *arg)
{
	bp->bio_task = func;
	bp->bio_task_arg = arg;
	/*
	 * The taskqueue is actually just a second queue off the "up"
	 * queue, so we use the same lock.
	 */
	g_bioq_lock(&g_bio_run_up);
	TAILQ_INSERT_TAIL(&g_bio_run_task.bio_queue, bp, bio_queue);
	g_bio_run_task.bio_queue_length++;
	/* Wake g_up; done while still holding the queue lock. */
	wakeup(&g_wait_up);
	g_bioq_unlock(&g_bio_run_up);
}
455
456
457void
458g_io_schedule_up(struct thread *tp __unused)
459{
460	struct bio *bp;
461	struct mtx mymutex;
462
463	bzero(&mymutex, sizeof mymutex);
464	mtx_init(&mymutex, "g_xup", MTX_DEF, 0);
465	for(;;) {
466		g_bioq_lock(&g_bio_run_up);
467		bp = g_bioq_first(&g_bio_run_task);
468		if (bp != NULL) {
469			g_bioq_unlock(&g_bio_run_up);
470			mtx_lock(&mymutex);
471			bp->bio_task(bp, bp->bio_task_arg);
472			mtx_unlock(&mymutex);
473			continue;
474		}
475		bp = g_bioq_first(&g_bio_run_up);
476		if (bp != NULL) {
477			g_bioq_unlock(&g_bio_run_up);
478			mtx_lock(&mymutex);
479			biodone(bp);
480			mtx_unlock(&mymutex);
481			continue;
482		}
483		msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
484		    PRIBIO | PDROP, "g_up", hz/10);
485	}
486}
487
488void *
489g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
490{
491	struct bio *bp;
492	void *ptr;
493	int errorc;
494
495	bp = g_new_bio();
496	bp->bio_cmd = BIO_READ;
497	bp->bio_done = NULL;
498	bp->bio_offset = offset;
499	bp->bio_length = length;
500	ptr = g_malloc(length, M_WAITOK);
501	bp->bio_data = ptr;
502	g_io_request(bp, cp);
503	errorc = biowait(bp, "gread");
504	if (error != NULL)
505		*error = errorc;
506	g_destroy_bio(bp);
507	if (errorc) {
508		g_free(ptr);
509		ptr = NULL;
510	}
511	return (ptr);
512}
513
514int
515g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
516{
517	struct bio *bp;
518	int error;
519
520	bp = g_new_bio();
521	bp->bio_cmd = BIO_WRITE;
522	bp->bio_done = NULL;
523	bp->bio_offset = offset;
524	bp->bio_length = length;
525	bp->bio_data = ptr;
526	g_io_request(bp, cp);
527	error = biowait(bp, "gwrite");
528	g_destroy_bio(bp);
529	return (error);
530}
531