geom_io.c revision 110736
/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/geom/geom_io.c 110736 2003-02-11 22:30:26Z phk $
 */

#include <sys/param.h>
#include <sys/stdint.h>
#ifndef _KERNEL
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <err.h>
#include <sched.h>
#else
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#endif

#include <sys/errno.h>
#include <geom/geom.h>
#include <geom/geom_int.h>
#include <geom/geom_stats.h>

static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
static struct g_bioq g_bio_run_task;
static struct g_bioq g_bio_idle;

/*
 * Bumped by g_io_deliver() when a request fails with ENOMEM; makes
 * g_io_schedule_down() ease off so the system can recover.
 */
static u_int pace;

#include <machine/atomic.h>

static void
g_bioq_lock(struct g_bioq *bq)
{

	mtx_lock(&bq->bio_queue_lock);
}

static void
g_bioq_unlock(struct g_bioq *bq)
{

	mtx_unlock(&bq->bio_queue_lock);
}

#if 0
static void
g_bioq_destroy(struct g_bioq *bq)
{

	mtx_destroy(&bq->bio_queue_lock);
}
#endif

static void
g_bioq_init(struct g_bioq *bq)
{

	TAILQ_INIT(&bq->bio_queue);
	mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF);
}

/*
 * Remove and return the first bio on the queue, or NULL if it is empty.
 * The caller must hold the queue lock.
 */
static struct bio *
g_bioq_first(struct g_bioq *bq)
{
	struct bio *bp;

	bp = TAILQ_FIRST(&bq->bio_queue);
	if (bp != NULL) {
		TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue);
		bq->bio_queue_length--;
	}
	return (bp);
}

static void
g_bioq_enqueue_tail(struct bio *bp, struct g_bioq *rq)
{

	g_bioq_lock(rq);
	TAILQ_INSERT_TAIL(&rq->bio_queue, bp, bio_queue);
	rq->bio_queue_length++;
	g_bioq_unlock(rq);
}

/*
 * Allocate a bio, preferably by recycling one from the idle queue.
 * The fallback allocation uses M_NOWAIT, so this can return NULL.
 */
struct bio *
g_new_bio(void)
{
	struct bio *bp;

	g_bioq_lock(&g_bio_idle);
	bp = g_bioq_first(&g_bio_idle);
	g_bioq_unlock(&g_bio_idle);
	if (bp == NULL)
		bp = g_malloc(sizeof *bp, M_NOWAIT | M_ZERO);
	/* g_trace(G_T_BIO, "g_new_bio() = %p", bp); */
	return (bp);
}

void
g_destroy_bio(struct bio *bp)
{

	/* g_trace(G_T_BIO, "g_destroy_bio(%p)", bp); */
	bzero(bp, sizeof *bp);
	g_bioq_enqueue_tail(bp, &g_bio_idle);
}

struct bio *
g_clone_bio(struct bio *bp)
{
	struct bio *bp2;

	bp2 = g_new_bio();
	if (bp2 != NULL) {
		bp2->bio_parent = bp;
		bp2->bio_cmd = bp->bio_cmd;
		bp2->bio_length = bp->bio_length;
		bp2->bio_offset = bp->bio_offset;
		bp2->bio_data = bp->bio_data;
		bp2->bio_attribute = bp->bio_attribute;
		bp->bio_children++;
	}
	/* g_trace(G_T_BIO, "g_clone_bio(%p) = %p", bp, bp2); */
	return (bp2);
}
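
#if 0
/*
 * Illustrative sketch, not compiled: how a slicing class's start method
 * might use g_clone_bio() to pass a translated copy of a request one
 * level down.  "myslice_start" and the "sc_offset" softc field are
 * hypothetical; g_std_done() is the stock completion handler which
 * propagates the child's result back to the parent bio.
 */
static void
myslice_start(struct bio *bp)
{
	struct myslice_softc *sc = bp->bio_to->geom->softc;
	struct bio *bp2;

	bp2 = g_clone_bio(bp);
	if (bp2 == NULL) {
		g_io_deliver(bp, ENOMEM);	/* g_new_bio() failed */
		return;
	}
	bp2->bio_done = g_std_done;
	bp2->bio_offset += sc->sc_offset;	/* translate into the slice */
	g_io_request(bp2, LIST_FIRST(&bp->bio_to->geom->consumer));
}
#endif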

void
g_io_init(void)
{

	g_bioq_init(&g_bio_run_down);
	g_bioq_init(&g_bio_run_up);
	g_bioq_init(&g_bio_run_task);
	g_bioq_init(&g_bio_idle);
}

int
g_io_setattr(const char *attr, struct g_consumer *cp, int len, void *ptr)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_setattr(%s)", attr);
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);	/* g_new_bio() can fail */
	bp->bio_cmd = BIO_SETATTR;
	bp->bio_done = NULL;
	bp->bio_attribute = attr;
	bp->bio_length = len;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "gsetattr");
	g_destroy_bio(bp);
	return (error);
}

int
g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr)
{
	struct bio *bp;
	int error;

	g_trace(G_T_BIO, "bio_getattr(%s)", attr);
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);	/* g_new_bio() can fail */
	bp->bio_cmd = BIO_GETATTR;
	bp->bio_done = NULL;
	bp->bio_attribute = attr;
	bp->bio_length = *len;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "ggetattr");
	*len = bp->bio_completed;
	g_destroy_bio(bp);
	return (error);
}
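
#if 0
/*
 * Illustrative sketch, not compiled: querying a provider attribute
 * through an attached and opened consumer.  "GEOM::fwsectors" is one
 * of the conventional "class::attribute" names; the surrounding code
 * fragment is made up.
 */
	u_int fwsectors;
	int len, error;

	len = sizeof(fwsectors);
	error = g_io_getattr("GEOM::fwsectors", cp, &len, &fwsectors);
	if (error == 0)
		printf("firmware sectors: %u\n", fwsectors);
#endif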

static int
g_io_check(struct bio *bp)
{
	struct g_consumer *cp;
	struct g_provider *pp;

	cp = bp->bio_from;
	pp = bp->bio_to;

	/* Fail if access counters don't allow the operation. */
	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_GETATTR:
		if (cp->acr == 0)
			return (EPERM);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	case BIO_SETATTR:
		if (cp->acw == 0)
			return (EPERM);
		break;
	default:
		return (EPERM);
	}
	/* If the provider is marked for error, don't disturb it. */
	if (pp->error)
		return (pp->error);

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/* Reject I/O which does not start on a sector boundary. */
		if (bp->bio_offset % pp->sectorsize)
			return (EINVAL);
		/* Reject I/O which is not an integral number of sectors. */
		if (bp->bio_length % pp->sectorsize)
			return (EINVAL);
		/* Reject requests which start past the end of the media. */
		if (bp->bio_offset > pp->mediasize)
			return (EIO);
		break;
	default:
		break;
	}
	return (0);
}

void
g_io_request(struct bio *bp, struct g_consumer *cp)
{
	struct g_provider *pp;
	struct bintime bt;

	KASSERT(cp != NULL, ("NULL cp in g_io_request"));
	KASSERT(bp != NULL, ("NULL bp in g_io_request"));
	KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request"));
	pp = cp->provider;
	KASSERT(pp != NULL, ("consumer not attached in g_io_request"));

	bp->bio_from = cp;
	bp->bio_to = pp;
	bp->bio_error = 0;
	bp->bio_completed = 0;

	if (g_collectstats) {
		binuptime(&bt);
		bp->bio_t0 = bt;
		if (cp->stat->nop == cp->stat->nend)
			cp->stat->wentbusy = bt; /* Consumer is idle */
		if (pp->stat->nop == pp->stat->nend)
			pp->stat->wentbusy = bt; /* Provider is idle */
	}
	cp->stat->nop++;
	pp->stat->nop++;

	/* Pass it on down. */
	g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d",
	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd);
	g_bioq_enqueue_tail(bp, &g_bio_run_down);
	wakeup(&g_wait_down);
}
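
#if 0
/*
 * Illustrative sketch, not compiled: issuing an asynchronous read.
 * "my_done" and "my_read_async" are made up.  When bio_done is set,
 * biodone() calls it from the g_up thread instead of waking a
 * biowait() sleeper, so no context blocks on the request.
 */
static void
my_done(struct bio *bp)
{

	if (bp->bio_error)
		printf("read failed: %d\n", bp->bio_error);
	g_destroy_bio(bp);
}

static void
my_read_async(struct g_consumer *cp, void *buf, off_t offset, off_t length)
{
	struct bio *bp;

	bp = g_new_bio();
	if (bp == NULL)
		return;				/* allocation is M_NOWAIT */
	bp->bio_cmd = BIO_READ;
	bp->bio_done = my_done;			/* runs in the g_up thread */
	bp->bio_offset = offset;
	bp->bio_length = length;
	bp->bio_data = buf;
	g_io_request(bp, cp);
}
#endif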

void
g_io_deliver(struct bio *bp, int error)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	struct bintime t1, dt;
	int idx;

	KASSERT(bp != NULL, ("NULL bp in g_io_deliver"));
	cp = bp->bio_from;
	pp = bp->bio_to;
	KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver"));
	KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver"));
	KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver"));

	g_trace(G_T_BIO,
"g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd",
	    bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error,
	    (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length);

	if (g_collectstats) {
		switch (bp->bio_cmd) {
		case BIO_READ:    idx =  G_STAT_IDX_READ;    break;
		case BIO_WRITE:   idx =  G_STAT_IDX_WRITE;   break;
		case BIO_DELETE:  idx =  G_STAT_IDX_DELETE;  break;
		case BIO_GETATTR: idx =  -1; break;
		case BIO_SETATTR: idx =  -1; break;
		default:
			panic("unknown bio_cmd in g_io_deliver");
			break;
		}
		binuptime(&t1);
		/* Raise the "inconsistent" flag for userland. */
		atomic_add_acq_int(&cp->stat->seq0, 1);
		atomic_add_acq_int(&pp->stat->seq0, 1);
		if (idx >= 0) {
			/* Account the service time. */
			dt = t1;
			bintime_sub(&dt, &bp->bio_t0);
			bintime_add(&cp->stat->ops[idx].dt, &dt);
			bintime_add(&pp->stat->ops[idx].dt, &dt);
			/* ... and the metrics. */
			pp->stat->ops[idx].nbyte += bp->bio_completed;
			cp->stat->ops[idx].nbyte += bp->bio_completed;
			pp->stat->ops[idx].nop++;
			cp->stat->ops[idx].nop++;
			/* ... and any errors. */
			if (error == ENOMEM) {
				cp->stat->ops[idx].nmem++;
				pp->stat->ops[idx].nmem++;
			} else if (error != 0) {
				cp->stat->ops[idx].nerr++;
				pp->stat->ops[idx].nerr++;
			}
		}
		/* Account for busy time on the consumer. */
		dt = t1;
		bintime_sub(&dt, &cp->stat->wentbusy);
		bintime_add(&cp->stat->bt, &dt);
		cp->stat->wentbusy = t1;
		/* Account for busy time on the provider. */
		dt = t1;
		bintime_sub(&dt, &pp->stat->wentbusy);
		bintime_add(&pp->stat->bt, &dt);
		pp->stat->wentbusy = t1;
		/* Mark the structures as consistent again. */
		atomic_add_acq_int(&cp->stat->seq1, 1);
		atomic_add_acq_int(&pp->stat->seq1, 1);
	}
	cp->stat->nend++;
	pp->stat->nend++;

	if (error == ENOMEM) {
		/* Retry the request and ask g_down to slow its pace. */
		printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name);
		g_io_request(bp, cp);
		pace++;
		return;
	}
	bp->bio_error = error;
	g_bioq_enqueue_tail(bp, &g_bio_run_up);
	wakeup(&g_wait_up);
}
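
#if 0
/*
 * Illustrative sketch, not compiled: a leaf class's start method
 * completing requests itself with g_io_deliver().  All "mydisk" names
 * and the memory-backed softc are made up.
 */
static void
mydisk_start(struct bio *bp)
{
	struct mydisk_softc *sc = bp->bio_to->geom->softc;

	switch (bp->bio_cmd) {
	case BIO_READ:
		bcopy(sc->sc_mem + bp->bio_offset, bp->bio_data,
		    bp->bio_length);
		bp->bio_completed = bp->bio_length;
		g_io_deliver(bp, 0);
		break;
	case BIO_WRITE:
		bcopy(bp->bio_data, sc->sc_mem + bp->bio_offset,
		    bp->bio_length);
		bp->bio_completed = bp->bio_length;
		g_io_deliver(bp, 0);
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
}
#endif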

void
g_io_schedule_down(struct thread *tp __unused)
{
	struct bio *bp;
	off_t excess;
	int error;
	struct mtx mymutex;

	/*
	 * Holding this otherwise unused mutex across the start() call
	 * lets WITNESS catch methods which might sleep.
	 */
	bzero(&mymutex, sizeof mymutex);
	mtx_init(&mymutex, "g_xdown", NULL, MTX_DEF);

	for (;;) {
		g_bioq_lock(&g_bio_run_down);
		bp = g_bioq_first(&g_bio_run_down);
		if (bp == NULL) {
			msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock,
			    PRIBIO | PDROP, "g_down", hz/10);
			continue;
		}
		g_bioq_unlock(&g_bio_run_down);
		error = g_io_check(bp);
		if (error) {
			g_io_deliver(bp, error);
			continue;
		}
		switch (bp->bio_cmd) {
		case BIO_READ:
		case BIO_WRITE:
		case BIO_DELETE:
			/* Truncate requests to the end of the provider's media. */
			excess = bp->bio_offset + bp->bio_length;
			if (excess > bp->bio_to->mediasize) {
				excess -= bp->bio_to->mediasize;
				bp->bio_length -= excess;
			}
			/* Deliver zero length transfers right here. */
			if (bp->bio_length == 0) {
				g_io_deliver(bp, 0);
				continue;
			}
			break;
		default:
			break;
		}
		mtx_lock(&mymutex);
		bp->bio_to->geom->start(bp);
		mtx_unlock(&mymutex);
		if (pace) {
			/* g_io_deliver() saw ENOMEM; ease off briefly. */
			pace--;
			break;
		}
	}
}

void
bio_taskqueue(struct bio *bp, bio_task_t *func, void *arg)
{

	bp->bio_task = func;
	bp->bio_task_arg = arg;
	/*
	 * The taskqueue is actually just a second queue off the "up"
	 * queue, so we use the same lock.
	 */
	g_bioq_lock(&g_bio_run_up);
	TAILQ_INSERT_TAIL(&g_bio_run_task.bio_queue, bp, bio_queue);
	g_bio_run_task.bio_queue_length++;
	wakeup(&g_wait_up);
	g_bioq_unlock(&g_bio_run_up);
}
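
#if 0
/*
 * Illustrative sketch, not compiled: deferring completion work to the
 * g_up thread.  "my_task" is made up; g_io_schedule_up() calls it as
 * func(bp, arg), ahead of ordinary biodone() processing.
 */
static void
my_task(struct bio *bp, void *arg)
{

	/* Runs in the g_up thread. */
	g_destroy_bio(bp);
}

	/* ... somewhere a bio is handed over: */
	bio_taskqueue(bp, my_task, NULL);
#endif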

void
g_io_schedule_up(struct thread *tp __unused)
{
	struct bio *bp;
	struct mtx mymutex;

	/* As in g_io_schedule_down(), this mutex catches sleeping methods. */
	bzero(&mymutex, sizeof mymutex);
	mtx_init(&mymutex, "g_xup", NULL, MTX_DEF);
	for (;;) {
		g_bioq_lock(&g_bio_run_up);
		bp = g_bioq_first(&g_bio_run_task);
		if (bp != NULL) {
			g_bioq_unlock(&g_bio_run_up);
			mtx_lock(&mymutex);
			bp->bio_task(bp, bp->bio_task_arg);
			mtx_unlock(&mymutex);
			continue;
		}
		bp = g_bioq_first(&g_bio_run_up);
		if (bp != NULL) {
			g_bioq_unlock(&g_bio_run_up);
			mtx_lock(&mymutex);
			biodone(bp);
			mtx_unlock(&mymutex);
			continue;
		}
		msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock,
		    PRIBIO | PDROP, "g_up", hz/10);
	}
}

void *
g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error)
{
	struct bio *bp;
	void *ptr;
	int errorc;

	bp = g_new_bio();
	if (bp == NULL) {
		if (error != NULL)
			*error = ENOMEM;	/* g_new_bio() can fail */
		return (NULL);
	}
	bp->bio_cmd = BIO_READ;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = length;
	ptr = g_malloc(length, 0);
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	errorc = biowait(bp, "gread");
	if (error != NULL)
		*error = errorc;
	g_destroy_bio(bp);
	if (errorc) {
		g_free(ptr);
		ptr = NULL;
	}
	return (ptr);
}
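
#if 0
/*
 * Illustrative sketch, not compiled: reading the last sector of a
 * provider, e.g. while tasting for an on-disk label.  The buffer comes
 * from g_malloc() and must be released with g_free().
 */
	u_char *buf;
	int error;

	buf = g_read_data(cp, pp->mediasize - pp->sectorsize,
	    pp->sectorsize, &error);
	if (buf != NULL) {
		/* ... examine the label ... */
		g_free(buf);
	}
#endif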

int
g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length)
{
	struct bio *bp;
	int error;

	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);	/* g_new_bio() can fail */
	bp->bio_cmd = BIO_WRITE;
	bp->bio_done = NULL;
	bp->bio_offset = offset;
	bp->bio_length = length;
	bp->bio_data = ptr;
	g_io_request(bp, cp);
	error = biowait(bp, "gwrite");
	g_destroy_bio(bp);
	return (error);
}
527
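#if 0
/*
 * Illustrative sketch, not compiled: writing a sector-aligned buffer
 * back to the provider.  Transfers must start on a sector boundary and
 * be an integral number of sectors, or g_io_check() returns EINVAL.
 */
	error = g_write_data(cp, pp->mediasize - pp->sectorsize,
	    buf, pp->sectorsize);
#endif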