/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/geom/bde/g_bde_work.c 115504 2003-05-31 19:08:23Z phk $
 *
 * This source file contains the state-engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *	Set up the necessary sector buffers and start those read operations
 *	which we can start at this time and put the item on the work-list.
 *   3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing
 *	and call the matching crypto function in g_bde_crypt.c and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
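
/*
 * Rough life-cycle of a single work packet, as implemented below (the
 * original bio is handed to g_bde_start1() from outside this file):
 *
 *	g_bde_start1()		g_bde_new_work()		state SETUP
 *	g_bde_start2()		sector buffers, first reads	state WAIT
 *	g_bde_worker()		crypto, writes scheduled	state FINISH
 *	g_bde_write_done()	contribute and delete work
 *
 * Read requests never reach FINISH; the worker completes and deletes them
 * once their payload and key sectors have left the IO state.
 */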

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael.h>
#include <crypto/sha2/sha2.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp);
static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "GBDE", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}
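
/*
 * Illustrative sketch only (not compiled): how the constructor/destructor
 * pair above is used elsewhere in this file.  Both calls are made with the
 * worklist mutex held; see g_bde_start1() and g_bde_worker() below.
 */
#if 0
	mtx_lock(&sc->worklist_mutex);
	wp = g_bde_new_work(sc);	/* SETUP, linked on sc->worklist */
	if (wp != NULL) {
		/* fill in bp/offset/data/length, then g_bde_start2(wp) */
	}
	/* ...later, still under the mutex, when the packet is done... */
	g_bde_delete_work(wp);		/* unlinked and freed */
	mtx_unlock(&sc->worklist_mutex);
#endif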

/*
 * Sector buffer allocation
 *
 * These two functions allocate and free back variable sized sector buffers
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}
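
/*
 * Illustrative sketch only (not compiled): the two ways the allocator
 * above is used in this file.  A non-zero length allocates the data
 * buffer as well (sp->malloc = 1); a zero length leaves sp->data NULL so
 * the caller can attach its own buffer, as g_bde_start2() does for
 * BIO_READ.
 */
#if 0
	sp = g_bde_new_sector(wp, sc->sectorsize);	/* private buffer */

	sp = g_bde_new_sector(wp, 0);			/* no buffer yet */
	sp->size = wp->length;				/* map the caller's */
	sp->data = wp->data;
#endif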

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of benefit from this is not at this point
 * determined.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	g_bde_ncache--;
	sc->ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return(sp);
}
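
/*
 * Summary of the caching policy implemented above: sectors handed out are
 * moved to the tail of the freelist and time-stamped, so the list stays in
 * LRU order and the head is the best eviction candidate; a lookup by
 * key-sector offset is a linear scan of that list.  On a miss the head of
 * the list (or, under memory pressure, any unreferenced sector) is recycled
 * if it is unreferenced; otherwise a fresh sector is allocated.  Memory
 * pressure (malloc_last_fail() < g_bde_ncache) triggers a purge, and a
 * stale, unreferenced head sector (unused for 300 seconds) is reaped on
 * every call.
 */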

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}
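
/*
 * Illustrative sketch only (not compiled): the get/release pair as used by
 * the read and write paths below.  g_bde_get_keysector() returns with a
 * reference held and wp->ksp set; g_bde_release_keysector() drops that
 * reference and, if other work packets still reference the sector, hands
 * ownership to one of them and wakes the worker.
 */
#if 0
	sp = g_bde_get_keysector(wp);	/* on success wp->ksp == sp, ref taken */
	if (sp != NULL) {
		/* ... start I/O on sp or use its VALID contents ... */
		g_bde_release_keysector(wp);	/* wp->ksp = NULL, ref dropped */
	}
#endif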

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while(n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}
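
/*
 * Rough numbers, for illustration only: a positive fraction frees
 * sc->ncache / fraction + 1 unreferenced sectors per call, so the fraction
 * of 10 used by the idle worker below removes roughly a tenth of the cache
 * every second.  A non-positive fraction frees g_bde_ncache -
 * malloc_last_fail() sectors, the same memory-pressure expression tested
 * in g_bde_get_keysector() above.
 */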
329
330static struct g_bde_sector *
331g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
332{
333	struct g_bde_sector *sp;
334
335	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
336	sp = g_bde_get_keysector(wp);
337	if (sp == NULL) {
338		g_bde_purge_sector(sc, -1);
339		sp = g_bde_get_keysector(wp);
340	}
341	if (sp == NULL)
342		return (sp);
343	if (sp->owner != wp)
344		return (sp);
345	if (sp->state == VALID)
346		return (sp);
347	if (g_bde_start_read(sp) == 0)
348		return (sp);
349	g_bde_release_keysector(wp);
350	return (NULL);
351}
352
353/*
354 * Contribute to the completion of the original bio request.
355 *
356 * We have no simple way to tell how many bits the original bio request has
357 * been segmented into, so the easiest way to determine when we can deliver
358 * it is to keep track of the number of bytes we have completed.  We keep
359 * track of any errors underway and latch onto the first one.
360 *
361 * We always report "nothing done" in case of error, because random bits here
362 * and there may be completed and returning a number of completed bytes does
363 * not convey any useful information about which bytes they were.  If some
364 * piece of broken code somewhere interprets this to mean that nothing has
365 * changed on the underlying media they deserve the lossage headed for them.
366 *
367 * A single mutex per g_bde instance is used to prevent contention.
368 */
369
370static void
371g_bde_contribute(struct bio *bp, off_t bytes, int error)
372{
373
374	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
375	     bp, (intmax_t)bytes, error);
376	if (bp->bio_error == 0)
377		bp->bio_error = error;
378	bp->bio_completed += bytes;
379	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
380	if (bp->bio_completed == bp->bio_length) {
381		if (bp->bio_error != 0)
382			bp->bio_completed = 0;
383		g_io_deliver(bp, bp->bio_error);
384	}
385}
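
/*
 * Worked example, for illustration: a 16k bio that g_bde_start1() has
 * chopped into, say, four 4k work packets results in four calls of the
 * form
 *
 *	g_bde_contribute(bp, 4096, error);
 *
 * After the fourth call bio_completed equals bio_length and the bio is
 * delivered.  If any of the four passed a non-zero error, the first such
 * error is what gets reported and bio_completed is forced back to zero.
 */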

/*
 * A write operation has finished.  When we have all expected cows in the
 * barn, close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_delete_sector(sc, sp);
		g_bde_delete_work(wp);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) {
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_release_keysector(wp);
		g_bde_delete_work(wp);
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return(0);
}

/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return(0);
}
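
/*
 * Illustrative sketch only (not compiled): both launchers above return
 * ENOMEM when no struct bio can be had; on success the sector is marked
 * IO and the completion callback (g_bde_read_done() or g_bde_write_done())
 * runs later from the I/O path.
 */
#if 0
	if (g_bde_start_read(sp) != 0) {
		/* could not even queue the request; fail this work packet */
	}
	/* otherwise wait: the worker skips the sector while sp->state == IO */
#endif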

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work,
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption
 * XXX: using a thread here is probably not needed.
 */

void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	struct g_geom *gp;
	int busy, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		busy = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH(wp, &sc->worklist, list) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;		/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (JUNK ?)"));
			}

			if (wp->bp->bio_cmd == BIO_READ &&
			     wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_contribute(wp->bp, wp->length,
				    wp->ksp->error);
				g_bde_delete_sector(sc, wp->sp);
				g_bde_release_keysector(wp);
				g_bde_delete_work(wp);
				busy++;
				break;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_contribute(wp->bp, wp->length,
						    wp->error);
				} else {
					if (wp->sp->error == 0) {
						mtx_unlock(&sc->worklist_mutex);
						g_bde_crypt_read(wp);
						mtx_lock(&sc->worklist_mutex);
					}
					g_bde_contribute(wp->bp, wp->length,
						    wp->sp->error);
				}
				g_bde_delete_sector(sc, wp->sp);
				if (wp->ksp != NULL)
					g_bde_release_keysector(wp);
				g_bde_delete_work(wp);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp, ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp, ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_contribute(wp->bp, wp->length, error);
					g_bde_release_keysector(wp);
					g_bde_delete_sector(sc, wp->sp);
					g_bde_delete_work(wp);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				break;
			}
			busy++;
			break;
		}
		if (!busy) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "g_bde", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	mtx_lock(&Giant);
	kthread_exit(0);
}
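
/*
 * For reference, a summary of the code above: the worker is woken by
 * wakeup(sc) from g_bde_read_done(), g_bde_release_keysector() and
 * g_bde_start2(), and by the one-second msleep() timeout which doubles as
 * the cache-purge tick.  Shutdown is requested by setting sc->dead; once
 * idle the worker drains its cache, asserts that no work or sectors
 * remain, sets sc->dead to 2 and wakes up whoever is waiting for it to
 * exit.
 */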

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need, and fire off the first volley of read requests.
 */

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	if (wp->bp->bio_cmd == BIO_READ) {
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
	} else if (wp->bp->bio_cmd == BIO_DELETE) {
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
	} else if (wp->bp->bio_cmd == BIO_WRITE) {
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
	} else {
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}
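
/*
 * For reference, how the three bio commands are set up above: BIO_READ
 * wraps the caller's own buffer in a sector allocated without a data
 * buffer of its own and also needs the key sector for decryption;
 * BIO_WRITE allocates a private sector for the encrypted payload and
 * needs the key sector as well; BIO_DELETE only allocates a private
 * sector with which to overwrite the range.  In all three cases the
 * packet then goes to state WAIT and the worker is woken.
 */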

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */

void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for(done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}
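
/*
 * Illustrative example only: a request that straddles a zone boundary is
 * chopped in two.  g_bde_map_sector() (defined elsewhere in gbde) trims
 * wp->length to what fits inside the current zone, so a bio with
 * bio_length = 32k might become, say, a 24k and an 8k work packet:
 *
 *	first:   wp->offset = bio_offset        wp->length = 24k
 *	second:  wp->offset = bio_offset + 24k  wp->length = 8k
 *
 * Each packet then contributes its own length back via g_bde_contribute().
 */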