/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This source file contains the state engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *	Set up the necessary sector buffers, start those read operations
 *	which we can start at this time, and put the item on the work-list.
 *   3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing,
 *	call the matching crypto function in g_bde_crypt.c, and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C scoping rules, the functions appear in almost exactly
 * the opposite order in this source file.
 *
 * XXX: A switch to the hardware-assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
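
/*
 * A rough sketch of the state progression, as implemented below (the work
 * item states are SETUP/WAIT/FINISH; the sector buffer states are
 * JUNK/IO/VALID):
 *
 *   work:    SETUP --g_bde_start2()--> WAIT --g_bde_worker()--> FINISH
 *   sector:  JUNK --start of I/O--> IO --I/O completion--> VALID
 *
 * Reads are delivered from the worker itself; writes and deletes are
 * delivered from g_bde_write_done() once all their sectors have hit
 * the platter.
 */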

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha512.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

/*
 * FIXME: This used to call malloc_last_fail, which in practice was almost
 * guaranteed to return time_uptime even in the face of severe memory
 * shortage.  As GBDE is the only consumer, the kludge below was added to
 * facilitate the removal with minimal changes.  The code should be fixed
 * to respond to memory pressure (e.g., by using the lowmem eventhandler)
 * instead.
 */
static int
g_bde_malloc_last_fail(void)
{

	return (time_uptime);
}
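
/*
 * A minimal sketch of the lowmem approach suggested above, kept as a comment
 * since it is not wired up; the handler name and purge fraction are made up,
 * but vm_lowmem and EVENTHANDLER_REGISTER() are the stock kernel facilities
 * for reacting to memory pressure:
 *
 *	static void
 *	g_bde_lowmem(void *arg, int flags)
 *	{
 *		struct g_bde_softc *sc = arg;
 *
 *		mtx_lock(&sc->worklist_mutex);
 *		g_bde_purge_sector(sc, 2);	// drop ~half the skey cache
 *		mtx_unlock(&sc->worklist_mutex);
 *	}
 *
 * registered at attach time with:
 *	EVENTHANDLER_REGISTER(vm_lowmem, g_bde_lowmem, sc,
 *	    EVENTHANDLER_PRI_ANY);
 */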

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}

/*
 * Sector buffer allocation.
 *
 * These two functions allocate and free variable-sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although how much benefit it provides has not yet been
 * determined.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */
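
/*
 * A sketch of the intended usage pattern (the refcounting and the LRU
 * ordering of the freelist are maintained by the functions below):
 *
 *	sp = g_bde_get_keysector(wp);	// ref++, moved to MRU end
 *	...				// read/modify the key sector
 *	g_bde_release_keysector(wp);	// ref--, ownership handed to any
 *					// other work item waiting for it
 */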

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	g_bde_ncache--;
	sc->ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (g_bde_malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	/* Opportunistically retire the LRU entry if it has idled too long. */
	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	/* Look for a cache hit on this key sector's offset. */
	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		/* Miss: under memory pressure, try to recycle an idle entry. */
		if (g_bde_malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		/* Otherwise allocate a fresh sector buffer for the cache. */
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		/* Move to the MRU end of the freelist and timestamp it. */
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return (sp);
}

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		/* Hand ownership to the next work item waiting for it. */
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		/* Poison the entry and make it first in line for recycling. */
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - g_bde_malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}

static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);	/* Another work item has the I/O in flight. */
	if (sp->state == VALID)
		return (sp);	/* Cache hit; contents already valid. */
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many pieces the original bio request has
 * been segmented into, so the easiest way to determine when we can deliver
 * it is to keep track of the number of bytes we have completed.  We latch
 * onto the first error seen.
 *
 * We always report "nothing done" in case of error, because random bits here
 * and there may have been completed, and returning a number of completed
 * bytes does not convey any useful information about which bytes they were.
 * If some piece of broken code somewhere interprets this to mean that
 * nothing has changed on the underlying media, they deserve the lossage
 * headed for them.
 *
 * A single mutex per g_bde instance serializes these updates.
 */
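
/*
 * A worked example with made-up numbers: a 64 KiB bio chopped into four
 * 16 KiB work packets is delivered once the four contributions sum to
 * bio_length.  If any packet contributed an error, bio_completed is reset
 * to zero at delivery time and the first error seen is reported.
 */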

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	     bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}

/*
 * The common case "we are done with this work packet" function.
 */

static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}

/*
 * A write operation has finished.  When we have all the expected cows in
 * the barn, we close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	/* Both the data and the key sector must be on disk before we finish. */
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * A read operation has finished.  Mark the sector no longer iobusy,
 * wake up the worker thread, and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware-assisted encryption,
 * XXX: using a thread here is probably not needed.
 */
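
/*
 * A note on the scan/restart pattern below: the worklist is scanned with
 * worklist_mutex held, but the mutex is dropped around the crypto calls.
 * Any iteration which dropped the mutex bumps "restart" and breaks out of
 * the scan, because the list may have changed while it was unlocked; the
 * outer loop then restarts the scan from the head of the worklist.
 */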

void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch (wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				/* As in the BIO_WRITE case, do not drop an ENOMEM. */
				error = g_bde_start_write(wp->sp);
				if (error)
					g_bde_work_done(wp, error);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	kproc_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need, and fire off the first volley of read requests.
 */

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */
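
/*
 * A hypothetical example (the actual zone geometry is decided by
 * g_bde_map_sector(), which is not shown here): with 128 KiB zones, a
 * 96 KiB bio starting 64 KiB before a zone boundary is chopped into two
 * work packets, a 64 KiB one ending at the boundary and a 32 KiB one
 * starting after it, each referencing its own zone's key sector.
 */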

void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for (done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
}