/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/geom/bde/g_bde_work.c 125591 2004-02-08 10:19:18Z phk $
 *
 * This source file contains the state-engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *	Set up the necessary sector buffers, start those read operations
 *	which we can start at this time, and put the item on the work-list.
 *   3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing
 *	and call the matching crypto function in g_bde_crypt.c and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware-assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */

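/*
 * Summary of the work-packet lifecycle implemented below (a reading aid,
 * not part of the original comments): g_bde_new_work() creates a packet in
 * state SETUP, g_bde_start2() moves it to WAIT once its sector buffers are
 * allocated and the initial reads are issued, and the worker thread moves
 * WRITE/DELETE packets to FINISH before issuing the final writes:
 *
 *	SETUP -> WAIT -> FINISH
 *
 * READ packets are completed and deleted directly from the WAIT state.
 */
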
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael.h>
#include <crypto/sha2/sha2.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "GBDE", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}

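/*
 * Usage sketch (illustrative only, mirroring what g_bde_start1() below
 * does): both functions manipulate sc->worklist and the work counters, so
 * the caller is expected to hold the per-instance worklist mutex:
 *
 *	mtx_lock(&sc->worklist_mutex);
 *	wp = g_bde_new_work(sc);
 *	if (wp != NULL) {
 *		... fill in wp->bp, wp->offset, wp->data, wp->length ...
 *	}
 *	mtx_unlock(&sc->worklist_mutex);
 */
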
/*
 * Sector buffer allocation.
 *
 * These two functions allocate and free variable-sized sector buffers.
 */

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}

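/*
 * Note on sp->malloc (a summary of its usage elsewhere in this file, added
 * here for reference): 0 means sp->data points into someone else's buffer
 * (the original bio's data), 1 means sp->data was allocated together with
 * the sector by g_bde_new_sector() and is freed with it, and 2 additionally
 * marks the sector as a cached key sector, which g_bde_release_keysector()
 * asserts on.
 */
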
/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of the benefit has not been determined at
 * this point.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */

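/*
 * Cache policy, as implemented by the functions below (descriptive summary,
 * not from the original): sc->freelist holds all cached key sectors in
 * roughly LRU order.  A hit moves the sector to the tail and bumps its
 * refcount; sectors with ref == 0 are eviction candidates, and eviction
 * happens either on demand (g_bde_purge_sector()) or when an entry has sat
 * unused for ~300 seconds.  Evicted sectors are zeroed before being freed
 * so key material does not linger in memory.
 */
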
static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	g_bde_ncache--;
	sc->ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return (sp);
}

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p", sp));
	} else if (sp->error != 0) {
		/*
		 * Errored and unreferenced: invalidate the sector and put
		 * it at the head of the freelist so it is reused first.
		 */
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}

static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many pieces the original bio request
 * has been segmented into, so the easiest way to determine when we can
 * deliver it is to keep track of the number of bytes we have completed.
 * We keep track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random pieces
 * here and there may have been completed, and returning a number of
 * completed bytes does not convey any useful information about which bytes
 * they were.  If some piece of broken code somewhere interprets this to
 * mean that nothing has changed on the underlying media, they deserve the
 * lossage headed for them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	     bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}

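/*
 * Worked example (illustrative, assuming a hypothetical 16k request that
 * g_bde_start1() split into four 4k work packets): each packet contributes
 * 4k as it finishes, and the fourth contribution makes bio_completed equal
 * bio_length, so the bio is delivered.  If any packet contributed an error,
 * bio_error latched the first one and bio_completed is zeroed just before
 * delivery, reporting "nothing done" as described above.
 */
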
/*
 * This is the common case "we're done with this work package" function.
 */

static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}

/*
 * A write operation has finished.  When we have all the expected cows in
 * the barn, we close the door and call it a day.
 */

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware-assisted encryption
 * XXX: using a thread here is probably not needed.
 */

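/*
 * Locking note for the loop below (descriptive summary, not from the
 * original): the worklist mutex is dropped around the crypto calls, so the
 * worklist may have changed by the time it is re-taken.  The restart flag
 * makes the scan begin again from the head of the list whenever that has
 * happened, rather than continuing from a possibly stale iterator.
 */
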
void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch (wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_start_write(wp->sp);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	mtx_lock(&Giant);
	kthread_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need, and fire off the first volley of read requests.
 */

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */

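/*
 * Example of the chopping (illustrative, with a hypothetical request that
 * crosses one zone boundary): the first pass through the loop below gets a
 * work packet whose length g_bde_map_sector() clips to the bytes remaining
 * in the first zone; the second pass covers the rest from the start of the
 * next zone.  Each packet is handed to g_bde_start2() independently, and
 * they complete via g_bde_contribute().
 */
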
void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for (done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}