/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This source file contains the state-engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *	Set up the necessary sector buffers, start whichever read operations
 *	can be started at this time, and put the item on the work-list.
 *   3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing,
 *	call the matching crypto function in g_bde_crypt.c, and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of C scope rules, the functions appear in this source file in
 * almost exactly the opposite order.
 *
 * XXX: A switch to the hardware-assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
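
/*
 * The state fields driving this engine, as used in this file:
 *
 *   wp->state (work item):    SETUP -> WAIT -> FINISH
 *	SETUP  while g_bde_start1()/g_bde_start2() assemble the work packet,
 *	WAIT   while the worker thread waits for sector I/O and runs the crypto,
 *	FINISH once the writes for a WRITE or DELETE packet have been scheduled
 *	       (READ packets are completed directly from WAIT).
 *
 *   sp->state (sector buffer): JUNK -> IO -> VALID
 *	JUNK   contents undefined,
 *	IO     a read or write request is in flight,
 *	VALID  contents usable (read completed, or key sector write done).
 */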

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <crypto/rijndael/rijndael-api-fst.h>
#include <crypto/sha2/sha2.h>
#include <geom/geom.h>
#include <geom/bde/g_bde.h>

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_keysector(struct g_bde_work *wp);
static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
static int g_bde_start_read(struct g_bde_sector *sp);
static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	free(wp, M_GBDE);
}

/*
 * Sector buffer allocation
 *
 * These two functions allocate and free variable-sized sector buffers.
 */
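
/*
 * The sp->malloc field records how the data buffer was obtained: 0 means
 * the sector aliases somebody else's memory (typically the original bio's
 * data buffer), 1 means the data was allocated here and is freed together
 * with the sector, and 2 marks buffers belonging to the key-sector cache
 * below.
 */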

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		free(sp->data, M_GBDE);
	free(sp, M_GBDE);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			free(sp, M_GBDE);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of the benefit has not yet been determined.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */
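
/*
 * The cache lives on sc->freelist in roughly LRU order: a sector is moved
 * to the tail and its sp->used field stamped with time_uptime whenever it
 * is handed out, so the head of the list holds the best purge candidates.
 * sp->ref counts the work items currently using a cached sector; only
 * sectors with a zero reference count are ever recycled or purged.
 */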

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static void
g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
	if (sp->ref != 0)
		return;
	TAILQ_REMOVE(&sc->freelist, sp, list);
	g_bde_ncache--;
	sc->ncache--;
	bzero(sp->data, sp->size);
	g_bde_delete_sector(sc, sp);
}

static struct g_bde_sector *
g_bde_get_keysector(struct g_bde_work *wp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;
	off_t offset;

	offset = wp->kso;
	g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;

	if (malloc_last_fail() < g_bde_ncache)
		g_bde_purge_sector(sc, -1);

	sp = TAILQ_FIRST(&sc->freelist);
	if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
		g_bde_purge_one_sector(sc, sp);

	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (malloc_last_fail() < g_bde_ncache) {
			TAILQ_FOREACH(sp, &sc->freelist, list)
				if (sp->ref == 0)
					break;
		}
		if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				g_bde_ncache++;
				sc->ncache++;
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		sp->used = time_uptime;
	}
	wp->ksp = sp;
	return(sp);
}

static void
g_bde_release_keysector(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;
	struct g_bde_sector *sp;

	sp = wp->ksp;
	g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
	}
}

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	if (fraction > 0)
		n = sc->ncache / fraction + 1;
	else
		n = g_bde_ncache - malloc_last_fail();
	if (n < 0)
		return;
	if (n > sc->ncache)
		n = sc->ncache;
	while(n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}

static struct g_bde_sector *
g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
	sp = g_bde_get_keysector(wp);
	if (sp == NULL) {
		g_bde_purge_sector(sc, -1);
		sp = g_bde_get_keysector(wp);
	}
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_keysector(wp);
	return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way of telling how many pieces the original bio request
 * has been segmented into, so the easiest way to determine when we can
 * deliver it is to keep track of the number of bytes we have completed.  We
 * also keep track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random pieces
 * here and there may have completed, and returning a count of completed
 * bytes would not convey any useful information about which bytes they
 * were.  If some piece of broken code somewhere interprets this to mean
 * that nothing has changed on the underlying media, it deserves the lossage
 * headed its way.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
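
/*
 * Example: if g_bde_start1() split a request into three work packets, the
 * bio is delivered only once all three have contributed their lengths; if
 * any of them contributed an error, bio_completed is reset to zero
 * immediately before delivery.
 */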

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	    bp, (intmax_t)bytes, error);
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}

/*
 * This is the common case "we're done with this work package" function
 */

static void
g_bde_work_done(struct g_bde_work *wp, int error)
{

	g_bde_contribute(wp->bp, wp->length, error);
	if (wp->sp != NULL)
		g_bde_delete_sector(wp->softc, wp->sp);
	if (wp->ksp != NULL)
		g_bde_release_keysector(wp);
	g_bde_delete_work(wp);
}

/*
 * A write operation has finished.  When we have all expected cows in the
 * barn, close the door and call it a day.
 */
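
/*
 * Concretely: a BIO_DELETE work packet is done as soon as its single data
 * sector write returns, while a BIO_WRITE packet is done only when both the
 * data sector write has returned (wp->sp is freed here) and the key sector
 * write has returned (wp->ksp is marked VALID).
 */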

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_work_done(wp, wp->error);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
		g_bde_work_done(wp, wp->error);
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return(0);
}

/*
 * A read operation has finished.  Mark the sector valid or junk according
 * to the outcome, then wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	if (bp->bio_error == 0 && bp->bio_completed != sp->size)
		bp->bio_error = EIO;
	sp->error = bp->bio_error;
	if (sp->error == 0)
		sp->state = VALID;
	else
		sp->state = JUNK;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return(0);
}

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work,
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware-assisted encryption,
 * XXX: using a thread here is probably not needed.
 */
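
/*
 * Locking: the worker holds sc->worklist_mutex while it scans the work-list
 * and drops it only around the g_bde_crypt_*() calls; anybody who makes a
 * work item or key sector ready wakes the thread with wakeup(sc).  After
 * every mutex drop the scan is restarted, since the list may have changed
 * while the lock was released.
 */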

void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp, *twp;
	struct g_geom *gp;
	int restart, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (%d)",
				    wp->ksp->state));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_work_done(wp, wp->ksp->error);
				continue;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_work_done(wp, wp->error);
					break;
				}
				if (wp->sp->error != 0) {
					g_bde_work_done(wp, wp->sp->error);
					break;
				}
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_read(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_work_done(wp, wp->sp->error);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp,
				    ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp,
				    ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_work_done(wp, error);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				restart++;
				g_bde_start_write(wp->sp);
				break;
			}
			if (restart)
				break;
		}
		if (!restart) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "-", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	kproc_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so that all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need, and fire off the first volley of read requests.
 */
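
/*
 * A BIO_READ packet reads straight into the caller's buffer (its sector is
 * allocated without a data buffer of its own and then pointed at wp->data)
 * and also needs its key sector read; a BIO_WRITE packet gets a private
 * sector buffer plus the key sector; a BIO_DELETE packet only needs the
 * private sector buffer and never touches the key sector.
 */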

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	switch (wp->bp->bio_cmd) {
	case BIO_READ:
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
		break;
	case BIO_DELETE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	case BIO_WRITE:
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		g_bde_read_keysector(sc, wp);
		if (wp->ksp == NULL) {
			g_bde_work_done(wp, ENOMEM);
			return;
		}
		break;
	default:
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Create a sequence of work structures, and have g_bde_map_sector() determine
 * how long they each can be.  Feed them to g_bde_start2().
 */
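
/*
 * Note: g_bde_map_sector() is expected to trim wp->length so that the work
 * packet stays inside a single zone and to fill in the media offsets
 * consumed earlier in this file, wp->so for the data sectors and wp->kso
 * for the corresponding key sector.
 */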

void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t done;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	for(done = 0; done < bp->bio_length; ) {
		wp = g_bde_new_work(sc);
		if (wp != NULL) {
			wp->bp = bp;
			wp->offset = bp->bio_offset + done;
			wp->data = bp->bio_data + done;
			wp->length = bp->bio_length - done;
			g_bde_map_sector(wp);
			done += wp->length;
			g_bde_start2(wp);
		}
		if (wp == NULL || bp->bio_error != 0) {
			g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
			break;
		}
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}