g_bde_work.c (151897) → g_bde_work.c (160964)
Unchanged lines are shown once; at each change the deleted line (151897) appears first, with the added line (160964) directly below it.
1/*-
2 * Copyright (c) 2002 Poul-Henning Kamp
3 * Copyright (c) 2002 Networks Associates Technology, Inc.
4 * All rights reserved.
5 *
6 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
7 * and NAI Labs, the Security Research Division of Network Associates, Inc.
8 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
9 * DARPA CHATS research program.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * $FreeBSD: head/sys/geom/bde/g_bde_work.c 151897 2005-10-31 15:41:29Z rwatson $
32 * $FreeBSD: head/sys/geom/bde/g_bde_work.c 160964 2006-08-04 07:56:35Z yar $
33 */
34/*
35 * This source file contains the state-engine which makes things happen in the
36 * right order.
37 *
38 * Outline:
39 * 1) g_bde_start1()
40 * Break the struct bio into multiple work packets one per zone.
41 * 2) g_bde_start2()
42 * Setup the necessary sector buffers and start those read operations
43 * which we can start at this time and put the item on the work-list.
44 * 3) g_bde_worker()
45 * Scan the work-list for items which are ready for crypto processing
46 * and call the matching crypto function in g_bde_crypt.c and schedule
47 * any writes needed. Read operations finish here by releasing the
48 * sector buffers and delivering the original bio request.
49 * 4) g_bde_write_done()
50 * Release sector buffers and deliver the original bio request.
51 *
52 * Because of the C-scope rules, the functions are almost perfectly in the
53 * opposite order in this source file.
54 *
55 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
56 * XXX: additional states to this state-engine. Since no hardware available
57 * XXX: at this time has AES support, implementing this has been postponed
58 * XXX: until such time as it would result in a benefit.
59 */
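
Two small state machines drive the outline above: work items move SETUP -> WAIT -> FINISH, and sector buffers move JUNK -> IO -> VALID, falling back to JUNK on error. A minimal standalone C sketch of the sector half, with invented enum values standing in for the real definitions in g_bde.h:

#include <stdio.h>

/* Invented stand-ins for the sector states defined in g_bde.h. */
enum sector_state { JUNK, IO, VALID };

/* What g_bde_read_done()/g_bde_write_done() decide on completion. */
static enum sector_state
io_done(int error)
{
	return (error == 0 ? VALID : JUNK);
}

int
main(void)
{
	enum sector_state s = JUNK;	/* freshly allocated buffer */

	s = IO;				/* request sent down the pipeline */
	s = io_done(0);			/* completion handler ran, no error */
	printf("VALID? %s\n", s == VALID ? "yes" : "no");
	return (0);
}
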
60
61#include <sys/param.h>
62#include <sys/bio.h>
63#include <sys/lock.h>
64#include <sys/mutex.h>
65#include <sys/queue.h>
66#include <sys/malloc.h>
67#include <sys/systm.h>
68#include <sys/kernel.h>
69#include <sys/sysctl.h>
70#include <sys/proc.h>
71#include <sys/kthread.h>
72
73#include <crypto/rijndael/rijndael-api-fst.h>
74#include <crypto/sha2/sha2.h>
75#include <geom/geom.h>
76#include <geom/bde/g_bde.h>
77
78static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp);
79static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len);
80static void g_bde_release_keysector(struct g_bde_work *wp);
81static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp);
82static int g_bde_start_read(struct g_bde_sector *sp);
83static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction);
84
85/*
86 * Work item allocation.
87 *
88 * C++ would call these constructors and destructors.
89 */
90static u_int g_bde_nwork;
91SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");
92
93static MALLOC_DEFINE(M_GBDE, "gbde", "GBDE data structures");
94
95static struct g_bde_work *
96g_bde_new_work(struct g_bde_softc *sc)
97{
98 struct g_bde_work *wp;
99
100 wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO);
101 if (wp == NULL)
102 return (wp);
103 wp->state = SETUP;
104 wp->softc = sc;
105 g_bde_nwork++;
106 sc->nwork++;
107 TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
108 return (wp);
109}
110
111static void
112g_bde_delete_work(struct g_bde_work *wp)
113{
114 struct g_bde_softc *sc;
115
116 sc = wp->softc;
117 g_bde_nwork--;
118 sc->nwork--;
119 TAILQ_REMOVE(&sc->worklist, wp, list);
120 free(wp, M_GBDE);
121}
122
123/*
124 * Sector buffer allocation
125 *
126 * These two functions allocate and free back variable sized sector buffers
127 */
128
129static u_int g_bde_nsect;
130SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");
131
132static void
133g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
134{
135
136 g_bde_nsect--;
137 sc->nsect--;
138 if (sp->malloc)
139 free(sp->data, M_GBDE);
140 free(sp, M_GBDE);
141}
142
143static struct g_bde_sector *
144g_bde_new_sector(struct g_bde_work *wp, u_int len)
145{
146 struct g_bde_sector *sp;
147
148 sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO);
149 if (sp == NULL)
150 return (sp);
151 if (len > 0) {
152 sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO);
153 if (sp->data == NULL) {
154 free(sp, M_GBDE);
155 return (NULL);
156 }
157 sp->malloc = 1;
158 }
159 g_bde_nsect++;
160 wp->softc->nsect++;
161 sp->size = len;
162 sp->softc = wp->softc;
163 sp->ref = 1;
164 sp->owner = wp;
165 sp->offset = wp->so;
166 sp->state = JUNK;
167 return (sp);
168}
169
170/*
171 * Skey sector cache.
172 *
173 * Nothing prevents two separate I/O requests from addressing the same zone
174 * and thereby needing the same skey sector. We therefore need to sequence
175 * I/O operations to the skey sectors. A certain amount of caching is also
176 * desirable, although the extent of benefit from this is not at this point
177 * determined.
178 *
179 * XXX: GEOM may be able to grow a generic caching facility at some point
180 * XXX: to support such needs.
181 */
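
As a rough illustration of the policy the functions below implement, here is a self-contained userland sketch, with invented names and no locking, of a refcounted lookup-or-allocate on an LRU-ordered TAILQ free list:

#include <sys/queue.h>
#include <stdlib.h>

struct sector {
	TAILQ_ENTRY(sector) list;
	long	offset;
	int	ref;
};
TAILQ_HEAD(seclist, sector);

static struct sector *
cache_get(struct seclist *fl, long offset)
{
	struct sector *sp;

	TAILQ_FOREACH(sp, fl, list)	/* hit: reuse the cached copy */
		if (sp->offset == offset)
			break;
	if (sp == NULL) {		/* miss: allocate a fresh one */
		sp = calloc(1, sizeof *sp);
		if (sp == NULL)
			return (NULL);
		sp->offset = offset;
		TAILQ_INSERT_TAIL(fl, sp, list);
	}
	sp->ref++;			/* sequences access to the sector */
	TAILQ_REMOVE(fl, sp, list);	/* tail == most recently used */
	TAILQ_INSERT_TAIL(fl, sp, list);
	return (sp);
}

int
main(void)
{
	struct seclist fl = TAILQ_HEAD_INITIALIZER(fl);

	cache_get(&fl, 512);		/* miss: allocates */
	cache_get(&fl, 512);		/* hit: ref goes to 2 */
	return (0);
}
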
182
183static u_int g_bde_ncache;
184SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");
185
186static void
187g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
188{
189
190 g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp);
191 if (sp->ref != 0)
192 return;
193 TAILQ_REMOVE(&sc->freelist, sp, list);
194 g_bde_ncache--;
195 sc->ncache--;
196 bzero(sp->data, sp->size);
197 g_bde_delete_sector(sc, sp);
198}
199
200static struct g_bde_sector *
201g_bde_get_keysector(struct g_bde_work *wp)
202{
203 struct g_bde_sector *sp;
204 struct g_bde_softc *sc;
205 off_t offset;
206
207 offset = wp->kso;
208 g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset);
209 sc = wp->softc;
210
211 if (malloc_last_fail() < g_bde_ncache)
212 g_bde_purge_sector(sc, -1);
213
214 sp = TAILQ_FIRST(&sc->freelist);
215 if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime)
216 g_bde_purge_one_sector(sc, sp);
217
218 TAILQ_FOREACH(sp, &sc->freelist, list) {
219 if (sp->offset == offset)
220 break;
221 }
222 if (sp != NULL) {
223 sp->ref++;
224 KASSERT(sp->offset == offset, ("wrong offset"));
225 KASSERT(sp->softc == wp->softc, ("wrong softc"));
226 if (sp->ref == 1)
227 sp->owner = wp;
228 } else {
229 if (malloc_last_fail() < g_bde_ncache) {
230 TAILQ_FOREACH(sp, &sc->freelist, list)
231 if (sp->ref == 0)
232 break;
233 }
234 if (sp == NULL && !TAILQ_EMPTY(&sc->freelist))
235 sp = TAILQ_FIRST(&sc->freelist);
236 if (sp != NULL && sp->ref > 0)
237 sp = NULL;
238 if (sp == NULL) {
239 sp = g_bde_new_sector(wp, sc->sectorsize);
240 if (sp != NULL) {
241 g_bde_ncache++;
242 sc->ncache++;
243 TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
244 sp->malloc = 2;
245 }
246 }
247 if (sp != NULL) {
248 sp->offset = offset;
249 sp->softc = wp->softc;
250 sp->ref = 1;
251 sp->owner = wp;
252 sp->state = JUNK;
253 sp->error = 0;
254 }
255 }
256 if (sp != NULL) {
257 TAILQ_REMOVE(&sc->freelist, sp, list);
258 TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
259 sp->used = time_uptime;
260 }
261 wp->ksp = sp;
262 return(sp);
263}
264
265static void
266g_bde_release_keysector(struct g_bde_work *wp)
267{
268 struct g_bde_softc *sc;
269 struct g_bde_work *wp2;
270 struct g_bde_sector *sp;
271
272 sp = wp->ksp;
273 g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp);
274 KASSERT(sp->malloc == 2, ("Wrong sector released"));
275 sc = sp->softc;
276 KASSERT(sc != NULL, ("NULL sp->softc"));
277 KASSERT(wp == sp->owner, ("Releasing, not owner"));
278 sp->owner = NULL;
279 wp->ksp = NULL;
280 sp->ref--;
281 if (sp->ref > 0) {
282 TAILQ_REMOVE(&sc->freelist, sp, list);
283 TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
284 TAILQ_FOREACH(wp2, &sc->worklist, list) {
285 if (wp2->ksp == sp) {
286 KASSERT(wp2 != wp, ("Self-reowning"));
287 sp->owner = wp2;
288 wakeup(sp->softc);
289 break;
290 }
291 }
292 KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
293 } else if (sp->error != 0) {
294 sp->offset = ~0;
295 sp->error = 0;
296 sp->state = JUNK;
297 }
298 TAILQ_REMOVE(&sc->freelist, sp, list);
299 TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
300}
301
302static void
303g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
304{
305 struct g_bde_sector *sp;
306 int n;
307
308 g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
309 if (fraction > 0)
310 n = sc->ncache / fraction + 1;
311 else
312 n = g_bde_ncache - malloc_last_fail();
313 if (n < 0)
314 return;
315 if (n > sc->ncache)
316 n = sc->ncache;
317 while(n--) {
318 TAILQ_FOREACH(sp, &sc->freelist, list) {
319 if (sp->ref != 0)
320 continue;
321 TAILQ_REMOVE(&sc->freelist, sp, list);
322 g_bde_ncache--;
323 sc->ncache--;
324 bzero(sp->data, sp->size);
325 g_bde_delete_sector(sc, sp);
326 break;
327 }
328 }
329}
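
A quick standalone model of the purge rate: removing ncache / fraction + 1 sectors per call with fraction = 10 drains the cache roughly geometrically, which is why the worker's one-second timeout (below) can claim the cache is gone within minutes. The starting size here is hypothetical:

#include <stdio.h>

int
main(void)
{
	int ncache = 1000, seconds = 0;	/* hypothetical starting size */

	while (ncache > 0) {
		ncache -= ncache / 10 + 1;	/* one purge per timeout */
		seconds++;
	}
	printf("cache drained after %d seconds\n", seconds);
	return (0);
}
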
330
331static struct g_bde_sector *
332g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp)
333{
334 struct g_bde_sector *sp;
335
336 g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp);
337 sp = g_bde_get_keysector(wp);
338 if (sp == NULL) {
339 g_bde_purge_sector(sc, -1);
340 sp = g_bde_get_keysector(wp);
341 }
342 if (sp == NULL)
343 return (sp);
344 if (sp->owner != wp)
345 return (sp);
346 if (sp->state == VALID)
347 return (sp);
348 if (g_bde_start_read(sp) == 0)
349 return (sp);
350 g_bde_release_keysector(wp);
351 return (NULL);
352}
353
354/*
355 * Contribute to the completion of the original bio request.
356 *
357 * We have no simple way to tell how many bits the original bio request has
358 * been segmented into, so the easiest way to determine when we can deliver
359 * it is to keep track of the number of bytes we have completed. We keep
360 * track of any errors underway and latch onto the first one.
361 *
362 * We always report "nothing done" in case of error, because random bits here
363 * and there may be completed and returning a number of completed bytes does
364 * not convey any useful information about which bytes they were. If some
365 * piece of broken code somewhere interprets this to mean that nothing has
366 * changed on the underlying media they deserve the lossage headed for them.
367 *
368 * A single mutex per g_bde instance is used to prevent contention.
369 */
370
371static void
372g_bde_contribute(struct bio *bp, off_t bytes, int error)
373{
374
375 g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
376 bp, (intmax_t)bytes, error);
377 if (bp->bio_error == 0)
378 bp->bio_error = error;
379 bp->bio_completed += bytes;
380 KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
381 if (bp->bio_completed == bp->bio_length) {
382 if (bp->bio_error != 0)
383 bp->bio_completed = 0;
384 g_io_deliver(bp, bp->bio_error);
385 }
386}
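
The accounting rule above is easy to model in isolation. A standalone sketch: bytes accumulate, only the first error is latched, and an errored request is delivered as "nothing done":

#include <stdio.h>

struct req {
	long length, completed;
	int error;
};

static void
contribute(struct req *r, long bytes, int error)
{
	if (r->error == 0)
		r->error = error;		/* latch the first error */
	r->completed += bytes;
	if (r->completed == r->length) {
		if (r->error != 0)
			r->completed = 0;	/* report "nothing done" */
		printf("deliver: error %d, completed %ld\n",
		    r->error, r->completed);
	}
}

int
main(void)
{
	struct req r = { 3 * 512, 0, 0 };

	contribute(&r, 512, 0);
	contribute(&r, 512, 5);	/* say one packet failed with EIO (5) */
	contribute(&r, 512, 0);	/* delivery: error 5, completed 0 */
	return (0);
}
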
387
388/*
389 * This is the common case "we're done with this work package" function
390 */
391
392static void
393g_bde_work_done(struct g_bde_work *wp, int error)
394{
395
396 g_bde_contribute(wp->bp, wp->length, error);
397 if (wp->sp != NULL)
398 g_bde_delete_sector(wp->softc, wp->sp);
399 if (wp->ksp != NULL)
400 g_bde_release_keysector(wp);
401 g_bde_delete_work(wp);
402}
403
404/*
405 * A write operation has finished. When we have all expected cows in the
406 * barn close the door and call it a day.
407 */
408
409static void
410g_bde_write_done(struct bio *bp)
411{
412 struct g_bde_sector *sp;
413 struct g_bde_work *wp;
414 struct g_bde_softc *sc;
415
416 sp = bp->bio_caller1;
417 sc = bp->bio_caller2;
418 mtx_lock(&sc->worklist_mutex);
419 KASSERT(sp != NULL, ("NULL sp"));
420 KASSERT(sc != NULL, ("NULL sc"));
421 KASSERT(sp->owner != NULL, ("NULL sp->owner"));
422 g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
423 if (bp->bio_error == 0 && bp->bio_completed != sp->size)
424 bp->bio_error = EIO;
425 sp->error = bp->bio_error;
426 g_destroy_bio(bp);
427 wp = sp->owner;
428 if (wp->error == 0)
429 wp->error = sp->error;
430
431 if (wp->bp->bio_cmd == BIO_DELETE) {
432 KASSERT(sp == wp->sp, ("trashed delete op"));
433 g_bde_work_done(wp, wp->error);
434 mtx_unlock(&sc->worklist_mutex);
435 return;
436 }
437
438 KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
439 KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
440 if (wp->sp == sp) {
441 g_bde_delete_sector(sc, wp->sp);
442 wp->sp = NULL;
443 } else {
444 sp->state = VALID;
445 }
446 if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID)
447 g_bde_work_done(wp, wp->error);
448 mtx_unlock(&sc->worklist_mutex);
449 return;
450}
451
452/*
453 * Send a write request for the given sector down the pipeline.
454 */
455
456static int
457g_bde_start_write(struct g_bde_sector *sp)
458{
459 struct bio *bp;
460 struct g_bde_softc *sc;
461
462 g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
463 sc = sp->softc;
464 KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
465 KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
466 bp = g_new_bio();
467 if (bp == NULL)
468 return (ENOMEM);
469 bp->bio_cmd = BIO_WRITE;
470 bp->bio_offset = sp->offset;
471 bp->bio_data = sp->data;
472 bp->bio_length = sp->size;
473 bp->bio_done = g_bde_write_done;
474 bp->bio_caller1 = sp;
475 bp->bio_caller2 = sc;
476 sp->state = IO;
477 g_io_request(bp, sc->consumer);
478 return(0);
479}
480
481/*
482 * A read operation has finished. Mark the sector no longer iobusy and
483 * wake up the worker thread and let it do its thing.
484 */
485
486static void
487g_bde_read_done(struct bio *bp)
488{
489 struct g_bde_sector *sp;
490 struct g_bde_softc *sc;
491
492 sp = bp->bio_caller1;
493 g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
494 sc = bp->bio_caller2;
495 mtx_lock(&sc->worklist_mutex);
496 if (bp->bio_error == 0 && bp->bio_completed != sp->size)
497 bp->bio_error = EIO;
498 sp->error = bp->bio_error;
499 if (sp->error == 0)
500 sp->state = VALID;
501 else
502 sp->state = JUNK;
503 wakeup(sc);
504 g_destroy_bio(bp);
505 mtx_unlock(&sc->worklist_mutex);
506}
507
508/*
509 * Send a read request for the given sector down the pipeline.
510 */
511
512static int
513g_bde_start_read(struct g_bde_sector *sp)
514{
515 struct bio *bp;
516 struct g_bde_softc *sc;
517
518 g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
519 sc = sp->softc;
520 KASSERT(sc != NULL, ("Null softc in sp %p", sp));
521 bp = g_new_bio();
522 if (bp == NULL)
523 return (ENOMEM);
524 bp->bio_cmd = BIO_READ;
525 bp->bio_offset = sp->offset;
526 bp->bio_data = sp->data;
527 bp->bio_length = sp->size;
528 bp->bio_done = g_bde_read_done;
529 bp->bio_caller1 = sp;
530 bp->bio_caller2 = sc;
531 sp->state = IO;
532 g_io_request(bp, sc->consumer);
533 return(0);
534}
535
536/*
537 * The worker thread.
538 *
539 * The up/down path of GEOM is not allowed to sleep or do any major work
540 * so we use this thread to do the actual crypto operations and to push
541 * the state engine onwards.
542 *
543 * XXX: if we switch to the src/sys/opencrypt hardware assisted encryption
544 * XXX: using a thread here is probably not needed.
545 */
546
547void
548g_bde_worker(void *arg)
549{
550 struct g_bde_softc *sc;
551 struct g_bde_work *wp, *twp;
552 struct g_geom *gp;
553 int restart, error;
554
555 gp = arg;
556 sc = gp->softc;
557
558 mtx_lock(&sc->worklist_mutex);
559 for (;;) {
560 restart = 0;
561 g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
562 TAILQ_FOREACH_SAFE(wp, &sc->worklist, list, twp) {
563 KASSERT(wp != NULL, ("NULL wp"));
564 KASSERT(wp->softc != NULL, ("NULL wp->softc"));
565 if (wp->state != WAIT)
566 continue; /* Not interesting here */
567
568 KASSERT(wp->bp != NULL, ("NULL wp->bp"));
569 KASSERT(wp->sp != NULL, ("NULL wp->sp"));
570
571 if (wp->ksp != NULL) {
572 if (wp->ksp->owner != wp)
573 continue;
574 if (wp->ksp->state == IO)
575 continue;
576 KASSERT(wp->ksp->state == VALID,
577 ("Illegal sector state (%d)",
578 wp->ksp->state));
579 }
580
581 if (wp->bp->bio_cmd == BIO_READ && wp->sp->state == IO)
582 continue;
583
584 if (wp->ksp != NULL && wp->ksp->error != 0) {
585 g_bde_work_done(wp, wp->ksp->error);
586 continue;
587 }
588 switch(wp->bp->bio_cmd) {
589 case BIO_READ:
590 if (wp->ksp == NULL) {
591 KASSERT(wp->error != 0,
592 ("BIO_READ, no ksp and no error"));
593 g_bde_work_done(wp, wp->error);
594 break;
595 }
596 if (wp->sp->error != 0) {
597 g_bde_work_done(wp, wp->sp->error);
598 break;
599 }
600 mtx_unlock(&sc->worklist_mutex);
601 g_bde_crypt_read(wp);
602 mtx_lock(&sc->worklist_mutex);
603 restart++;
604 g_bde_work_done(wp, wp->sp->error);
605 break;
606 case BIO_WRITE:
607 wp->state = FINISH;
608 KASSERT(wp->sp->owner == wp,
609 ("Write not owner sp"));
610 KASSERT(wp->ksp->owner == wp,
611 ("Write not owner ksp"));
612 mtx_unlock(&sc->worklist_mutex);
613 g_bde_crypt_write(wp);
614 mtx_lock(&sc->worklist_mutex);
615 restart++;
616 error = g_bde_start_write(wp->sp);
617 if (error) {
618 g_bde_work_done(wp, error);
619 break;
620 }
621 error = g_bde_start_write(wp->ksp);
622 if (wp->error != 0)
623 wp->error = error;
624 break;
625 case BIO_DELETE:
626 wp->state = FINISH;
627 mtx_unlock(&sc->worklist_mutex);
628 g_bde_crypt_delete(wp);
629 mtx_lock(&sc->worklist_mutex);
630 restart++;
631 g_bde_start_write(wp->sp);
632 break;
633 }
634 if (restart)
635 break;
636 }
637 if (!restart) {
638 /*
639 * We don't look for our death-warrant until we are
640 * idle. Shouldn't make a difference in practice.
641 */
642 if (sc->dead)
643 break;
644 g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
645 error = msleep(sc, &sc->worklist_mutex,
646 PRIBIO, "-", hz);
647 if (error == EWOULDBLOCK) {
648 /*
649 * Loose our skey cache in an orderly fashion.
649 * Lose our skey cache in an orderly fashion.
650 * The exact rate can be tuned to be less
651 * aggressive if this is desirable. 10% per
652 * second means that the cache is gone in a
653 * few minutes.
654 */
655 g_bde_purge_sector(sc, 10);
656 }
657 }
658 }
659 g_trace(G_T_TOPOLOGY, "g_bde_worker die");
660 g_bde_purge_sector(sc, 1);
661 KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
662 KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
663 KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
664 mtx_unlock(&sc->worklist_mutex);
665 sc->dead = 2;
666 wakeup(sc);
667 kthread_exit(0);
668}
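
The worker's sleep/scan discipline can be sketched in userland with pthreads standing in for msleep()/wakeup(); that substitution is an assumption for illustration only, the driver itself uses the worklist mutex and a one-second msleep timeout:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int work_ready, dead;

static void *
worker(void *arg)
{
	struct timespec ts;

	(void)arg;
	pthread_mutex_lock(&mtx);
	while (!dead) {
		if (work_ready) {
			work_ready = 0;	/* crypto and I/O scheduling here */
			continue;
		}
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;		/* like msleep(..., hz) */
		if (pthread_cond_timedwait(&cv, &mtx, &ts) == ETIMEDOUT)
			printf("timeout: purge a slice of the cache\n");
	}
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, worker, NULL);
	pthread_mutex_lock(&mtx);
	work_ready = 1;		/* like wakeup(sc) after queueing work */
	dead = 1;		/* and the death-warrant */
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&mtx);
	pthread_join(td, NULL);
	return (0);
}
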
669
670/*
671 * g_bde_start1 has chopped the incoming request up so all the requests
672 * we see here are inside a single zone. Map the data and key locations
673 * grab the buffers we need and fire off the first volley of read requests.
674 */
675
676static void
677g_bde_start2(struct g_bde_work *wp)
678{
679 struct g_bde_softc *sc;
680
681 KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
682 KASSERT(wp->softc != NULL, ("NULL wp->softc"));
683 g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
684 sc = wp->softc;
685 switch (wp->bp->bio_cmd) {
686 case BIO_READ:
687 wp->sp = g_bde_new_sector(wp, 0);
688 if (wp->sp == NULL) {
689 g_bde_work_done(wp, ENOMEM);
690 return;
691 }
692 wp->sp->size = wp->length;
693 wp->sp->data = wp->data;
694 if (g_bde_start_read(wp->sp) != 0) {
695 g_bde_work_done(wp, ENOMEM);
696 return;
697 }
698 g_bde_read_keysector(sc, wp);
699 if (wp->ksp == NULL)
700 wp->error = ENOMEM;
701 break;
702 case BIO_DELETE:
703 wp->sp = g_bde_new_sector(wp, wp->length);
704 if (wp->sp == NULL) {
705 g_bde_work_done(wp, ENOMEM);
706 return;
707 }
708 break;
709 case BIO_WRITE:
710 wp->sp = g_bde_new_sector(wp, wp->length);
711 if (wp->sp == NULL) {
712 g_bde_work_done(wp, ENOMEM);
713 return;
714 }
715 g_bde_read_keysector(sc, wp);
716 if (wp->ksp == NULL) {
717 g_bde_work_done(wp, ENOMEM);
718 return;
719 }
720 break;
721 default:
722 KASSERT(0 == 1,
723 ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
724 }
725
726 wp->state = WAIT;
727 wakeup(sc);
728}
729
730/*
731 * Create a sequence of work structures, and have g_bde_map_sector() determine
732 * how long they each can be. Feed them to g_bde_start2().
733 */
734
735void
736g_bde_start1(struct bio *bp)
737{
738 struct g_bde_softc *sc;
739 struct g_bde_work *wp;
740 off_t done;
741
742 sc = bp->bio_to->geom->softc;
743 bp->bio_driver1 = sc;
744
745 mtx_lock(&sc->worklist_mutex);
746 for(done = 0; done < bp->bio_length; ) {
747 wp = g_bde_new_work(sc);
748 if (wp != NULL) {
749 wp->bp = bp;
750 wp->offset = bp->bio_offset + done;
751 wp->data = bp->bio_data + done;
752 wp->length = bp->bio_length - done;
753 g_bde_map_sector(wp);
754 done += wp->length;
755 g_bde_start2(wp);
756 }
757 if (wp == NULL || bp->bio_error != 0) {
758 g_bde_contribute(bp, bp->bio_length - done, ENOMEM);
759 break;
760 }
761 }
762 mtx_unlock(&sc->worklist_mutex);
763 return;
764}
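
For a feel of what this loop produces, a standalone model of the chopping; the zone payload size is invented here, whereas the driver derives the real per-zone length in g_bde_map_sector():

#include <stdio.h>

#define ZONE_PAYLOAD 16384	/* hypothetical usable bytes per zone */

int
main(void)
{
	long offset = 4096, length = 40000, done;

	for (done = 0; done < length; ) {
		long off = offset + done;
		long len = length - done;
		long room = ZONE_PAYLOAD - off % ZONE_PAYLOAD;

		if (len > room)		/* clamp to the containing zone */
			len = room;
		printf("work: offset %ld length %ld\n", off, len);
		done += len;
	}
	return (0);
}
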