/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/geom/bde/g_bde_work.c 105520 2002-10-20 14:30:28Z phk $
 *
 * This source file contains the state-engine which makes things happen in the
 * right order.
 *
 * Outline:
 *   1) g_bde_start1()
 *	Break the struct bio into multiple work packets, one per zone.
 *   2) g_bde_start2()
 *	Setup the necessary sector buffers and start those read operations
 *	which we can start at this time and put the item on the work-list.
 *   3) g_bde_worker()
 *	Scan the work-list for items which are ready for crypto processing
 *	and call the matching crypto function in g_bde_crypt.c and schedule
 *	any writes needed.  Read operations finish here by releasing the
 *	sector buffers and delivering the original bio request.
 *   4) g_bde_write_done()
 *	Release sector buffers and deliver the original bio request.
 *
 * Because of the C-scope rules, the functions are almost perfectly in the
 * opposite order in this source file.
 *
 * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add
 * XXX: additional states to this state-engine.  Since no hardware available
 * XXX: at this time has AES support, implementing this has been postponed
 * XXX: until such time as it would result in a benefit.
 */
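
/*
 * Rough sketch of a work item's life, as a reading aid (not authoritative):
 *
 *	SETUP  - just allocated by g_bde_new_work().
 *	WAIT   - g_bde_start2() has issued the necessary reads; the item sits
 *		 on the work-list until its sector buffers become VALID.
 *	FINISH - the worker has run the crypto step and, for BIO_WRITE and
 *		 BIO_DELETE, started the output I/O; g_bde_write_done() then
 *		 retires the item.
 *
 * BIO_READ items never reach FINISH; the worker retires them itself once
 * the payload and skey sectors have been read and decrypted.
 */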

#include <sys/param.h>
#include <sys/stdint.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>

#include <geom/geom.h>
#include <geom/bde/g_bde.h>

static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_new_sector(struct g_bde_work *wp, u_int len);
static void g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp);
static struct g_bde_sector *g_bde_get_sector(struct g_bde_work *wp, off_t offset);
static int g_bde_start_read(struct g_bde_sector *sp);

/*
 * Work item allocation.
 *
 * C++ would call these constructors and destructors.
 */
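
/*
 * Every work item lives on sc->worklist from creation to deletion and the
 * worker thread only ever scans that list.  The g_bde_nwork counter below
 * (exported as debug.gbde_nwork) and the per-instance sc->nwork exist for
 * bookkeeping; the latter is asserted to be zero when the worker dies.
 */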
static u_int g_bde_nwork;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, "");

static struct g_bde_work *
g_bde_new_work(struct g_bde_softc *sc)
{
	struct g_bde_work *wp;

	wp = g_malloc(sizeof *wp, M_NOWAIT | M_ZERO);
	if (wp == NULL)
		return (wp);
	wp->state = SETUP;
	wp->softc = sc;
	g_bde_nwork++;
	sc->nwork++;
	TAILQ_INSERT_TAIL(&sc->worklist, wp, list);
	return (wp);
}

static void
g_bde_delete_work(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	sc = wp->softc;
	g_bde_nwork--;
	sc->nwork--;
	TAILQ_REMOVE(&sc->worklist, wp, list);
	g_free(wp);
}

/*
 * Sector buffer allocation
 *
 * These two functions allocate and free back variable sized sector buffers
 */
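
/*
 * The sp->malloc field doubles as a note on who owns sp->data (deduced from
 * the code below, not a formal contract):
 *	0 - sp->data points at the caller's buffer and must not be freed here
 *	    (see g_bde_start2(), which aliases the bio's data for reads).
 *	1 - sp->data was allocated in g_bde_new_sector() and is freed in
 *	    g_bde_delete_sector().
 *	2 - as 1, but the sector is a cached skey sector kept on the
 *	    freelist (set in g_bde_get_sector()).
 */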

static u_int g_bde_nsect;
SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, "");

static void
g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp)
{

	g_bde_nsect--;
	sc->nsect--;
	if (sp->malloc)
		g_free(sp->data);
	g_free(sp);
}

static struct g_bde_sector *
g_bde_new_sector(struct g_bde_work *wp, u_int len)
{
	struct g_bde_sector *sp;

	sp = g_malloc(sizeof *sp, M_NOWAIT | M_ZERO);
	if (sp == NULL)
		return (sp);
	if (len > 0) {
		sp->data = g_malloc(len, M_NOWAIT | M_ZERO);
		if (sp->data == NULL) {
			g_free(sp);
			return (NULL);
		}
		sp->malloc = 1;
	}
	g_bde_nsect++;
	wp->softc->nsect++;
	sp->size = len;
	sp->softc = wp->softc;
	sp->ref = 1;
	sp->owner = wp;
	sp->offset = wp->so;
	sp->state = JUNK;
	return (sp);
}

/*
 * Skey sector cache.
 *
 * Nothing prevents two separate I/O requests from addressing the same zone
 * and thereby needing the same skey sector.  We therefore need to sequence
 * I/O operations to the skey sectors.  A certain amount of caching is also
 * desirable, although the extent of benefit from this is not at this point
 * determined.
 *
 * XXX: GEOM may be able to grow a generic caching facility at some point
 * XXX: to support such needs.
 */
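
/*
 * Implementation note: sc->freelist is the cache.  Every cached skey sector
 * stays on it whether referenced or not; sp->ref counts the work items
 * currently using it and sp->owner is the single item allowed to issue I/O
 * on it.  g_bde_purge_sector() only evicts sectors with ref == 0, so a
 * sector is never pulled out from under a work item that still needs it.
 */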

static u_int g_bde_ncache;
SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, "");

static struct g_bde_sector *
g_bde_get_sector(struct g_bde_work *wp, off_t offset)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_get_sector(%p, %jd)", wp, (intmax_t)offset);
	sc = wp->softc;
	TAILQ_FOREACH(sp, &sc->freelist, list) {
		if (sp->offset == offset)
			break;
	}
	if (sp != NULL) {
		sp->ref++;
		KASSERT(sp->offset == offset, ("wrong offset"));
		KASSERT(sp->softc == wp->softc, ("wrong softc"));
		if (sp->ref == 1)
			sp->owner = wp;
	} else {
		if (!TAILQ_EMPTY(&sc->freelist))
			sp = TAILQ_FIRST(&sc->freelist);
		if (sp != NULL && sp->ref > 0)
			sp = NULL;
		if (sp == NULL) {
			g_bde_ncache++;
			sc->ncache++;
			sp = g_bde_new_sector(wp, sc->sectorsize);
			if (sp != NULL) {
				TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
				sp->malloc = 2;
			}
		}
		if (sp != NULL) {
			sp->offset = offset;
			sp->softc = wp->softc;
			sp->ref = 1;
			sp->owner = wp;
			sp->state = JUNK;
			sp->error = 0;
		}
	}
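	/*
	 * Refresh the sector's position on the freelist.  The list order
	 * only influences which ref == 0 sector the reuse path above and
	 * g_bde_purge_sector() pick first; a referenced sector is never
	 * evicted regardless of where it sits.
	 */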
	if (sp != NULL) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
	}
	wp->ksp = sp;
	KASSERT(sp != NULL, ("get_sector failed"));
	return (sp);
}

static void
g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp2;

	g_trace(G_T_TOPOLOGY, "g_bde_release_sector(%p)", sp);
	KASSERT(sp->malloc == 2, ("Wrong sector released"));
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sp->softc"));
	KASSERT(wp == sp->owner, ("Releasing, not owner"));
	sp->owner = NULL;
	wp->ksp = NULL;
	sp->ref--;
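	/*
	 * If other work items still reference this skey sector, hand
	 * ownership to one of them and wake the worker so it can proceed;
	 * ownership is what serializes I/O on a cached sector.
	 */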
	if (sp->ref > 0) {
		TAILQ_REMOVE(&sc->freelist, sp, list);
		TAILQ_INSERT_TAIL(&sc->freelist, sp, list);
		TAILQ_FOREACH(wp2, &sc->worklist, list) {
			if (wp2->ksp == sp) {
				KASSERT(wp2 != wp, ("Self-reowning"));
				sp->owner = wp2;
				wakeup(sp->softc);
				break;
			}
		}
		KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp));
	} else if (sp->error != 0) {
		sp->offset = ~0;
		sp->error = 0;
		sp->state = JUNK;
	}
	TAILQ_REMOVE(&sc->freelist, sp, list);
	TAILQ_INSERT_HEAD(&sc->freelist, sp, list);
}

static void
g_bde_purge_sector(struct g_bde_softc *sc, int fraction)
{
	struct g_bde_sector *sp;
	int n;

	g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc);
	n = sc->ncache / fraction + 1;
	while (n--) {
		TAILQ_FOREACH(sp, &sc->freelist, list) {
			if (sp->ref != 0)
				continue;
			TAILQ_REMOVE(&sc->freelist, sp, list);
			g_bde_ncache--;
			sc->ncache--;
			bzero(sp->data, sp->size);
			g_bde_delete_sector(sc, sp);
			break;
		}
	}
}

static struct g_bde_sector *
g_bde_read_sector(struct g_bde_softc *sc, struct g_bde_work *wp, off_t offset)
{
	struct g_bde_sector *sp;

	g_trace(G_T_TOPOLOGY, "g_bde_read_sector(%p)", wp);
	sp = g_bde_get_sector(wp, offset);
	if (sp == NULL)
		return (sp);
	if (sp->owner != wp)
		return (sp);
	if (sp->state == VALID)
		return (sp);
	if (g_bde_start_read(sp) == 0)
		return (sp);
	g_bde_release_sector(wp, sp);
	return (NULL);
}

/*
 * Contribute to the completion of the original bio request.
 *
 * We have no simple way to tell how many pieces the original bio request
 * has been segmented into, so the easiest way to determine when we can
 * deliver it is to keep track of the number of bytes we have completed.
 * We keep track of any errors underway and latch onto the first one.
 *
 * We always report "nothing done" in case of error, because random bits here
 * and there may be completed and returning a number of completed bytes does
 * not convey any useful information about which bytes they were.  If some
 * piece of broken code somewhere interprets this to mean that nothing has
 * changed on the underlying media they deserve the lossage headed for them.
 *
 * A single mutex per g_bde instance is used to prevent contention.
 */
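
/*
 * Example with made-up sizes: a 24k request split into three 8k work items
 * calls g_bde_contribute() three times with bytes = 8k.  On the third call
 * bio_completed reaches bio_length and the bio is delivered; if any of the
 * contributions carried an error, the first error sticks and bio_completed
 * is zeroed just before delivery.
 */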

static void
g_bde_contribute(struct bio *bp, off_t bytes, int error)
{
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d",
	     bp, (intmax_t)bytes, error);
	sc = bp->bio_driver1;
	if (bp->bio_error == 0)
		bp->bio_error = error;
	bp->bio_completed += bytes;
	KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution"));
	if (bp->bio_completed == bp->bio_length) {
		if (bp->bio_error != 0)
			bp->bio_completed = 0;
		g_io_deliver(bp, bp->bio_error);
	}
}

/*
 * A write operation has finished.  When we have all the expected cows in
 * the barn, close the door and call it a day.
 */
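
/*
 * For a BIO_WRITE two bios are in flight, one for the data sector (wp->sp)
 * and one for the skey sector (wp->ksp), and the work item is only retired
 * below once both have completed: the data sector is freed and the skey
 * sector goes back to VALID state.  A BIO_DELETE writes only the data
 * sector.
 */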

static void
g_bde_write_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_work *wp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	KASSERT(sp != NULL, ("NULL sp"));
	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner"));
	g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp);
	sp->error = bp->bio_error;
	g_destroy_bio(bp);
	wp = sp->owner;
	if (wp->error == 0)
		wp->error = sp->error;

	if (wp->bp->bio_cmd == BIO_DELETE) {
		KASSERT(sp == wp->sp, ("trashed delete op"));
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_delete_sector(sc, sp);
		g_bde_delete_work(wp);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()"));
	KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op"));
	if (wp->sp == sp) {
		g_bde_delete_sector(sc, wp->sp);
		wp->sp = NULL;
	} else {
		sp->state = VALID;
	}
	if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) {
		g_bde_contribute(wp->bp, wp->length, wp->error);
		g_bde_release_sector(wp, wp->ksp);
		g_bde_delete_work(wp);
	}
	mtx_unlock(&sc->worklist_mutex);
	return;
}

/*
 * Send a write request for the given sector down the pipeline.
 */

static int
g_bde_start_write(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("NULL sc in g_bde_start_write"));
	KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write"));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_WRITE;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_write_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * A read operation has finished.  Mark the sector no longer iobusy and
 * wake up the worker thread and let it do its thing.
 */

static void
g_bde_read_done(struct bio *bp)
{
	struct g_bde_sector *sp;
	struct g_bde_softc *sc;

	sp = bp->bio_caller1;
	g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp);
	sc = bp->bio_caller2;
	mtx_lock(&sc->worklist_mutex);
	sp->error = bp->bio_error;
	sp->state = VALID;
	wakeup(sc);
	g_destroy_bio(bp);
	mtx_unlock(&sc->worklist_mutex);
}

/*
 * Send a read request for the given sector down the pipeline.
 */

static int
g_bde_start_read(struct g_bde_sector *sp)
{
	struct bio *bp;
	struct g_bde_softc *sc;

	g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp);
	sc = sp->softc;
	KASSERT(sc != NULL, ("Null softc in sp %p", sp));
	bp = g_new_bio();
	if (bp == NULL)
		return (ENOMEM);
	bp->bio_cmd = BIO_READ;
	bp->bio_offset = sp->offset;
	bp->bio_data = sp->data;
	bp->bio_length = sp->size;
	bp->bio_done = g_bde_read_done;
	bp->bio_caller1 = sp;
	bp->bio_caller2 = sc;
	sp->state = IO;
	g_io_request(bp, sc->consumer);
	return (0);
}

/*
 * The worker thread.
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypto hardware assisted encryption
 * XXX: using a thread here is probably not needed.
 */
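
/*
 * Loop structure, in brief: each pass scans the work-list from the head for
 * one WAIT item whose sector buffers are ready, processes it and rescans.
 * Only when a whole scan makes no progress does the thread sleep; the one
 * second msleep() timeout doubles as the trigger for trimming the skey
 * cache with g_bde_purge_sector().
 */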

void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	struct g_geom *gp;
	int busy, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		busy = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH(wp, &sc->worklist, list) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;		/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (JUNK ?)"));
			}

			if (wp->bp->bio_cmd == BIO_READ && wp->sp->state != VALID)
				continue;

			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_contribute(wp->bp, wp->length,
				    wp->ksp->error);
				g_bde_delete_sector(sc, wp->sp);
				g_bde_release_sector(wp, wp->ksp);
				g_bde_delete_work(wp);
				busy++;
				break;
			}
			switch (wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp != NULL && wp->sp->error == 0) {
					mtx_unlock(&sc->worklist_mutex);
					g_bde_crypt_read(wp);
					mtx_lock(&sc->worklist_mutex);
				}
				g_bde_contribute(wp->bp, wp->length,
				    wp->sp->error);
				g_bde_delete_sector(sc, wp->sp);
				if (wp->ksp != NULL)
					g_bde_release_sector(wp, wp->ksp);
				g_bde_delete_work(wp);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp, ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp, ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				g_bde_start_write(wp->ksp);
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				break;
			}
			busy++;
			break;
		}
		if (!busy) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle.  Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "g_bde", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable.  10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	sc->dead = 2;
	wakeup(sc);
	mtx_lock(&Giant);
	kthread_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone.  Map the data and key locations,
 * grab the buffers we need and fire off the first volley of read requests.
 */
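
/*
 * As used below (an interpretation of the field names, not a formal spec):
 * g_bde_map_sector() translates the request offset into wp->so, the on-disk
 * offset of the payload sectors; wp->kso, the offset of the skey sector
 * holding this zone's sector keys; and wp->ko, the key's position inside
 * that sector.  Reads need both the payload and the skey sector, writes
 * need the skey sector for a read-modify-write, and deletes only overwrite
 * the payload.
 */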

static void
g_bde_start2(struct g_bde_work *wp)
{
	struct g_bde_softc *sc;

	KASSERT(wp != NULL, ("NULL wp in g_bde_start2"));
	g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp);
	sc = wp->softc;
	KASSERT(wp->softc != NULL, ("NULL wp->softc"));
	g_bde_map_sector(&sc->key, wp->offset, &wp->so, &wp->kso, &wp->ko);
	if (wp->bp->bio_cmd == BIO_READ) {
		wp->sp = g_bde_new_sector(wp, 0);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		wp->sp->size = wp->length;
		wp->sp->data = wp->data;
		if (g_bde_start_read(wp->sp) != 0) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_sector(sc, wp, wp->kso);
		if (wp->ksp == NULL)
			wp->error = ENOMEM;
	} else if (wp->bp->bio_cmd == BIO_DELETE) {
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
	} else if (wp->bp->bio_cmd == BIO_WRITE) {
		wp->sp = g_bde_new_sector(wp, wp->length);
		if (wp->sp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_work(wp);
			return;
		}
		g_bde_read_sector(sc, wp, wp->kso);
		if (wp->ksp == NULL) {
			g_bde_contribute(wp->bp, wp->length, ENOMEM);
			g_bde_delete_sector(sc, wp->sp);
			g_bde_delete_work(wp);
			return;
		}
	} else {
		KASSERT(0 == 1,
		    ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd));
	}

	wp->state = WAIT;
	wakeup(sc);
}

/*
 * Split the incoming bio on zone boundaries and submit the resulting
 * work structures to g_bde_start2().
 */
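
/*
 * Worked example with made-up numbers: with a zone_cont of 64k, a 100k
 * write starting at offset 60k becomes three work items covering 60k-64k,
 * 64k-128k and 128k-160k (lengths 4k, 64k and 32k).  Each is handed to
 * g_bde_start2() in turn while the original bio waits for all of the
 * fragments to contribute.
 */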

void
g_bde_start1(struct bio *bp)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	off_t zone_start, left;
	caddr_t p;

	sc = bp->bio_to->geom->softc;
	bp->bio_driver1 = sc;

	mtx_lock(&sc->worklist_mutex);
	zone_start = bp->bio_offset - bp->bio_offset % sc->zone_cont;
	wp = g_bde_new_work(sc);
	if (wp == NULL) {
		g_io_deliver(bp, ENOMEM);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}
	left = bp->bio_length;
	p = bp->bio_data;

	/* Do the first and possibly only fragment */
	wp->bp = bp;
	wp->offset = bp->bio_offset;
	wp->data = p;
	wp->length = zone_start + sc->zone_cont - wp->offset;
	if (wp->length >= left) {
		/* Only this one fragment needed */
		wp->length = left;
		g_bde_start2(wp);
		mtx_unlock(&sc->worklist_mutex);
		return;
	}

	/* Submit the first fragment */
	g_bde_start2(wp);
	left -= wp->length;
	p += wp->length;

	/* Do the subsequent fragments */
	for (; left > 0;) {
		wp = g_bde_new_work(sc);
		if (wp == NULL) {
			g_bde_contribute(bp, left, ENOMEM);
			mtx_unlock(&sc->worklist_mutex);
			return;
		}
		zone_start += sc->zone_cont;
		wp->bp = bp;
		wp->offset = zone_start;
		wp->data = p;
		if (left > sc->zone_cont)
			wp->length = sc->zone_cont;
		else
			wp->length = left;
		left -= wp->length;
		p += wp->length;
		g_bde_start2(wp);
	}
	mtx_unlock(&sc->worklist_mutex);
}