/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>

extern int		maxphys;
extern uint_t		bypass_snapshot_throttle_key;

extern struct kmem_cache	*lufs_sv;
extern struct kmem_cache	*lufs_bp;

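/*
 * makebusy() acquires the buffer's semaphore and, if the previous IO
 * on this buffer failed, marks the whole log unit as being in error.
 */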
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
	sema_p(&bp->b_sem);
	if ((bp->b_flags & B_ERROR) == 0)
		return;
	if (bp->b_flags & B_READ)
		ldl_seterror(ul, "Error reading ufs log");
	else
		ldl_seterror(ul, "Error writing ufs log");
}

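/*
 * logdone() is the iodone routine for log bufs: mark the buf done and
 * wake whoever is waiting on it (the writer path waits on b_sem, the
 * reader path waits on b_io).
 */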
static int
logdone(buf_t *bp)
{
	bp->b_flags |= B_DONE;

	if (bp->b_flags & B_WRITE)
		sema_v(&bp->b_sem);
	else
		/* wakeup the thread waiting on this buf */
		sema_v(&bp->b_io);
	return (0);
}

static int
ldl_strategy_done(buf_t *cb)
{
	lufs_save_t	*sv;
	lufs_buf_t	*lbp;
	buf_t		*bp;

	ASSERT(SEMA_HELD(&cb->b_sem));
	ASSERT((cb->b_flags & B_DONE) == 0);

	/*
	 * Compute address of the ``save'' struct
	 */
	lbp = (lufs_buf_t *)cb;
	sv = (lufs_save_t *)lbp->lb_ptr;

	if (cb->b_flags & B_ERROR)
		sv->sv_error = 1;

	/*
	 * If this is not the last outstanding request, just free the
	 * cloned buffer and return.  Otherwise, release the resources
	 * and ``done'' the original buffer header.
	 */
	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
		kmem_cache_free(lufs_bp, lbp);
		return (1);
	}
	/* Propagate any errors back to the original buffer header */
	bp = sv->sv_bp;
	if (sv->sv_error)
		bp->b_flags |= B_ERROR;
	kmem_cache_free(lufs_bp, lbp);
	kmem_cache_free(lufs_sv, sv);

	biodone(bp);
	return (0);
}

/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
	ml_unit_t	*ul,
	daddr_t		lblkno,
	size_t		bcount,
	daddr_t		*pblkno,
	size_t		*pbcount)
{
	ic_extent_t	*ext = ul->un_ebp->ic_extents;
	uint32_t	e = ul->un_ebp->ic_nextents;
	uint32_t	s = 0;
	uint32_t	i = e >> 1;
	uint32_t	lasti = i;
	uint32_t	bno_off;

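	/*
	 * Binary search the extent array for the extent that contains
	 * lblkno; return ENOENT if no extent covers the block.
	 */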
again:
	if (ext[i].ic_lbno <= lblkno) {
		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
			/* FOUND IT */
			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
			*pblkno = ext[i].ic_pbno + bno_off;
			return (0);
		} else
			s = i;
	} else
		e = i;
	i = s + ((e - s) >> 1);

	if (i == lasti) {
		*pbcount = bcount;
		return (ENOENT);
	}
	lasti = i;

	goto again;
}

/*
 * The log is a set of extents (which typically will be only one, but
 * may be more if the disk was close to full when the log was created)
 * and hence the logical offsets into the log
 * have to be translated into their real device locations before
 * calling the device's strategy routine. The translation may result
 * in several IO requests if this request spans extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
	lufs_save_t	*sv;
	lufs_buf_t	*lbp;
	buf_t		*cb;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	daddr_t		lblkno, pblkno;
	size_t		nb_left, pbcount;
	off_t		offset;
	dev_t		dev	= ul->un_dev;
	int		error;
	int		read = pb->b_flags & B_READ;

	/*
	 * Allocate and initialise the save structure.
	 */
	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
	sv->sv_error = 0;
	sv->sv_bp = pb;
	nb_left = pb->b_bcount;
	sv->sv_nb_left = nb_left;

	lblkno = pb->b_blkno;
	offset = 0;

	do {
		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
		bioinit(&lbp->lb_buf);
		lbp->lb_ptr = sv;

		cb = bioclone(pb, offset, pbcount, dev,
		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

		offset += pbcount;
		lblkno += btodb(pbcount);
		nb_left -= pbcount;

		if (error) {
			cb->b_flags |= B_ERROR;
			cb->b_resid = cb->b_bcount;
			biodone(cb);
		} else {
			if (read) {
				logstats.ls_ldlreads.value.ui64++;
				ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
				lwp_stat_update(LWP_STAT_INBLK, 1);
			} else {
				logstats.ls_ldlwrites.value.ui64++;
				lwp_stat_update(LWP_STAT_OUBLK, 1);
			}

			/*
			 * write through the snapshot driver if necessary
			 * We do not want this write to be throttled because
			 * we are holding the un_log mutex here. If we
			 * are throttled in fssnap_translate, the fssnap_taskq
			 * thread which can wake us up can get blocked on
			 * the un_log mutex resulting in a deadlock.
			 */
			if (ufsvfsp->vfs_snapshot) {
				(void) tsd_set(bypass_snapshot_throttle_key,
				    (void *)1);
				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);

				(void) tsd_set(bypass_snapshot_throttle_key,
				    (void *)0);
			} else {
				(void) bdev_strategy(cb);
			}
		}

	} while (nb_left);
}

static void
writelog(ml_unit_t *ul, buf_t *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * This is really a B_ASYNC write but we want Presto to
	 * cache this write.  The iodone routine, logdone, processes
	 * the buf correctly.
	 */
	bp->b_flags = B_WRITE;
	bp->b_edev = ul->un_dev;
	bp->b_iodone = logdone;

	/*
	 * return EIO for every IO if in hard error state
	 */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}

	ldl_strategy(ul, bp);
}

static void
readlog(ml_unit_t *ul, buf_t *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	ASSERT(bp->b_bcount);

	bp->b_flags = B_READ;
	bp->b_edev = ul->un_dev;
	bp->b_iodone = logdone;

	/* all IO returns errors when in error state */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		(void) trans_wait(bp);
		return;
	}

	ldl_strategy(ul, bp);

	if (trans_wait(bp))
		ldl_seterror(ul, "Error reading ufs log");
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
	buf_t		*newbp;
	cirbuf_t	*cb		= &ul->un_wrbuf;

	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

	/*
	 * async write the buf
	 */
	writelog(ul, bp);

	/*
	 * no longer filling any buf
	 */
	cb->cb_dirty = NULL;

	/*
	 * no extra buffer space; all done
	 */
	if (bp->b_bcount == bp->b_bufsize)
		return;

	/*
	 * give extra buffer space to a new bp
	 * 	try to take buf off of free list
	 */
	if ((newbp = cb->cb_free) != NULL) {
		cb->cb_free = newbp->b_forw;
	} else {
		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	}
	newbp->b_flags = 0;
	newbp->b_bcount = 0;
	newbp->b_file = NULL;
	newbp->b_offset = -1;
	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
	bp->b_bufsize = bp->b_bcount;

	/*
	 * lock out readers and put new buf at LRU position
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	newbp->b_forw = bp->b_forw;
	newbp->b_back = bp;
	bp->b_forw->b_back = newbp;
	bp->b_forw = newbp;
	rw_exit(&cb->cb_rwlock);
}

static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
	buf_t		*bp;
	off_t		elof	= lof + nb;
	off_t		buflof;
	off_t		bufelof;

	/*
	 * discard all bufs that overlap the range (lof, lof + nb)
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp;
	do {
		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
			bp = bp->b_forw;
			continue;
		}
		buflof = dbtob(bp->b_blkno);
		bufelof = buflof + bp->b_bcount;
		if ((buflof < lof && bufelof <= lof) ||
		    (buflof >= elof && bufelof > elof)) {
			bp = bp->b_forw;
			continue;
		}
		makebusy(ul, bp);
		bp->b_flags = 0;
		bp->b_bcount = 0;
		sema_v(&bp->b_sem);
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
	cirbuf_t	*cb = &ul->un_wrbuf;
	buf_t		*bp;

	/*
	 * cb_dirty is the buffer we are currently filling; if any
	 */
	if ((bp = cb->cb_dirty) != NULL) {
		makebusy(ul, bp);
		return (bp);
	}
	/*
	 * discard any bp that overlaps the current tail since we are
	 * about to overwrite it.
	 */
	inval_range(ul, cb, ul->un_tail_lof, 1);

	/*
	 * steal LRU buf
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp->b_forw;
	makebusy(ul, bp);

	cb->cb_dirty = bp;
	cb->cb_bp = bp;

	bp->b_flags = 0;
	bp->b_bcount = 0;
	bp->b_blkno = btodb(ul->un_tail_lof);
	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
	rw_exit(&cb->cb_rwlock);

	/*
	 * NOTE:
	 *	1. un_tail_lof never addresses >= un_eol_lof
	 *	2. b_blkno + btodb(b_bufsize) may > un_eol_lof
	 *		this case is handled in storebuf
	 */
	return (bp);
}

void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
	int	i;
	buf_t	*bp;

	/*
	 * Clear previous allocation
	 */
	if (cb->cb_nb)
		free_cirbuf(cb);

	bzero(cb, sizeof (*cb));
	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

	rw_enter(&cb->cb_rwlock, RW_WRITER);

	/*
	 * preallocate 3 bp's and put them on the free list.
	 */
	for (i = 0; i < 3; ++i) {
		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
		bp->b_offset = -1;
		bp->b_forw = cb->cb_free;
		cb->cb_free = bp;
	}

	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
	cb->cb_nb = bufsize;

	/*
	 * first bp claims entire write buffer
	 */
	bp = cb->cb_free;
	cb->cb_free = bp->b_forw;

	bp->b_forw = bp;
	bp->b_back = bp;
	cb->cb_bp = bp;
	bp->b_un.b_addr = cb->cb_va;
	bp->b_bufsize = cb->cb_nb;

	rw_exit(&cb->cb_rwlock);
}

void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
	caddr_t	va;
	size_t	nb;
	buf_t	*bp;

	/*
	 * Clear previous allocation
	 */
	if (cb->cb_nb)
		free_cirbuf(cb);

	bzero(cb, sizeof (*cb));
	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

	rw_enter(&cb->cb_rwlock, RW_WRITER);

	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
	cb->cb_nb = bufsize;

	/*
	 * preallocate N bufs that are hard-sized to blksize
	 *	in other words, the read buffer pool is a linked list
	 *	of statically sized bufs.
	 */
	va = cb->cb_va;
	while ((nb = bufsize) != 0) {
		if (nb > blksize)
			nb = blksize;
		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
		bzero(bp, sizeof (buf_t));
		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
		bp->b_un.b_addr = va;
		bp->b_bufsize = nb;
		if (cb->cb_bp) {
			bp->b_forw = cb->cb_bp->b_forw;
			bp->b_back = cb->cb_bp;
			cb->cb_bp->b_forw->b_back = bp;
			cb->cb_bp->b_forw = bp;
		} else
			bp->b_forw = bp->b_back = bp;
		cb->cb_bp = bp;
		bufsize -= nb;
		va += nb;
	}

	rw_exit(&cb->cb_rwlock);
}

void
free_cirbuf(cirbuf_t *cb)
{
	buf_t	*bp;

	if (cb->cb_nb == 0)
		return;

	rw_enter(&cb->cb_rwlock, RW_WRITER);
	ASSERT(cb->cb_dirty == NULL);

	/*
	 * free the active bufs
	 */
	while ((bp = cb->cb_bp) != NULL) {
		if (bp == bp->b_forw)
			cb->cb_bp = NULL;
		else
			cb->cb_bp = bp->b_forw;
		bp->b_back->b_forw = bp->b_forw;
		bp->b_forw->b_back = bp->b_back;
		sema_destroy(&bp->b_sem);
		sema_destroy(&bp->b_io);
		kmem_free(bp, sizeof (buf_t));
	}

	/*
	 * free the free bufs
	 */
	while ((bp = cb->cb_free) != NULL) {
		cb->cb_free = bp->b_forw;
		sema_destroy(&bp->b_sem);
		sema_destroy(&bp->b_io);
		kmem_free(bp, sizeof (buf_t));
	}
	kmem_free(cb->cb_va, cb->cb_nb);
	cb->cb_va = NULL;
	cb->cb_nb = 0;
	rw_exit(&cb->cb_rwlock);
	rw_destroy(&cb->cb_rwlock);
}

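/*
 * within_range() returns true when log offset lof falls inside the
 * byte range covered by a buf that starts at disk block blkno and
 * holds bcount bytes.
 */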
static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
	off_t	blof	= dbtob(blkno);

	return ((lof >= blof) && (lof < (blof + bcount)));
}

static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
	buf_t *bp;

	/*
	 * find a buf that contains the offset lof
	 */
	rw_enter(&cb->cb_rwlock, RW_READER);
	bp = cb->cb_bp;
	do {
		if (bp->b_bcount &&
		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
			makebusy(ul, bp);
			rw_exit(&cb->cb_rwlock);
			return (bp);
		}
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);

	return (NULL);
}

static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
	buf_t	*bp, *bpend;
	off_t	rlof;

	/*
	 * we mustn't:
	 *	o read past eol
	 *	o read past the tail
	 *	o read data that may be being written.
	 */
	rw_enter(&cb->cb_rwlock, RW_READER);
	bpend = bp = cb->cb_bp->b_forw;
	rlof = ul->un_tail_lof;
	do {
		if (bp->b_bcount) {
			rlof = dbtob(bp->b_blkno);
			break;
		}
		bp = bp->b_forw;
	} while (bp != bpend);
	rw_exit(&cb->cb_rwlock);

	if (lof <= rlof)
		/* lof is prior to the range represented by the write buf */
		return (rlof);
	else
		/* lof follows the range represented by the write buf */
		return ((off_t)ul->un_eol_lof);
}

static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
	cirbuf_t	*cb;
	buf_t		*bp;
	off_t		rlof;

	/*
	 * retrieve as much data as possible from the incore buffers
	 */
	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
		logstats.ls_lreadsinmem.value.ui64++;
		return (bp);
	}
	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
		logstats.ls_lreadsinmem.value.ui64++;
		return (bp);
	}

	/*
	 * steal the LRU buf
	 */
	cb = &ul->un_rdbuf;
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp->b_forw;
	makebusy(ul, bp);
	bp->b_flags = 0;
	bp->b_bcount = 0;
	cb->cb_bp = bp;
	rw_exit(&cb->cb_rwlock);

	/*
	 * don't read past the tail or the end-of-log
	 */
	bp->b_blkno = btodb(lof);
	lof = dbtob(bp->b_blkno);
	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
	readlog(ul, bp);
	return (bp);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
	buf_t	*bpforw	= bp->b_forw;

	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

	/*
	 * there is no `next' bp; do nothing
	 */
	if (bpforw == bp)
		return (0);

	/*
	 * buffer space is not adjacent; do nothing
	 */
	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
		return (0);

	/*
	 * locking protocol requires giving up any bp locks before
	 * acquiring cb_rwlock.  This is okay because we hold
	 * un_log_mutex.
	 */
	sema_v(&bp->b_sem);

	/*
	 * lock out ldl_read
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);

	/*
	 * wait for current IO to finish w/next bp; if necessary
	 */
	makebusy(ul, bpforw);

	/*
	 * free the next bp and steal its space
	 */
	bp->b_forw = bpforw->b_forw;
	bpforw->b_forw->b_back = bp;
	bp->b_bufsize += bpforw->b_bufsize;
	sema_v(&bpforw->b_sem);
	bpforw->b_forw = cb->cb_free;
	cb->cb_free = bpforw;
	makebusy(ul, bp);
	rw_exit(&cb->cb_rwlock);

	return (1);
}

static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
	size_t		copy_nb;
	size_t		nb_in_sec;
	sect_trailer_t	*st;
	size_t		nb_left = nb;
	cirbuf_t	*cb	= &ul->un_wrbuf;

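	/*
	 * Copy into the current sector; when a sector fills up, append
	 * its trailer and handle log wrap or an exhausted buffer.
	 */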
again:
	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
	copy_nb = MIN(nb_left, nb_in_sec);

	ASSERT(copy_nb);

	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
	bp->b_bcount += copy_nb;
	va += copy_nb;
	nb_left -= copy_nb;
	ul->un_tail_lof += copy_nb;

	if ((nb_in_sec -= copy_nb) == 0) {
		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

		st->st_tid = ul->un_logmap->mtm_tid;
		st->st_ident = ul->un_tail_ident++;
		bp->b_bcount += sizeof (sect_trailer_t);
		ul->un_tail_lof += sizeof (sect_trailer_t);
		/*
		 * log wrapped; async write this bp
		 */
		if (ul->un_tail_lof == ul->un_eol_lof) {
			ul->un_tail_lof = ul->un_bol_lof;
			push_dirty_bp(ul, bp);
			return (nb - nb_left);
		}
		/*
		 * out of bp space; get more or async write buf
		 */
		if (bp->b_bcount == bp->b_bufsize) {
			if (!extend_write_bp(ul, cb, bp)) {
				push_dirty_bp(ul, bp);
				return (nb - nb_left);
			}
		}
	}
	if (nb_left)
		goto again;

	sema_v(&bp->b_sem);
	return (nb);
}

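/*
 * fetchzeroes() zeroes the part of the caller's buffer covered by an
 * allocated-but-zero (DT_ABZERO) map entry; such deltas carry no data
 * in the log.
 */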
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
	offset_t	src_mof	= me->me_mof;
	size_t		src_nb	= me->me_nb;

	if (src_mof > dst_mof) {
		ASSERT(src_mof < (dst_mof + dst_nb));
		dst_va += (src_mof - dst_mof);
		dst_nb -= (src_mof - dst_mof);
	} else {
		ASSERT(dst_mof < (src_mof + src_nb));
		src_nb -= (dst_mof - src_mof);
	}

	src_nb = MIN(src_nb, dst_nb);
	ASSERT(src_nb);
	bzero(dst_va, src_nb);
}

/*
 * dst_va == NULL means don't copy anything
 */
static ulong_t
fetchbuf(
	ml_unit_t *ul,
	buf_t *bp,
	caddr_t dst_va,
	size_t dst_nb,
	off_t *dst_lofp)
{
	caddr_t	copy_va;
	size_t	copy_nb;
	size_t	nb_sec;
	off_t	dst_lof		= *dst_lofp;
	ulong_t	sav_dst_nb	= dst_nb;
	ulong_t	src_nb		= bp->b_bcount;
	off_t	src_lof		= dbtob(bp->b_blkno);
	off_t	src_elof	= src_lof + src_nb;
	caddr_t	src_va		= bp->b_un.b_addr;

	/*
	 * copy from bp to dst_va
	 */
	while (dst_nb) {
		/*
		 * compute address within bp
		 */
		copy_va = src_va + (dst_lof - src_lof);

		/*
		 * adjust copy size to amount of data in bp
		 */
		copy_nb = MIN(dst_nb, src_elof - dst_lof);

		/*
		 * adjust copy size to amount of data in sector
		 */
		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
		copy_nb = MIN(copy_nb, nb_sec);

		/*
		 * dst_va == NULL means don't do copy (see logseek())
		 */
		if (dst_va) {
			bcopy(copy_va, dst_va, copy_nb);
			dst_va += copy_nb;
		}
		dst_lof += copy_nb;
		dst_nb -= copy_nb;
		nb_sec -= copy_nb;

		/*
		 * advance over sector trailer
		 */
		if (nb_sec == 0)
			dst_lof += sizeof (sect_trailer_t);

		/*
		 * exhausted buffer
		 *	return current lof for next read
		 */
		if (dst_lof == src_elof) {
			sema_v(&bp->b_sem);
			if (dst_lof == ul->un_eol_lof)
				dst_lof = ul->un_bol_lof;
			*dst_lofp = dst_lof;
			return (sav_dst_nb - dst_nb);
		}
	}

	/*
	 * copy complete - return current lof
	 */
	sema_v(&bp->b_sem);
	*dst_lofp = dst_lof;
	return (sav_dst_nb);
}

void
ldl_round_commit(ml_unit_t *ul)
{
	int		wrapped;
	buf_t		*bp;
	sect_trailer_t	*st;
	size_t		bcount;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	/*
	 * if nothing to write; then do nothing
	 */
	if ((bp = cb->cb_dirty) == NULL)
		return;
	makebusy(ul, bp);

	/*
	 * round up to sector boundary and set new tail
	 *	don't readjust st_ident if buf is already rounded
	 */
	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
	if (bcount == bp->b_bcount) {
		sema_v(&bp->b_sem);
		return;
	}
	bp->b_bcount = bcount;
	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
	wrapped = 0;
	if (ul->un_tail_lof == ul->un_eol_lof) {
		ul->un_tail_lof = ul->un_bol_lof;
		++wrapped;
	}
	ASSERT(ul->un_tail_lof != ul->un_head_lof);

	/*
	 * fix up the sector trailer
	 */
	/* LINTED */
	st = (sect_trailer_t *)
	    ((bp->b_un.b_addr + bcount) - sizeof (*st));
	st->st_tid = ul->un_logmap->mtm_tid;
	st->st_ident = ul->un_tail_ident++;

	/*
	 * if tail wrapped or we have exhausted this buffer
	 *	async write the buffer
	 */
	if (wrapped || bcount == bp->b_bufsize)
		push_dirty_bp(ul, bp);
	else
		sema_v(&bp->b_sem);
}

void
ldl_push_commit(ml_unit_t *ul)
{
	buf_t		*bp;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	/*
	 * if nothing to write; then do nothing
	 */
	if ((bp = cb->cb_dirty) == NULL)
		return;
	makebusy(ul, bp);
	push_dirty_bp(ul, bp);
}

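/*
 * A commit is needed once more than 75% of the maximum log
 * reservation is in use.
 */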
int
ldl_need_commit(ml_unit_t *ul)
{
	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
}

int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
	off_t	nfb;
	off_t	nb;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * Add up the size used by the deltas
	 * round nb up to a sector length plus an extra sector
	 *	w/o the extra sector we couldn't distinguish
	 *	a full log (head == tail) from an empty log (head == tail)
	 */
	for (nb = DEV_BSIZE; me; me = me->me_hash) {
		nb += sizeof (struct delta);
		if (me->me_dt != DT_CANCEL)
			nb += me->me_nb;
	}
	nb = P2ROUNDUP(nb, DEV_BSIZE);

	if (ul->un_head_lof <= ul->un_tail_lof)
		nfb = (ul->un_head_lof - ul->un_bol_lof) +
		    (ul->un_eol_lof - ul->un_tail_lof);
	else
		nfb = ul->un_head_lof - ul->un_tail_lof;

	return (nb < nfb);
}

void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
	buf_t		*bp;
	caddr_t		va;
	size_t		nb;
	size_t		actual;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/* Write the delta */

	nb = sizeof (struct delta);
	va = (caddr_t)&me->me_delta;
	bp = get_write_bp(ul);

	while (nb) {
		if (ul->un_flags & LDL_ERROR) {
			sema_v(&bp->b_sem);
			return;
		}
		actual = storebuf(ul, bp, va, nb);
		ASSERT(actual);
		va += actual;
		nb -= actual;
		if (nb)
			bp = get_write_bp(ul);
	}

	/* If a commit, cancel, or 0's; we're almost done */
	switch (me->me_dt) {
		case DT_COMMIT:
		case DT_CANCEL:
		case DT_ABZERO:
			/* roll needs to know where the next delta will go */
			me->me_lof = ul->un_tail_lof;
			return;
		default:
			break;
	}

	/* Now write the data */

	ASSERT(me->me_nb != 0);

	nb = me->me_nb;
	va = (me->me_mof - bufmof) + bufp;
	bp = get_write_bp(ul);

	/* Save where we will put the data */
	me->me_lof = ul->un_tail_lof;

	while (nb) {
		if (ul->un_flags & LDL_ERROR) {
			sema_v(&bp->b_sem);
			return;
		}
		actual = storebuf(ul, bp, va, nb);
		ASSERT(actual);
		va += actual;
		nb -= actual;
		if (nb)
			bp = get_write_bp(ul);
	}
}

void
ldl_waito(ml_unit_t *ul)
{
	buf_t		*bp;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	rw_enter(&cb->cb_rwlock, RW_WRITER);
	/*
	 * wait for each outstanding write to complete
	 */
	bp = cb->cb_bp;
	do {
		if ((bp->b_flags & B_DONE) == 0) {
			makebusy(ul, bp);
			sema_v(&bp->b_sem);
		}
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);
}

/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
	buf_t	*bp;
	ulong_t	actual;

	while (nb) {
		bp = get_read_bp(ul, lof);
		if (bp->b_flags & B_ERROR) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		actual = fetchbuf(ul, bp, NULL, nb, &lof);
		ASSERT(actual);
		nb -= actual;
	}
	*lofp = lof;
	ASSERT(nb == 0);
	return (0);
}

int
ldl_read(
	ml_unit_t *ul,		/* Log unit */
	caddr_t va,		/* address of buffer to read into */
	offset_t mof,		/* mof of buffer */
	off_t nb,		/* length of buffer */
	mapentry_t *me)		/* Map entry list */
{
	buf_t	*bp;
	crb_t   *crb;
	caddr_t	rva;			/* address to read into */
	size_t	rnb;			/* # of bytes to read */
	off_t	lof;			/* log device offset to read from */
	off_t   skip;
	ulong_t	actual;
	int	error;
	caddr_t	eva	= va + nb;	/* end of buffer */

	for (; me; me = me->me_agenext) {
		ASSERT(me->me_dt != DT_CANCEL);

		/*
		 * check for a cached roll buffer
		 */
		crb = me->me_crb;
		if (crb) {
			if (mof > crb->c_mof) {
				/*
				 * This mapentry overlaps with the beginning of
				 * the supplied buffer
				 */
				skip = mof - crb->c_mof;
				bcopy(crb->c_buf + skip, va,
				    MIN(nb, crb->c_nb - skip));
			} else {
				/*
				 * This mapentry starts at or after
				 * the supplied buffer.
				 */
				skip = crb->c_mof - mof;
				bcopy(crb->c_buf, va + skip,
				    MIN(crb->c_nb, nb - skip));
			}
			logstats.ls_lreadsinmem.value.ui64++;
			continue;
		}

		/*
		 * check for a delta full of zeroes - there's no log data
		 */
		if (me->me_dt == DT_ABZERO) {
			fetchzeroes(va, mof, nb, me);
			continue;
		}

		if (mof > me->me_mof) {
			rnb = (size_t)(mof - me->me_mof);
			error = logseek(ul, me->me_lof, rnb, &lof);
			if (error)
				return (EIO);
			rva = va;
			rnb = me->me_nb - rnb;
			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
		} else {
			lof = me->me_lof;
			rva = (me->me_mof - mof) + va;
			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
		}

		while (rnb) {
			bp = get_read_bp(ul, lof);
			if (bp->b_flags & B_ERROR) {
				sema_v(&bp->b_sem);
				return (EIO);
			}
			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
			    (bp != ul->un_wrbuf.cb_dirty));
			actual = fetchbuf(ul, bp, rva, rnb, &lof);
			ASSERT(actual);
			rva += actual;
			rnb -= actual;
		}
	}
	return (0);
}

void
ldl_savestate(ml_unit_t *ul)
{
	int		error;
	buf_t		*bp	= ul->un_bp;
	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);

#if	DEBUG
	/*
	 * Scan test is running; don't update intermediate state
	 */
	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
		return;
#endif	/* DEBUG */

	mutex_enter(&ul->un_state_mutex);
	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
	bcopy(ud, ud2, sizeof (*ud));

	/* If a snapshot is enabled, write through the snapshot driver. */
	if (ul->un_ufsvfs->vfs_snapshot)
		UFS_BWRITE2(ul->un_ufsvfs, bp);
	else
		BWRITE2(bp);
	logstats.ls_ldlwrites.value.ui64++;
	error = bp->b_flags & B_ERROR;
	mutex_exit(&ul->un_state_mutex);
	if (error)
		ldl_seterror(ul, "Error writing ufs log state");
}

/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
	off_t		nb;
	off_t		new_lof;
	uint32_t	new_ident;
	daddr_t		beg_blkno;
	daddr_t		end_blkno;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	if (data_lof == -1) {
		/* log is empty */
		new_ident = lufs_hd_genid(ul);
		new_lof = ul->un_tail_lof;

	} else {
		/* compute header's lof */
		new_ident = ul->un_head_ident;
		new_lof = data_lof - sizeof (struct delta);

		/* whoops, header spans sectors; subtract out sector trailer */
		if (btodb(new_lof) != btodb(data_lof))
			new_lof -= sizeof (sect_trailer_t);

		/* whoops, header wrapped the log; go to last sector */
		if (new_lof < ul->un_bol_lof) {
			/* sector offset */
			new_lof -= dbtob(btodb(new_lof));
			/* add to last sector's lof */
			new_lof += (ul->un_eol_lof - DEV_BSIZE);
		}
		ul->un_head_tid = tid;
	}

	/*
	 * check for nop
	 */
	if (new_lof == ul->un_head_lof)
		return;

	/*
	 * invalidate the affected bufs and calculate new ident
	 */
	if (new_lof > ul->un_head_lof) {
		nb = new_lof - ul->un_head_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident += (end_blkno - beg_blkno);
	} else {
		nb = ul->un_eol_lof - ul->un_head_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

		end_blkno = btodb(ul->un_eol_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident += (end_blkno - beg_blkno);

		nb = new_lof - ul->un_bol_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_bol_lof);
		new_ident += (end_blkno - beg_blkno);
	}
	/*
	 * don't update the head if there has been an error
	 */
	if (ul->un_flags & LDL_ERROR)
		return;

	/* Fix up the head and ident */
	ASSERT(new_lof >= ul->un_bol_lof);
	ul->un_head_lof = new_lof;
	ul->un_head_ident = new_ident;
	if (data_lof == -1) {
		ul->un_tail_ident = ul->un_head_ident;
	}

	/* Commit to the database */
	ldl_savestate(ul);

	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
	    ldl_sethead_debug(ul));
}

/*
 * The tail will be set to the sector following lof+nb
 *	lof + nb == size of the last delta + commit record
 *	this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
	off_t		new_lof;
	uint32_t	new_ident;
	daddr_t		beg_blkno;
	daddr_t		end_blkno;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	if (lof == -1) {
		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
		ul->un_head_lof = ul->un_tail_lof;
		ul->un_head_ident = lufs_hd_genid(ul);
		ul->un_tail_ident = ul->un_head_ident;

		/* Commit to the database */
		ldl_savestate(ul);

		return;
	}

	/*
	 * new_lof is the offset of the sector following the last commit
	 */
	(void) logseek(ul, lof, nb, &new_lof);
	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

	/*
	 * calculate new ident
	 */
	if (new_lof > ul->un_head_lof) {
		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
	} else {
		end_blkno = btodb(ul->un_eol_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_bol_lof);
		new_ident += (end_blkno - beg_blkno);
	}

	/* Fix up the tail and ident */
	ul->un_tail_lof = new_lof;
	ul->un_tail_ident = new_ident;

	/* Commit to the database */
	ldl_savestate(ul);
}

/*
 * LOGSCAN STUFF
 */
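/*
 * Compute the expected ident of each sector in the buf (idents increase
 * by one per sector from the head) and truncate b_bcount at the first
 * sector whose trailer ident does not match.
 */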
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
	ulong_t		ident;
	size_t		nblk, i;
	sect_trailer_t	*st;

	/*
	 * compute ident for first sector in the buffer
	 */
	ident = ul->un_head_ident;
	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
		ident += (bp->b_blkno - btodb(ul->un_head_lof));
	} else {
		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
	}
	/*
	 * truncate the buffer down to the last valid sector
	 */
	nblk = btodb(bp->b_bcount);
	bp->b_bcount = 0;
	/* LINTED */
	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
	for (i = 0; i < nblk; ++i) {
		if (st->st_ident != ident)
			break;

		/* remember last valid tid for ldl_logscan_error() */
		ul->un_tid = st->st_tid;

		/* LINTED */
		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
		++ident;
		bp->b_bcount += DEV_BSIZE;
	}
	/*
	 * make sure that lof is still within range
	 */
	return (within_range(lof, bp->b_blkno, bp->b_bcount));
}

ulong_t
ldl_logscan_nbcommit(off_t lof)
{
	/*
	 * lof is the offset following the commit header.  However,
	 * if the commit header fell on the end-of-sector, then lof
	 * has already been advanced to the beginning of the next
	 * sector.  So do nothing.  Otherwise, return the remaining
	 * bytes in the sector.
	 */
	if ((lof & (DEV_BSIZE - 1)) == 0)
		return (0);
	return (NB_LEFT_IN_SECTOR(lof));
}

int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
	buf_t	*bp;
	ulong_t	actual;

	ASSERT(ul->un_head_lof != ul->un_tail_lof);

	/*
	 * Check the log data doesn't go out of bounds
	 */
	if (ul->un_head_lof < ul->un_tail_lof) {
		if (!WITHIN(*lofp, nb, ul->un_head_lof,
		    (ul->un_tail_lof - ul->un_head_lof))) {
			return (EIO);
		}
	} else {
		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
		    (ul->un_head_lof - ul->un_tail_lof))) {
			return (EIO);
		}
	}

	while (nb) {
		bp = get_read_bp(ul, *lofp);
		if (bp->b_flags & B_ERROR) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		/*
		 * out-of-seq idents means partial transaction
		 *	panic, non-corrupting powerfail, ...
		 */
		if (!ldl_logscan_ident(ul, bp, *lofp)) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		/*
		 * copy the header into the caller's buf
		 */
		actual = fetchbuf(ul, bp, va, nb, lofp);
		if (va)
			va += actual;
		nb -= actual;
	}
	return (0);
}

void
ldl_logscan_begin(ml_unit_t *ul)
{
	size_t	bufsize;

	ASSERT(ul->un_wrbuf.cb_dirty == NULL);

	/*
	 * logscan has begun
	 */
	ul->un_flags |= LDL_SCAN;

	/*
	 * reset the circular bufs
	 */
	bufsize = ldl_bufsize(ul);
	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
	alloc_wrbuf(&ul->un_wrbuf, bufsize);

	/*
	 * set the tail to reflect a full log
	 */
	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

	if (ul->un_tail_lof < ul->un_bol_lof)
		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
	if (ul->un_tail_lof >= ul->un_eol_lof)
		ul->un_tail_lof = ul->un_bol_lof;

	/*
	 * un_tid is used during error processing; it is initialized to
	 * the tid of the delta at un_head_lof;
	 */
	ul->un_tid = ul->un_head_tid;
}

void
ldl_logscan_end(ml_unit_t *ul)
{
	size_t	bufsize;

	/*
	 * reset the circular bufs
	 */
	bufsize = ldl_bufsize(ul);
	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
	alloc_wrbuf(&ul->un_wrbuf, bufsize);

	/*
	 * Done w/scan
	 */
	ul->un_flags &= ~LDL_SCAN;
}

int
ldl_need_roll(ml_unit_t *ul)
{
	off_t	busybytes;
	off_t	head;
	off_t	tail;
	off_t	bol;
	off_t	eol;
	off_t	nb;

	/*
	 * snapshot the log state
	 */
	head = ul->un_head_lof;
	tail = ul->un_tail_lof;
	bol = ul->un_bol_lof;
	eol = ul->un_eol_lof;
	nb = ul->un_logsize;

	/*
	 * compute number of busy (inuse) bytes
	 */
	if (head <= tail)
		busybytes = tail - head;
	else
		busybytes = (eol - head) + (tail - bol);

	/*
	 * return TRUE if > 75% full
	 */
	return (busybytes > (nb - (nb >> 2)));
}

void
ldl_seterror(ml_unit_t *ul, char *why)
{
	/*
	 * already in error state; do nothing
	 */
	if (ul->un_flags & LDL_ERROR)
		return;

	ul->un_flags |= LDL_ERROR;	/* incore */
	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */

	/*
	 * Commit to state sectors
	 */
	uniqtime(&ul->un_timestamp);
	ldl_savestate(ul);

	/* Pretty print */
	cmn_err(CE_WARN, "%s", why);
	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);

	/*
	 * If we aren't in the middle of scan (aka snarf); tell ufs
	 * to hard lock itself.
	 */
	if ((ul->un_flags & LDL_SCAN) == 0)
		ufs_trans_onerror();
}

size_t
ldl_bufsize(ml_unit_t *ul)
{
	size_t		bufsize;
	extern uint32_t	ldl_minbufsize;

	/*
	 * initial guess is the maxtransfer value for this log device
	 * 	increase if too small
	 * 	decrease if too large
	 */
	bufsize = dbtob(btod(ul->un_maxtransfer));
	if (bufsize < ldl_minbufsize)
		bufsize = ldl_minbufsize;
	if (bufsize > maxphys)
		bufsize = maxphys;
	if (bufsize > ul->un_maxtransfer)
		bufsize = ul->un_maxtransfer;
	return (bufsize);
}