/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */
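/*
 * Note: this tunable is only a global kill switch.  Direct I/O is
 * normally requested per mount point via the "forcedirectio" mount
 * option or per file via directio(3C); those requests take effect
 * only while ufs_directio_enabled is nonzero.
 */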

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;
	kstat_named_t	phys_reads;
	kstat_named_t	hole_reads;
	kstat_named_t	nread;
	kstat_named_t	logical_writes;
	kstat_named_t	phys_writes;
	kstat_named_t	nwritten;
	kstat_named_t	nflushes;
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

kstat_t	*ufs_directio_kstatsp;
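/*
 * The counters above are exported as the "ufs:0:directio" named kstat
 * and can be read from userland with, e.g., kstat -m ufs -n directio.
 */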

/*
 * Use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction
 * per request.
 */
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs", 0,
	    "directio", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * *bytes_iop will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error)
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	else
		*bytes_iop += bp->b_bcount;
	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;
	uint_t	kpri_req_save;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	/*
	 * This is the k_pri_req hack. Large numbers of threads
	 * sleeping with kernel priority will cause scheduler thrashing
	 * on an MP machine. This can be seen running Oracle using
	 * directio to ufs files. Sleep at normal priority here to
	 * more closely mimic physio to a device partition. This
	 * workaround is disabled by default as a niced thread could
	 * be starved from running while holding i_rwlock and i_contents.
	 */
	if (ufs_directio_drop_kpri) {
		kpri_req_save = curthread->t_kpri_req;
		curthread->t_kpri_req = 0;
	}
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	if (ufs_directio_drop_kpri)
		curthread->t_kpri_req = kpri_req_save;
	return (error);
}
/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
	struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *   Note - list is maintained in reverse order.
	 *   directio_wait_one() depends on this fact when
	 *   adjusting the ``bytes_iop'' param. bytes_iop
	 *   is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = ip->i_dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;
	bp->b_file = ip->i_vnode;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads.value.ui64++;
		ufs_directio_kstats.nread.value.ui64 += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes.value.ui64++;
		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	else
		lwp_stat_update(LWP_STAT_INBLK, 1);

}

uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
	cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slowly in Direct-Mode
	 *	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 *			allocate lots of pages since pageout
	 *			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned.  If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer.  This new
	 * buffer will be short aligned and sector aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}


	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

			if (pplist != NULL)
				spplist = pplist +
				    btop((uintptr_t)iov->iov_base -
				    ((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
			    iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_add_32(&ufs_cur_writes, -1);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
			    B_INVAL, cr, NULL);
			ufs_directio_kstats.nflushes.value.ui64++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * unwritable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual, adjust the EOF if necessary
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}
/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	int		error = 0, nzero;
	uio_t		phys_uio;
	iovec_t		phys_iov;

	ufs_directio_kstats.hole_reads.value.ui64++;
	ufs_directio_kstats.nread.value.ui64 += nbytes;

	phys_iov.iov_base = uio->uio_iov->iov_base;
	phys_iov.iov_len = nbytes;

	phys_uio.uio_iov = &phys_iov;
	phys_uio.uio_iovcnt = 1;
	phys_uio.uio_resid = phys_iov.iov_len;
	phys_uio.uio_segflg = uio->uio_segflg;
	phys_uio.uio_extflg = uio->uio_extflg;
	while (error == 0 && phys_uio.uio_resid) {
		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
		    &phys_uio);
	}
	return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF" logic.
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    B_INVAL, cr, NULL);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip, nbytes,
				    ldbtob(bn), iov->iov_base,
				    S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * unread byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}