/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vnode_pager.h>

static int ffs_rawread_readahead(struct vnode *vp,
				 caddr_t udata,
				 off_t offset,
				 size_t len,
				 struct thread *td,
				 struct buf *bp);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

SYSCTL_DECL(_vfs_ffs);

static uma_zone_t ffsraw_pbuf_zone;

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");

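/*
 * Create the zone of pbufs used for raw read transfers.  The zone is
 * capped below nswbuf so that raw reads cannot starve other pbuf
 * consumers.
 */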
static void
ffs_rawread_setup(void *arg __unused)
{

	ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf",
	    (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8);
}
SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);

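/*
 * Make sure the cached state of the file is consistent with what is on
 * disk before reading directly from the device: sync dirty mmap'ed pages,
 * wait for pending writes and flush dirty buffers.  The vnode lock may
 * have to be upgraded to exclusive (and downgraded again) to do so.
 */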
static int
ffs_rawread_sync(struct vnode *vp)
{
	int error;
	int upgraded;
	struct bufobj *bo;
	struct mount *mp;
	vm_object_t obj;

	/* Check for dirty mmap, pending writes and dirty buffers */
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	VI_LOCK(vp);
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    ((obj = vp->v_object) != NULL &&
	     vm_object_mightbedirty(obj))) {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
				upgraded = 1;
			else
				upgraded = 0;
			VOP_UNLOCK(vp);
			(void) vn_start_write(vp, &mp, V_WAIT);
			VOP_LOCK(vp, LK_EXCLUSIVE);
		} else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE);
		} else
			upgraded = 0;

		VI_LOCK(vp);
		/* Check if vnode was reclaimed while unlocked. */
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (EIO);
		}
		VI_UNLOCK(vp);

		/* Attempt to msync mmap() regions to clean dirty mmap */
		vnode_pager_clean_sync(vp);

		/* Wait for pending writes to complete */
		BO_LOCK(bo);
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			BO_UNLOCK(bo);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (error);
		}
		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE);
				vn_finished_write(mp);
				return (error);
			}
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		BO_UNLOCK(bo);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE);
		vn_finished_write(mp);
	} else {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);
	}
	return 0;
}

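/*
 * Issue a single raw read of up to one contiguous run of blocks into the
 * user buffer, using the supplied pbuf.  The transfer is clipped to the
 * pbuf's KVA size and to the run length returned by ufs_bmaparray();
 * holes are zero-filled without issuing any I/O.  The caller waits for
 * completion with bwait().
 */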
static int
ffs_rawread_readahead(struct vnode *vp,
		      caddr_t udata,
		      off_t offset,
		      size_t len,
		      struct thread *td,
		      struct buf *bp)
{
	int error;
	uint64_t iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	ip = VTOI(vp);
	dp = ITODEVVP(ip);

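	/*
	 * Clip the transfer to the pbuf's KVA size, allowing for the
	 * sub-page offset of the user buffer.
	 */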
	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = bdone;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

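	/*
	 * Translate the logical block number and find out how many
	 * contiguous blocks follow it.
	 */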
	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
	if (error != 0)
		return error;
	if (blkno == -1) {
		/* Fill holes with NULs to preserve semantics */

		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;

		if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
			return EFAULT;

		maybe_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */

		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;

	if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}

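/*
 * Main raw read loop: transfer data directly between the device and user
 * memory, keeping at most one readahead request in flight when
 * vfs.ffs.rawreadahead is enabled.  On return the uio is updated to
 * reflect what was actually read.
 */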
static int
ffs_rawread_main(struct vnode *vp,
		 struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	uint64_t iolen;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * keep the process from being swapped
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK);
			pbgetvp(vp, bp);
			error = ffs_rawread_readahead(vp, udata, offset,
						     resid, td, bp);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup first readahead */
				if (rawreadahead != 0)
					nbp = uma_zalloc(ffsraw_pbuf_zone,
					    M_NOWAIT);
				else
					nbp = NULL;
				if (nbp != NULL) {
					pbgetvp(vp, nbp);

					nerror = ffs_rawread_readahead(vp,
								       udata +
								       bp->b_bufsize,
								       offset +
								       bp->b_bufsize,
								       resid -
								       bp->b_bufsize,
								       td,
								       nbp);
					if (nerror) {
						pbrelvp(nbp);
						uma_zfree(ffsraw_pbuf_zone,
						    nbp);
						nbp = NULL;
					}
				}
			}
		}

		bwait(bp, PRIBIO, "rawrd");
		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break; /* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp,
						      udata,
						      offset,
						      bp->b_bufsize - iolen,
						      td,
						      bp);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			tbp = bp;
			bp = nbp;
			nbp = tbp;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				pbrelvp(nbp);
				uma_zfree(ffsraw_pbuf_zone, nbp);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
							       udata +
							       bp->b_bufsize,
							       offset +
							       bp->b_bufsize,
							       resid -
							       bp->b_bufsize,
							       td,
							       nbp);
				if (nerror != 0) {
					pbrelvp(nbp);
					uma_zfree(ffsraw_pbuf_zone, nbp);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, td, bp);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL) {
		pbrelvp(bp);
		uma_zfree(ffsraw_pbuf_zone, bp);
	}
	if (nbp != NULL) {			/* Run down readahead buffer */
		bwait(nbp, PRIBIO, "rawrd");
		vunmapbuf(nbp);
		pbrelvp(nbp);
		uma_zfree(ffsraw_pbuf_zone, nbp);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}

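/*
 * Entry point for raw reads, called from the FFS read path.  Attempt a raw
 * (uncached) read when raw reads are enabled and the request is a single,
 * sector-aligned userspace iovec.  *workdone is set to 1 when the whole
 * request was handled here; otherwise the caller completes the remaining
 * part through the buffer cache.
 */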
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;

		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special EOF handling needed? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
				    ITOFS(ip)->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}