nfs_bio.c: diff of revision 26409 (v1.38) against revision 26469 (v1.39).
At each change, the deleted lines are shown first, followed by the added lines.
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $Id: nfs_bio.c,v 1.38 1997/05/19 14:36:47 dfr Exp $
37 * $Id: nfs_bio.c,v 1.39 1997/06/03 09:42:36 dfr Exp $
38 */
39
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/proc.h>
46#include <sys/buf.h>
47#include <sys/vnode.h>
48#include <sys/mount.h>
49#include <sys/kernel.h>
50#include <sys/sysctl.h>
51
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_prot.h>
56#include <vm/vm_page.h>
57#include <vm/vm_object.h>
58#include <vm/vm_pager.h>
59#include <vm/vnode_pager.h>
60
61#include <nfs/rpcv2.h>
62#include <nfs/nfsproto.h>
63#include <nfs/nfs.h>
64#include <nfs/nfsmount.h>
65#include <nfs/nqnfs.h>
66#include <nfs/nfsnode.h>
67
68static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
69 struct proc *p));
70
71extern int nfs_numasync;
72extern struct nfsstats nfsstats;
73
74/*
75 * Vnode op for VM getpages.
76 */
77int
78nfs_getpages(ap)
79 struct vop_getpages_args *ap;
80{
81 int i, bsize;
82 vm_object_t obj;
83 int pcount;
84 struct uio auio;
85 struct iovec aiov;
86 int error;
87 vm_page_t m;
88
89 if (!(ap->a_vp->v_flag & VVMIO)) {
90 printf("nfs_getpages: called with non-VMIO vnode??\n");
91 return EOPNOTSUPP;
92 }
93
94 pcount = round_page(ap->a_count) / PAGE_SIZE;
95
96 obj = ap->a_m[ap->a_reqpage]->object;
97 bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;
98
99 for (i = 0; i < pcount; i++) {
100 if (i != ap->a_reqpage) {
101 vnode_pager_freepage(ap->a_m[i]);
102 }
103 }
104 m = ap->a_m[ap->a_reqpage];
105
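	/*
	 * Trade the page's hard busy state (PG_BUSY, set by the VM
	 * caller) for a soft busy reference, so that the buffer cache
	 * code reached through nfs_bioread() can work on the page
	 * while it remains protected from being freed; the original
	 * state is restored after the UIO_NOCOPY read below.
	 */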
106 m->busy++;
107 m->flags &= ~PG_BUSY;
108
109 auio.uio_iov = &aiov;
110 auio.uio_iovcnt = 1;
111 aiov.iov_base = 0;
112 aiov.iov_len = PAGE_SIZE;
113 auio.uio_resid = PAGE_SIZE;
114 auio.uio_offset = IDX_TO_OFF(m->pindex);
115 auio.uio_segflg = UIO_NOCOPY;
116 auio.uio_rw = UIO_READ;
117 auio.uio_procp = curproc;
118 error = nfs_bioread(ap->a_vp, &auio, IO_NODELOCKED, curproc->p_ucred, 1);
119
120 m->flags |= PG_BUSY;
121 m->busy--;
122
123 if (error && (auio.uio_resid == PAGE_SIZE))
124 return VM_PAGER_ERROR;
125 return 0;
126}
127
128/*
129 * Vnode op for read using bio
130 * Any similarity to readip() is purely coincidental
131 */
132int
133nfs_bioread(vp, uio, ioflag, cred, getpages)
134 register struct vnode *vp;
135 register struct uio *uio;
136 int ioflag;
137 struct ucred *cred;
138 int getpages;
139{
140 register struct nfsnode *np = VTONFS(vp);
141 register int biosize, diff, i;
142 struct buf *bp = 0, *rabp;
143 struct vattr vattr;
144 struct proc *p;
145 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
146 daddr_t lbn, rabn;
147 int bufsize;
148 int nra, error = 0, n = 0, on = 0, not_readin;
149
150#ifdef DIAGNOSTIC
151 if (uio->uio_rw != UIO_READ)
152 panic("nfs_read mode");
153#endif
154 if (uio->uio_resid == 0)
155 return (0);
156 if (uio->uio_offset < 0)
157 return (EINVAL);
158 p = uio->uio_procp;
159 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
160 (void)nfs_fsinfo(nmp, vp, cred, p);
161 biosize = vp->v_mount->mnt_stat.f_iosize;
162 /*
163 * For nfs, cache consistency can only be maintained approximately.
164 * Although RFC1094 does not specify the criteria, the following is
165 * believed to be compatible with the reference port.
166 * For nqnfs, full cache consistency is maintained within the loop.
167 * For nfs:
168 * If the file's modify time on the server has changed since the
169 * last read rpc or you have written to the file,
170 * you may have lost data cache consistency with the
171 * server, so flush all of the file's data out of the cache.
172 * Then force a getattr rpc to ensure that you have up to date
173 * attributes.
174 * NB: This implies that cache data can be read when up to
175 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
176 * attributes this could be forced by setting n_attrstamp to 0 before
177 * the VOP_GETATTR() call.
178 */
179 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
180 if (np->n_flag & NMODIFIED) {
181 if (vp->v_type != VREG) {
182 if (vp->v_type != VDIR)
183 panic("nfs: bioread, not dir");
184 nfs_invaldir(vp);
185 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
186 if (error)
187 return (error);
188 }
189 np->n_attrstamp = 0;
190 error = VOP_GETATTR(vp, &vattr, cred, p);
191 if (error)
192 return (error);
193 np->n_mtime = vattr.va_mtime.tv_sec;
194 } else {
195 error = VOP_GETATTR(vp, &vattr, cred, p);
196 if (error)
197 return (error);
198 if (np->n_mtime != vattr.va_mtime.tv_sec) {
199 if (vp->v_type == VDIR)
200 nfs_invaldir(vp);
201 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
202 if (error)
203 return (error);
204 np->n_mtime = vattr.va_mtime.tv_sec;
205 }
206 }
207 }
208 do {
209
210 /*
211 * Get a valid lease. If cached data is stale, flush it.
212 */
213 if (nmp->nm_flag & NFSMNT_NQNFS) {
214 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
215 do {
216 error = nqnfs_getlease(vp, ND_READ, cred, p);
217 } while (error == NQNFS_EXPIRED);
218 if (error)
219 return (error);
220 if (np->n_lrev != np->n_brev ||
221 (np->n_flag & NQNFSNONCACHE) ||
222 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
223 if (vp->v_type == VDIR)
224 nfs_invaldir(vp);
225 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
226 if (error)
227 return (error);
228 np->n_brev = np->n_lrev;
229 }
230 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
231 nfs_invaldir(vp);
232 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
233 if (error)
234 return (error);
235 }
236 }
237 if (np->n_flag & NQNFSNONCACHE) {
238 switch (vp->v_type) {
239 case VREG:
240 return (nfs_readrpc(vp, uio, cred));
241 case VLNK:
242 return (nfs_readlinkrpc(vp, uio, cred));
243 case VDIR:
244 break;
245 default:
246 printf(" NQNFSNONCACHE: type %x unexpected\n",
247 vp->v_type);
248 };
249 }
250 switch (vp->v_type) {
251 case VREG:
252 nfsstats.biocache_reads++;
253 lbn = uio->uio_offset / biosize;
254 on = uio->uio_offset & (biosize - 1);
255 not_readin = 1;
256
257 /*
258 * Start the read ahead(s), as required.
259 */
260 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
261 for (nra = 0; nra < nmp->nm_readahead &&
262 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
263 rabn = lbn + 1 + nra;
264 if (!incore(vp, rabn)) {
265 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
266 if (!rabp)
267 return (EINTR);
268 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
269 rabp->b_flags |= (B_READ | B_ASYNC);
270 vfs_busy_pages(rabp, 0);
271 if (nfs_asyncio(rabp, cred)) {
272 rabp->b_flags |= B_INVAL|B_ERROR;
273 vfs_unbusy_pages(rabp);
274 brelse(rabp);
275 }
276 } else
277 brelse(rabp);
278 }
279 }
280 }
281
282 /*
283 * If the block is in the cache and has the required data
284 * in a valid region, just copy it out.
285 * Otherwise, get the block and write back/read in,
286 * as required.
287 */
288again:
289 bufsize = biosize;
290 if ((off_t)(lbn + 1) * biosize > np->n_size &&
291 (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
292 bufsize = np->n_size - lbn * biosize;
293 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
294 }
295 bp = nfs_getcacheblk(vp, lbn, bufsize, p);
296 if (!bp)
297 return (EINTR);
298 /*
299 * If we are being called from nfs_getpages, we must
300 * make sure the buffer is a vmio buffer. The vp will
301 * already be set up for vmio but there may be some old
302 * non-vmio buffers attached to it.
303 */
304 if (getpages && !(bp->b_flags & B_VMIO)) {
305#ifdef DIAGNOSTIC
306 printf("nfs_bioread: non vmio buf found, discarding\n");
307#endif
308 bp->b_flags |= B_NOCACHE;
309 bp->b_flags |= B_INVAFTERWRITE;
310 if (bp->b_dirtyend > 0) {
311 if ((bp->b_flags & B_DELWRI) == 0)
312 panic("nfsbioread");
313 if (VOP_BWRITE(bp) == EINTR)
314 return (EINTR);
315 } else
316 brelse(bp);
317 goto again;
318 }
319 if ((bp->b_flags & B_CACHE) == 0) {
320 bp->b_flags |= B_READ;
321 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
322 not_readin = 0;
323 vfs_busy_pages(bp, 0);
324 error = nfs_doio(bp, cred, p);
325 if (error) {
326 brelse(bp);
327 return (error);
328 }
329 }
330 if (bufsize > on) {
331 n = min((unsigned)(bufsize - on), uio->uio_resid);
332 } else {
333 n = 0;
334 }
335 diff = np->n_size - uio->uio_offset;
336 if (diff < n)
337 n = diff;
338 if (not_readin && n > 0) {
339 if (on < bp->b_validoff || (on + n) > bp->b_validend) {
340 bp->b_flags |= B_NOCACHE;
341 bp->b_flags |= B_INVAFTERWRITE;
342 if (bp->b_dirtyend > 0) {
343 if ((bp->b_flags & B_DELWRI) == 0)
344 panic("nfsbioread");
345 if (VOP_BWRITE(bp) == EINTR)
346 return (EINTR);
347 } else
348 brelse(bp);
349 goto again;
350 }
351 }
352 vp->v_lastr = lbn;
353 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
354 if (diff < n)
355 n = diff;
356 break;
357 case VLNK:
358 nfsstats.biocache_readlinks++;
359 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
360 if (!bp)
361 return (EINTR);
362 if ((bp->b_flags & B_CACHE) == 0) {
363 bp->b_flags |= B_READ;
364 vfs_busy_pages(bp, 0);
365 error = nfs_doio(bp, cred, p);
366 if (error) {
367 bp->b_flags |= B_ERROR;
368 brelse(bp);
369 return (error);
370 }
371 }
372 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
373 on = 0;
374 break;
375 case VDIR:
376 nfsstats.biocache_readdirs++;
377 if (np->n_direofoffset
378 && uio->uio_offset >= np->n_direofoffset) {
379 return (0);
380 }
381 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
382 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
383 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
384 if (!bp)
385 return (EINTR);
386 if ((bp->b_flags & B_CACHE) == 0) {
387 bp->b_flags |= B_READ;
388 vfs_busy_pages(bp, 0);
389 error = nfs_doio(bp, cred, p);
390 if (error) {
391 vfs_unbusy_pages(bp);
392 brelse(bp);
393 while (error == NFSERR_BAD_COOKIE) {
394 nfs_invaldir(vp);
395 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
396 /*
397 * Yuck! The directory has been modified on the
398 * server. The only way to get the block is by
399 * reading from the beginning to get all the
400 * offset cookies.
401 */
402 for (i = 0; i <= lbn && !error; i++) {
403 if (np->n_direofoffset
404 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
405 return (0);
406 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
407 if (!bp)
408 return (EINTR);
409 if ((bp->b_flags & B_DONE) == 0) {
410 bp->b_flags |= B_READ;
411 vfs_busy_pages(bp, 0);
412 error = nfs_doio(bp, cred, p);
413 if (error) {
414 vfs_unbusy_pages(bp);
415 brelse(bp);
416 } else if (i < lbn)
417 brelse(bp);
418 }
419 }
420 }
421 if (error)
422 return (error);
423 }
424 }
425
426 /*
427 * If not eof and read aheads are enabled, start one.
428 * (You need the current block first, so that you have the
429 * directory offset cookie of the next block.)
430 */
431 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
432 (np->n_direofoffset == 0 ||
433 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
434 !(np->n_flag & NQNFSNONCACHE) &&
435 !incore(vp, lbn + 1)) {
436 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
437 if (rabp) {
438 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
439 rabp->b_flags |= (B_READ | B_ASYNC);
440 vfs_busy_pages(rabp, 0);
441 if (nfs_asyncio(rabp, cred)) {
442 rabp->b_flags |= B_INVAL|B_ERROR;
443 vfs_unbusy_pages(rabp);
444 brelse(rabp);
445 }
446 } else {
447 brelse(rabp);
448 }
449 }
450 }
451 n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
451 /*
452 * Make sure we use a signed variant of min() since
453 * the second term may be negative.
454 */
455 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
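	/*
	 * Illustration of the wraparound being guarded against: the
	 * kernel's min() compares u_ints, so when bp->b_resid + on
	 * exceeds NFS_DIRBLKSIZ the negative second term would wrap
	 * to a huge unsigned value and n would become uio->uio_resid,
	 * letting the uiomove() below copy past the valid data in the
	 * buffer.  lmin() compares longs, so a negative term clamps n
	 * to <= 0 and the copy is skipped.
	 */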
456 break;
457 default:
458 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
459 break;
460 };
461
462 if (n > 0) {
463 error = uiomove(bp->b_data + on, (int)n, uio);
464 }
465 switch (vp->v_type) {
466 case VREG:
467 break;
468 case VLNK:
469 n = 0;
470 break;
471 case VDIR:
472 if (np->n_flag & NQNFSNONCACHE)
473 bp->b_flags |= B_INVAL;
474 break;
475 default:
476 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
477 }
478 brelse(bp);
479 } while (error == 0 && uio->uio_resid > 0 && n > 0);
480 return (error);
481}
482
483/*
484 * Vnode op for write using bio
485 */
486int
487nfs_write(ap)
488 struct vop_write_args /* {
489 struct vnode *a_vp;
490 struct uio *a_uio;
491 int a_ioflag;
492 struct ucred *a_cred;
493 } */ *ap;
494{
495 register int biosize;
496 register struct uio *uio = ap->a_uio;
497 struct proc *p = uio->uio_procp;
498 register struct vnode *vp = ap->a_vp;
499 struct nfsnode *np = VTONFS(vp);
500 register struct ucred *cred = ap->a_cred;
501 int ioflag = ap->a_ioflag;
502 struct buf *bp;
503 struct vattr vattr;
504 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
505 daddr_t lbn;
506 int bufsize;
507 int n, on, error = 0, iomode, must_commit;
508
509#ifdef DIAGNOSTIC
510 if (uio->uio_rw != UIO_WRITE)
511 panic("nfs_write mode");
512 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
513 panic("nfs_write proc");
514#endif
515 if (vp->v_type != VREG)
516 return (EIO);
517 if (np->n_flag & NWRITEERR) {
518 np->n_flag &= ~NWRITEERR;
519 return (np->n_error);
520 }
521 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
522 (void)nfs_fsinfo(nmp, vp, cred, p);
523 if (ioflag & (IO_APPEND | IO_SYNC)) {
524 if (np->n_flag & NMODIFIED) {
525 np->n_attrstamp = 0;
526 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
527 if (error)
528 return (error);
529 }
530 if (ioflag & IO_APPEND) {
531 np->n_attrstamp = 0;
532 error = VOP_GETATTR(vp, &vattr, cred, p);
533 if (error)
534 return (error);
535 uio->uio_offset = np->n_size;
536 }
537 }
538 if (uio->uio_offset < 0)
539 return (EINVAL);
540 if (uio->uio_resid == 0)
541 return (0);
542 /*
543 * Maybe this should be above the vnode op call, but so long as
544 * file servers have no limits, I don't think it matters
545 */
546 if (p && uio->uio_offset + uio->uio_resid >
547 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
548 psignal(p, SIGXFSZ);
549 return (EFBIG);
550 }
551 /*
552 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
553 * will be the same size within a filesystem. nfs_writerpc will
554 * still use nm_wsize when sizing the rpc's.
555 */
556 biosize = vp->v_mount->mnt_stat.f_iosize;
557 do {
558 /*
559 * Check for a valid write lease.
560 */
561 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
562 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
563 do {
564 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
565 } while (error == NQNFS_EXPIRED);
566 if (error)
567 return (error);
568 if (np->n_lrev != np->n_brev ||
569 (np->n_flag & NQNFSNONCACHE)) {
570 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
571 if (error)
572 return (error);
573 np->n_brev = np->n_lrev;
574 }
575 }
576 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
577 iomode = NFSV3WRITE_FILESYNC;
578 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
579 if (must_commit)
580 nfs_clearcommit(vp->v_mount);
581 return (error);
582 }
583 nfsstats.biocache_writes++;
584 lbn = uio->uio_offset / biosize;
585 on = uio->uio_offset & (biosize-1);
586 n = min((unsigned)(biosize - on), uio->uio_resid);
587again:
588 if (uio->uio_offset + n > np->n_size) {
589 np->n_size = uio->uio_offset + n;
590 np->n_flag |= NMODIFIED;
591 vnode_pager_setsize(vp, (u_long)np->n_size);
592 }
593 bufsize = biosize;
594 if ((lbn + 1) * biosize > np->n_size) {
595 bufsize = np->n_size - lbn * biosize;
596 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
597 }
598 bp = nfs_getcacheblk(vp, lbn, bufsize, p);
599 if (!bp)
600 return (EINTR);
601 if (bp->b_wcred == NOCRED) {
602 crhold(cred);
603 bp->b_wcred = cred;
604 }
605 np->n_flag |= NMODIFIED;
606
607 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
608 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
609 }
610
611 /*
612 * If the new write will leave a contiguous dirty
613 * area, just update the b_dirtyoff and b_dirtyend,
614 * otherwise force a write rpc of the old dirty area.
615 */
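	/*
	 * Example with illustrative offsets: if [512, 1024) is dirty
	 * and the new write begins at on == 1024, the regions touch
	 * and b_dirtyend is simply extended; a write at on == 2048
	 * would leave a clean gap, and since a buffer records only a
	 * single (b_dirtyoff, b_dirtyend) pair, the old dirty region
	 * must be pushed to the server with a write RPC first.
	 */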
616 if (bp->b_dirtyend > 0 &&
617 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
618 bp->b_proc = p;
619 if (VOP_BWRITE(bp) == EINTR)
620 return (EINTR);
621 goto again;
622 }
623
624 /*
625 * Check for valid write lease and get one as required.
626 * In case getblk() and/or bwrite() delayed us.
627 */
628 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
629 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
630 do {
631 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
632 } while (error == NQNFS_EXPIRED);
633 if (error) {
634 brelse(bp);
635 return (error);
636 }
637 if (np->n_lrev != np->n_brev ||
638 (np->n_flag & NQNFSNONCACHE)) {
639 brelse(bp);
640 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
641 if (error)
642 return (error);
643 np->n_brev = np->n_lrev;
644 goto again;
645 }
646 }
647 error = uiomove((char *)bp->b_data + on, n, uio);
648 if (error) {
649 bp->b_flags |= B_ERROR;
650 brelse(bp);
651 return (error);
652 }
653 if (bp->b_dirtyend > 0) {
654 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
655 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
656 } else {
657 bp->b_dirtyoff = on;
658 bp->b_dirtyend = on + n;
659 }
660 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
661 bp->b_validoff > bp->b_dirtyend) {
662 bp->b_validoff = bp->b_dirtyoff;
663 bp->b_validend = bp->b_dirtyend;
664 } else {
665 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
666 bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
667 }
668
669 /*
670 * Since this block is being modified, it must be written
671 * again and not just committed.
672 */
673 bp->b_flags &= ~B_NEEDCOMMIT;
674
675 /*
676 * If the lease is non-cachable or IO_SYNC do bwrite().
677 */
678 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
679 bp->b_proc = p;
680 error = VOP_BWRITE(bp);
681 if (error)
682 return (error);
683 if (np->n_flag & NQNFSNONCACHE) {
684 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
685 if (error)
686 return (error);
687 }
688 } else if ((n + on) == biosize &&
689 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
690 bp->b_proc = (struct proc *)0;
691 bp->b_flags |= B_ASYNC;
692 (void)nfs_writebp(bp, 0);
693 } else
694 bdwrite(bp);
695 } while (uio->uio_resid > 0 && n > 0);
696 return (0);
697}
698
699/*
700 * Get an nfs cache block.
701 * Allocate a new one if the block isn't currently in the cache
702 * and return the block marked busy. If the calling process is
703 * interrupted by a signal for an interruptible mount point, return
704 * NULL.
705 */
706static struct buf *
707nfs_getcacheblk(vp, bn, size, p)
708 struct vnode *vp;
709 daddr_t bn;
710 int size;
711 struct proc *p;
712{
713 register struct buf *bp;
714 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
715 int biosize = vp->v_mount->mnt_stat.f_iosize;
716
717 if (nmp->nm_flag & NFSMNT_INT) {
718 bp = getblk(vp, bn, size, PCATCH, 0);
719 while (bp == (struct buf *)0) {
720 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
721 return ((struct buf *)0);
722 bp = getblk(vp, bn, size, 0, 2 * hz);
723 }
724 } else
725 bp = getblk(vp, bn, size, 0, 0);
726
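	/*
	 * For regular files, express the biosize-unit logical block
	 * number in DEV_BSIZE units; nfs_doio() derives the file
	 * offset of an I/O from b_blkno * DEV_BSIZE.
	 */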
727	if (vp->v_type == VREG)
728 bp->b_blkno = (bn * biosize) / DEV_BSIZE;
729
730 return (bp);
731}
732
733/*
734 * Flush and invalidate all dirty buffers. If another process is already
735 * doing the flush, just wait for completion.
736 */
737int
738nfs_vinvalbuf(vp, flags, cred, p, intrflg)
739 struct vnode *vp;
740 int flags;
741 struct ucred *cred;
742 struct proc *p;
743 int intrflg;
744{
745 register struct nfsnode *np = VTONFS(vp);
746 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
747 int error = 0, slpflag, slptimeo;
748
749 if ((nmp->nm_flag & NFSMNT_INT) == 0)
750 intrflg = 0;
751 if (intrflg) {
752 slpflag = PCATCH;
753 slptimeo = 2 * hz;
754 } else {
755 slpflag = 0;
756 slptimeo = 0;
757 }
758 /*
759 * First wait for any other process doing a flush to complete.
760 */
761 while (np->n_flag & NFLUSHINPROG) {
762 np->n_flag |= NFLUSHWANT;
763 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
764 slptimeo);
765 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
766 return (EINTR);
767 }
768
769 /*
770 * Now, flush as required.
771 */
772 np->n_flag |= NFLUSHINPROG;
773 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
774 while (error) {
775 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
776 np->n_flag &= ~NFLUSHINPROG;
777 if (np->n_flag & NFLUSHWANT) {
778 np->n_flag &= ~NFLUSHWANT;
779 wakeup((caddr_t)&np->n_flag);
780 }
781 return (EINTR);
782 }
783 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
784 }
785 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
786 if (np->n_flag & NFLUSHWANT) {
787 np->n_flag &= ~NFLUSHWANT;
788 wakeup((caddr_t)&np->n_flag);
789 }
790 return (0);
791}
792
793/*
794 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
795 * This is mainly to avoid queueing async I/O requests when the nfsiods
796 * are all hung on a dead server.
797 */
798int
799nfs_asyncio(bp, cred)
800 register struct buf *bp;
801 struct ucred *cred;
802{
803 struct nfsmount *nmp;
804 int i;
805 int gotiod;
806 int slpflag = 0;
807 int slptimeo = 0;
808 int error;
809
810 if (nfs_numasync == 0)
811 return (EIO);
812
813 nmp = VFSTONFS(bp->b_vp->v_mount);
814again:
815 if (nmp->nm_flag & NFSMNT_INT)
816 slpflag = PCATCH;
817 gotiod = FALSE;
818
819 /*
820 * Find a free iod to process this request.
821 */
822 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
823 if (nfs_iodwant[i]) {
824 /*
825 * Found one, so wake it up and tell it which
826 * mount to process.
827 */
828 NFS_DPF(ASYNCIO,
829 ("nfs_asyncio: waking iod %d for mount %p\n",
830 i, nmp));
831 nfs_iodwant[i] = (struct proc *)0;
832 nfs_iodmount[i] = nmp;
833 nmp->nm_bufqiods++;
834 wakeup((caddr_t)&nfs_iodwant[i]);
835 gotiod = TRUE;
836 break;
837 }
838
839 /*
840 * If none are free, we may already have an iod working on this mount
841 * point. If so, it will process our request.
842 */
843 if (!gotiod) {
844 if (nmp->nm_bufqiods > 0) {
845 NFS_DPF(ASYNCIO,
846 ("nfs_asyncio: %d iods are already processing mount %p\n",
847 nmp->nm_bufqiods, nmp));
848 gotiod = TRUE;
849 }
850 }
851
852 /*
853 * If we have an iod which can process the request, then queue
854 * the buffer.
855 */
856 if (gotiod) {
857 /*
858 * Ensure that the queue never grows too large.
859 */
860 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
861 NFS_DPF(ASYNCIO,
862 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
863 nmp->nm_bufqwant = TRUE;
864 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
865 "nfsaio", slptimeo);
866 if (error) {
867 if (nfs_sigintr(nmp, NULL, bp->b_proc))
868 return (EINTR);
869 if (slpflag == PCATCH) {
870 slpflag = 0;
871 slptimeo = 2 * hz;
872 }
873 }
874 /*
875 * We might have lost our iod while sleeping,
876 * so check and loop if necessary.
877 */
878 if (nmp->nm_bufqiods == 0) {
879 NFS_DPF(ASYNCIO,
880 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
881 goto again;
882 }
883 }
884
885 if (bp->b_flags & B_READ) {
886 if (bp->b_rcred == NOCRED && cred != NOCRED) {
887 crhold(cred);
888 bp->b_rcred = cred;
889 }
890 } else {
891 bp->b_flags |= B_WRITEINPROG;
892 if (bp->b_wcred == NOCRED && cred != NOCRED) {
893 crhold(cred);
894 bp->b_wcred = cred;
895 }
896 }
897
898 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
899 nmp->nm_bufqlen++;
900 return (0);
901 }
902
903 /*
904 * All the iods are busy on other mounts, so return EIO to
905 * force the caller to process the i/o synchronously.
906 */
907 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
908 return (EIO);
909}
910
911/*
912 * Do an I/O operation to/from a cache block. This may be called
913 * synchronously or from an nfsiod.
914 */
915int
916nfs_doio(bp, cr, p)
917 register struct buf *bp;
918 struct ucred *cr;
919 struct proc *p;
920{
921 register struct uio *uiop;
922 register struct vnode *vp;
923 struct nfsnode *np;
924 struct nfsmount *nmp;
925 int error = 0, diff, len, iomode, must_commit = 0;
926 struct uio uio;
927 struct iovec io;
928
929 vp = bp->b_vp;
930 np = VTONFS(vp);
931 nmp = VFSTONFS(vp->v_mount);
932 uiop = &uio;
933 uiop->uio_iov = &io;
934 uiop->uio_iovcnt = 1;
935 uiop->uio_segflg = UIO_SYSSPACE;
936 uiop->uio_procp = p;
937
938 /*
939 * Historically, paging was done with physio, but no more.
940 */
941 if (bp->b_flags & B_PHYS) {
942 /*
943 * ...though reading /dev/drum still gets us here.
944 */
945 io.iov_len = uiop->uio_resid = bp->b_bcount;
946 /* mapping was done by vmapbuf() */
947 io.iov_base = bp->b_data;
948 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
949 if (bp->b_flags & B_READ) {
950 uiop->uio_rw = UIO_READ;
951 nfsstats.read_physios++;
952 error = nfs_readrpc(vp, uiop, cr);
953 } else {
954 int com;
955
956 iomode = NFSV3WRITE_DATASYNC;
957 uiop->uio_rw = UIO_WRITE;
958 nfsstats.write_physios++;
959 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
960 }
961 if (error) {
962 bp->b_flags |= B_ERROR;
963 bp->b_error = error;
964 }
965 } else if (bp->b_flags & B_READ) {
966 io.iov_len = uiop->uio_resid = bp->b_bcount;
967 io.iov_base = bp->b_data;
968 uiop->uio_rw = UIO_READ;
969 switch (vp->v_type) {
970 case VREG:
971 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
972 nfsstats.read_bios++;
973 error = nfs_readrpc(vp, uiop, cr);
974 if (!error) {
975 bp->b_validoff = 0;
976 if (uiop->uio_resid) {
977 /*
978 * If len > 0, there is a hole in the file and
979 * no writes after the hole have been pushed to
980 * the server yet.
981 * Just zero fill the rest of the valid area.
982 */
983 diff = bp->b_bcount - uiop->uio_resid;
984 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
985 + diff);
986 if (len > 0) {
987 len = min(len, uiop->uio_resid);
988 bzero((char *)bp->b_data + diff, len);
989 bp->b_validend = diff + len;
990 } else
991 bp->b_validend = diff;
992 } else
993 bp->b_validend = bp->b_bcount;
994 }
995 if (p && (vp->v_flag & VTEXT) &&
996 (((nmp->nm_flag & NFSMNT_NQNFS) &&
997 NQNFS_CKINVALID(vp, np, ND_READ) &&
998 np->n_lrev != np->n_brev) ||
999 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1000 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1001 uprintf("Process killed due to text file modification\n");
1002 psignal(p, SIGKILL);
1003#ifdef __NetBSD__
1004 p->p_holdcnt++;
1005#else
1006 p->p_flag |= P_NOSWAP;
1007#endif
1008 }
1009 break;
1010 case VLNK:
1011 uiop->uio_offset = (off_t)0;
1012 nfsstats.readlink_bios++;
1013 error = nfs_readlinkrpc(vp, uiop, cr);
1014 break;
1015 case VDIR:
1016 nfsstats.readdir_bios++;
1017 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1018 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1019 error = nfs_readdirplusrpc(vp, uiop, cr);
1020 if (error == NFSERR_NOTSUPP)
1021 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1022 }
1023 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1024 error = nfs_readdirrpc(vp, uiop, cr);
1025 break;
1026 default:
1027 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1028 break;
1029 };
1030 if (error) {
1031 bp->b_flags |= B_ERROR;
1032 bp->b_error = error;
1033 }
1034 } else {
1035 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
1036 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
1037
1038 if (bp->b_dirtyend > bp->b_dirtyoff) {
1039 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1040 - bp->b_dirtyoff;
1041 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
1042 + bp->b_dirtyoff;
1043 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1044 uiop->uio_rw = UIO_WRITE;
1045 nfsstats.write_bios++;
1046 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1047 iomode = NFSV3WRITE_UNSTABLE;
1048 else
1049 iomode = NFSV3WRITE_FILESYNC;
1050 bp->b_flags |= B_WRITEINPROG;
1051 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1052 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1053 bp->b_flags |= B_NEEDCOMMIT;
1054 if (bp->b_dirtyoff == 0
1055 && bp->b_dirtyend == bp->b_bufsize)
1056 bp->b_flags |= B_CLUSTEROK;
1057 } else
1058 bp->b_flags &= ~B_NEEDCOMMIT;
1059 bp->b_flags &= ~B_WRITEINPROG;
1060
1061 /*
1062 * For an interrupted write, the buffer is still valid
1063 * and the write hasn't been pushed to the server yet,
1064 * so we can't set B_ERROR and report the interruption
1065 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1066 * is not relevant, so the rpc attempt is essentially
1067 * a noop. For the case of a V3 write rpc not being
1068 * committed to stable storage, the block is still
1069 * dirty and requires either a commit rpc or another
1070 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1071 * the block is reused. This is indicated by setting
1072 * the B_DELWRI and B_NEEDCOMMIT flags.
1073 */
1074 if (error == EINTR
1075 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1076 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1077 bp->b_flags |= B_DELWRI;
1078
1079 /*
1080 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
1081 * buffer to the clean list, we have to reassign it back to the
1082 * dirty one. Ugh.
1083 */
1084 if (bp->b_flags & B_ASYNC)
1085 reassignbuf(bp, vp);
1086 else
1087 bp->b_flags |= B_EINTR;
1088 } else {
1089 if (error) {
1090 bp->b_flags |= B_ERROR;
1091 bp->b_error = np->n_error = error;
1092 np->n_flag |= NWRITEERR;
1093 }
1094 bp->b_dirtyoff = bp->b_dirtyend = 0;
1095 }
1096 } else {
1097 bp->b_resid = 0;
1098 biodone(bp);
1099 return (0);
1100 }
1101 }
1102 bp->b_resid = uiop->uio_resid;
1103 if (must_commit)
1104 nfs_clearcommit(vp->v_mount);
1105 biodone(bp);
1106 return (error);
1107}