1#define MSNFS	/* HACK HACK */
2/*
3 * linux/fs/nfsd/vfs.c
4 *
5 * File operations used by nfsd. Some of these have been ripped from
6 * other parts of the kernel because they weren't exported, others
7 * are partial duplicates with added or changed functionality.
8 *
9 * Note that several functions dget() the dentry upon which they want
10 * to act, most notably those that create directory entries. Response
11 * dentry's are dput()'d if necessary in the release callback.
12 * So if you notice code paths that apparently fail to dput() the
13 * dentry, don't worry--they have been taken care of.
14 *
15 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
17 */
18
19#include <linux/string.h>
20#include <linux/time.h>
21#include <linux/errno.h>
22#include <linux/fs.h>
23#include <linux/file.h>
24#include <linux/mount.h>
25#include <linux/major.h>
26#include <linux/ext2_fs.h>
27#include <linux/proc_fs.h>
28#include <linux/stat.h>
29#include <linux/fcntl.h>
30#include <linux/net.h>
31#include <linux/unistd.h>
32#include <linux/slab.h>
33#include <linux/pagemap.h>
34#include <linux/in.h>
35#include <linux/module.h>
36#include <linux/namei.h>
37#include <linux/vfs.h>
38#include <linux/delay.h>
39#include <linux/sunrpc/svc.h>
40#include <linux/nfsd/nfsd.h>
41#ifdef CONFIG_NFSD_V3
42#include <linux/nfs3.h>
43#include <linux/nfsd/xdr3.h>
44#endif /* CONFIG_NFSD_V3 */
45#include <linux/nfsd/nfsfh.h>
46#include <linux/quotaops.h>
47#include <linux/fsnotify.h>
48#include <linux/posix_acl.h>
49#include <linux/posix_acl_xattr.h>
50#include <linux/xattr.h>
51#ifdef CONFIG_NFSD_V4
52#include <linux/nfs4.h>
53#include <linux/nfs4_acl.h>
54#include <linux/nfsd_idmap.h>
55#include <linux/security.h>
56#endif /* CONFIG_NFSD_V4 */
57#include <linux/jhash.h>
58
59#include <asm/uaccess.h>
60
61#define NFSDDBG_FACILITY		NFSDDBG_FILEOP
62
63
/*
 * True for a regular file (and only a regular file) on which mandatory
 * locks may be in force.  nfsd has to refuse such files because it has
 * no way of knowing whether the remote accessor holds the lock.
 */
#define IS_ISMNDLK(i)	\
	(S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
69
70/*
71 * This is a cache of readahead params that help us choose the proper
72 * readahead strategy. Initially, we set all readahead parameters to 0
73 * and let the VFS handle things.
74 * If you increase the number of cached files very much, you'll need to
75 * add a hash table here.
76 */
77struct raparms {
78	struct raparms		*p_next;
79	unsigned int		p_count;
80	ino_t			p_ino;
81	dev_t			p_dev;
82	int			p_set;
83	struct file_ra_state	p_ra;
84	unsigned int		p_hindex;
85};
86
87struct raparm_hbucket {
88	struct raparms		*pb_head;
89	spinlock_t		pb_lock;
90} ____cacheline_aligned_in_smp;
91
92static struct raparms *		raparml;
93#define RAPARM_HASH_BITS	4
94#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
95#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
96static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
97
98/*
99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
100 * a mount point.
101 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
102 *  or nfs_ok having possibly changed *dpp and *expp
103 */
104int
105nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
106		        struct svc_export **expp)
107{
108	struct svc_export *exp = *expp, *exp2 = NULL;
109	struct dentry *dentry = *dpp;
110	struct vfsmount *mnt = mntget(exp->ex_mnt);
111	struct dentry *mounts = dget(dentry);
112	int err = 0;
113
114	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115
116	exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle);
117	if (IS_ERR(exp2)) {
118		err = PTR_ERR(exp2);
119		dput(mounts);
120		mntput(mnt);
121		goto out;
122	}
123	if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) {
124		/* successfully crossed mount point */
125		exp_put(exp);
126		*expp = exp2;
127		dput(dentry);
128		*dpp = mounts;
129	} else {
130		if (exp2) exp_put(exp2);
131		dput(mounts);
132	}
133	mntput(mnt);
134out:
135	return err;
136}
137
138/*
139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all
147 *   clients and is explicitly disallowed for NFSv3
148 *      NeilBrown <neilb@cse.unsw.edu.au>
149 */
150__be32
151nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
152					int len, struct svc_fh *resfh)
153{
154	struct svc_export	*exp;
155	struct dentry		*dparent;
156	struct dentry		*dentry;
157	__be32			err;
158	int			host_err;
159
160	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
161
162	/* Obtain dentry and export. */
163	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
164	if (err)
165		return err;
166
167	dparent = fhp->fh_dentry;
168	exp  = fhp->fh_export;
169	exp_get(exp);
170
171	err = nfserr_acces;
172
173	/* Lookup the name, but don't follow links */
174	if (isdotent(name, len)) {
175		if (len==1)
176			dentry = dget(dparent);
177		else if (dparent != exp->ex_dentry) {
178			dentry = dget_parent(dparent);
179		} else  if (!EX_NOHIDE(exp))
180			dentry = dget(dparent); /* .. == . just like at / */
181		else {
182			/* checking mountpoint crossing is very different when stepping up */
183			struct svc_export *exp2 = NULL;
184			struct dentry *dp;
185			struct vfsmount *mnt = mntget(exp->ex_mnt);
186			dentry = dget(dparent);
187			while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
188				;
189			dp = dget_parent(dentry);
190			dput(dentry);
191			dentry = dp;
192
193			exp2 = exp_parent(exp->ex_client, mnt, dentry,
194					  &rqstp->rq_chandle);
195			if (IS_ERR(exp2)) {
196				host_err = PTR_ERR(exp2);
197				dput(dentry);
198				mntput(mnt);
199				goto out_nfserr;
200			}
201			if (!exp2) {
202				dput(dentry);
203				dentry = dget(dparent);
204			} else {
205				exp_put(exp);
206				exp = exp2;
207			}
208			mntput(mnt);
209		}
210	} else {
211		fh_lock(fhp);
212		dentry = lookup_one_len(name, dparent, len);
213		host_err = PTR_ERR(dentry);
214		if (IS_ERR(dentry))
215			goto out_nfserr;
216		/*
217		 * check if we have crossed a mount point ...
218		 */
219		if (d_mountpoint(dentry)) {
220			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
221				dput(dentry);
222				goto out_nfserr;
223			}
224		}
225	}
226	/*
227	 * Note: we compose the file handle now, but as the
228	 * dentry may be negative, it may need to be updated.
229	 */
230	err = fh_compose(resfh, exp, dentry, fhp);
231	if (!err && !dentry->d_inode)
232		err = nfserr_noent;
233	dput(dentry);
234out:
235	exp_put(exp);
236	return err;
237
238out_nfserr:
239	err = nfserrno(host_err);
240	goto out;
241}
242
243/*
244 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put
246 */
247__be32
248nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
249	     int check_guard, time_t guardtime)
250{
251	struct dentry	*dentry;
252	struct inode	*inode;
253	int		accmode = MAY_SATTR;
254	int		ftype = 0;
255	int		imode;
256	__be32		err;
257	int		host_err;
258	int		size_change = 0;
259
260	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
261		accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
262	if (iap->ia_valid & ATTR_SIZE)
263		ftype = S_IFREG;
264
265	/* Get inode */
266	err = fh_verify(rqstp, fhp, ftype, accmode);
267	if (err)
268		goto out;
269
270	dentry = fhp->fh_dentry;
271	inode = dentry->d_inode;
272
273	/* Ignore any mode updates on symlinks */
274	if (S_ISLNK(inode->i_mode))
275		iap->ia_valid &= ~ATTR_MODE;
276
277	if (!iap->ia_valid)
278		goto out;
279
280	/* NFSv2 does not differentiate between "set-[ac]time-to-now"
281	 * which only requires access, and "set-[ac]time-to-X" which
282	 * requires ownership.
283	 * So if it looks like it might be "set both to the same time which
284	 * is close to now", and if inode_change_ok fails, then we
285	 * convert to "set to now" instead of "set to explicit time"
286	 *
287	 * We only call inode_change_ok as the last test as technically
288	 * it is not an interface that we should be using.  It is only
289	 * valid if the filesystem does not define it's own i_op->setattr.
290	 */
291#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
292#define	MAX_TOUCH_TIME_ERROR (30*60)
293	if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET
294	    && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec
295	    ) {
296	    /* Looks probable.  Now just make sure time is in the right ballpark.
297	     * Solaris, at least, doesn't seem to care what the time request is.
298	     * We require it be within 30 minutes of now.
299	     */
300	    time_t delta = iap->ia_atime.tv_sec - get_seconds();
301	    if (delta<0) delta = -delta;
302	    if (delta < MAX_TOUCH_TIME_ERROR &&
303		inode_change_ok(inode, iap) != 0) {
304		/* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME
305		 * this will cause notify_change to set these times to "now"
306		 */
307		iap->ia_valid &= ~BOTH_TIME_SET;
308	    }
309	}
310
311	/* The size case is special. It changes the file as well as the attributes.  */
312	if (iap->ia_valid & ATTR_SIZE) {
313		if (iap->ia_size < inode->i_size) {
314			err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315			if (err)
316				goto out;
317		}
318
319		/*
320		 * If we are changing the size of the file, then
321		 * we need to break all leases.
322		 */
323		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
324		if (host_err == -EWOULDBLOCK)
325			host_err = -ETIMEDOUT;
326		if (host_err) /* ENOMEM or EWOULDBLOCK */
327			goto out_nfserr;
328
329		host_err = get_write_access(inode);
330		if (host_err)
331			goto out_nfserr;
332
333		size_change = 1;
334		host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
335		if (host_err) {
336			put_write_access(inode);
337			goto out_nfserr;
338		}
339		DQUOT_INIT(inode);
340	}
341
342	imode = inode->i_mode;
343	if (iap->ia_valid & ATTR_MODE) {
344		iap->ia_mode &= S_IALLUGO;
345		imode = iap->ia_mode |= (imode & ~S_IALLUGO);
346	}
347
348	/* Revoke setuid/setgid bit on chown/chgrp */
349	if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid)
350		iap->ia_valid |= ATTR_KILL_SUID;
351	if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)
352		iap->ia_valid |= ATTR_KILL_SGID;
353
354	/* Change the attributes. */
355
356	iap->ia_valid |= ATTR_CTIME;
357
358	err = nfserr_notsync;
359	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
360		fh_lock(fhp);
361		host_err = notify_change(dentry, iap);
362		err = nfserrno(host_err);
363		fh_unlock(fhp);
364	}
365	if (size_change)
366		put_write_access(inode);
367	if (!err)
368		if (EX_ISSYNC(fhp->fh_export))
369			write_inode_now(inode, 1);
370out:
371	return err;
372
373out_nfserr:
374	err = nfserrno(host_err);
375	goto out;
376}
377
378#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) || \
379	defined(CONFIG_NFSD_V4)
380static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
381{
382	ssize_t buflen;
383
384	buflen = vfs_getxattr(dentry, key, NULL, 0);
385	if (buflen <= 0)
386		return buflen;
387
388	*buf = kmalloc(buflen, GFP_KERNEL);
389	if (!*buf)
390		return -ENOMEM;
391
392	return vfs_getxattr(dentry, key, *buf, buflen);
393}
394#endif
395
396#if defined(CONFIG_NFSD_V4)
397static int
398set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
399{
400	int len;
401	size_t buflen;
402	char *buf = NULL;
403	int error = 0;
404
405	buflen = posix_acl_xattr_size(pacl->a_count);
406	buf = kmalloc(buflen, GFP_KERNEL);
407	error = -ENOMEM;
408	if (buf == NULL)
409		goto out;
410
411	len = posix_acl_to_xattr(pacl, buf, buflen);
412	if (len < 0) {
413		error = len;
414		goto out;
415	}
416
417	error = vfs_setxattr(dentry, key, buf, len, 0);
418out:
419	kfree(buf);
420	return error;
421}
422
423__be32
424nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
425    struct nfs4_acl *acl)
426{
427	__be32 error;
428	int host_error;
429	struct dentry *dentry;
430	struct inode *inode;
431	struct posix_acl *pacl = NULL, *dpacl = NULL;
432	unsigned int flags = 0;
433
434	/* Get inode */
435	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
436	if (error)
437		goto out;
438
439	dentry = fhp->fh_dentry;
440	inode = dentry->d_inode;
441	if (S_ISDIR(inode->i_mode))
442		flags = NFS4_ACL_DIR;
443
444	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
445	if (host_error == -EINVAL) {
446		error = nfserr_attrnotsupp;
447		goto out;
448	} else if (host_error < 0)
449		goto out_nfserr;
450
451	host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
452	if (host_error < 0)
453		goto out_nfserr;
454
455	if (S_ISDIR(inode->i_mode)) {
456		host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
457		if (host_error < 0)
458			goto out_nfserr;
459	}
460
461	error = nfs_ok;
462
463out:
464	posix_acl_release(pacl);
465	posix_acl_release(dpacl);
466	return (error);
467out_nfserr:
468	if (host_error == -EOPNOTSUPP)
469		error = nfserr_attrnotsupp;
470	else
471		error = nfserrno(host_error);
472	goto out;
473}
474
475static struct posix_acl *
476_get_posix_acl(struct dentry *dentry, char *key)
477{
478	void *buf = NULL;
479	struct posix_acl *pacl = NULL;
480	int buflen;
481
482	buflen = nfsd_getxattr(dentry, key, &buf);
483	if (!buflen)
484		buflen = -ENODATA;
485	if (buflen <= 0)
486		return ERR_PTR(buflen);
487
488	pacl = posix_acl_from_xattr(buf, buflen);
489	kfree(buf);
490	return pacl;
491}
492
493int
494nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
495{
496	struct inode *inode = dentry->d_inode;
497	int error = 0;
498	struct posix_acl *pacl = NULL, *dpacl = NULL;
499	unsigned int flags = 0;
500
501	pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
502	if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
503		pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
504	if (IS_ERR(pacl)) {
505		error = PTR_ERR(pacl);
506		pacl = NULL;
507		goto out;
508	}
509
510	if (S_ISDIR(inode->i_mode)) {
511		dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
512		if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
513			dpacl = NULL;
514		else if (IS_ERR(dpacl)) {
515			error = PTR_ERR(dpacl);
516			dpacl = NULL;
517			goto out;
518		}
519		flags = NFS4_ACL_DIR;
520	}
521
522	*acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
523	if (IS_ERR(*acl)) {
524		error = PTR_ERR(*acl);
525		*acl = NULL;
526	}
527 out:
528	posix_acl_release(pacl);
529	posix_acl_release(dpacl);
530	return error;
531}
532
#endif /* defined(CONFIG_NFSD_V4) */
534
535#ifdef CONFIG_NFSD_V3
536/*
537 * Check server access rights to a file system object
538 */
539struct accessmap {
540	u32		access;
541	int		how;
542};
543static struct accessmap	nfs3_regaccess[] = {
544    {	NFS3_ACCESS_READ,	MAY_READ			},
545    {	NFS3_ACCESS_EXECUTE,	MAY_EXEC			},
546    {	NFS3_ACCESS_MODIFY,	MAY_WRITE|MAY_TRUNC		},
547    {	NFS3_ACCESS_EXTEND,	MAY_WRITE			},
548
549    {	0,			0				}
550};
551
552static struct accessmap	nfs3_diraccess[] = {
553    {	NFS3_ACCESS_READ,	MAY_READ			},
554    {	NFS3_ACCESS_LOOKUP,	MAY_EXEC			},
555    {	NFS3_ACCESS_MODIFY,	MAY_EXEC|MAY_WRITE|MAY_TRUNC	},
556    {	NFS3_ACCESS_EXTEND,	MAY_EXEC|MAY_WRITE		},
557    {	NFS3_ACCESS_DELETE,	MAY_REMOVE			},
558
559    {	0,			0				}
560};
561
562static struct accessmap	nfs3_anyaccess[] = {
563	/* Some clients - Solaris 2.6 at least, make an access call
564	 * to the server to check for access for things like /dev/null
565	 * (which really, the server doesn't care about).  So
566	 * We provide simple access checking for them, looking
567	 * mainly at mode bits, and we make sure to ignore read-only
568	 * filesystem checks
569	 */
570    {	NFS3_ACCESS_READ,	MAY_READ			},
571    {	NFS3_ACCESS_EXECUTE,	MAY_EXEC			},
572    {	NFS3_ACCESS_MODIFY,	MAY_WRITE|MAY_LOCAL_ACCESS	},
573    {	NFS3_ACCESS_EXTEND,	MAY_WRITE|MAY_LOCAL_ACCESS	},
574
575    {	0,			0				}
576};
577
578__be32
579nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
580{
581	struct accessmap	*map;
582	struct svc_export	*export;
583	struct dentry		*dentry;
584	u32			query, result = 0, sresult = 0;
585	__be32			error;
586
587	error = fh_verify(rqstp, fhp, 0, MAY_NOP);
588	if (error)
589		goto out;
590
591	export = fhp->fh_export;
592	dentry = fhp->fh_dentry;
593
594	if (S_ISREG(dentry->d_inode->i_mode))
595		map = nfs3_regaccess;
596	else if (S_ISDIR(dentry->d_inode->i_mode))
597		map = nfs3_diraccess;
598	else
599		map = nfs3_anyaccess;
600
601
602	query = *access;
603	for  (; map->access; map++) {
604		if (map->access & query) {
605			__be32 err2;
606
607			sresult |= map->access;
608
609			err2 = nfsd_permission(export, dentry, map->how);
610			switch (err2) {
611			case nfs_ok:
612				result |= map->access;
613				break;
614
615			/* the following error codes just mean the access was not allowed,
616			 * rather than an error occurred */
617			case nfserr_rofs:
618			case nfserr_acces:
619			case nfserr_perm:
620				/* simply don't "or" in the access bit. */
621				break;
622			default:
623				error = err2;
624				goto out;
625			}
626		}
627	}
628	*access = result;
629	if (supported)
630		*supported = sresult;
631
632 out:
633	return error;
634}
635#endif /* CONFIG_NFSD_V3 */
636
637
638
639/*
640 * Open an existing file or directory.
641 * The access argument indicates the type of open (read/write/lock)
642 * N.B. After this call fhp needs an fh_put
643 */
644__be32
645nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
646			int access, struct file **filp)
647{
648	struct dentry	*dentry;
649	struct inode	*inode;
650	int		flags = O_RDONLY|O_LARGEFILE;
651	__be32		err;
652	int		host_err;
653
654	/*
655	 * If we get here, then the client has already done an "open",
656	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
657	 * in case a chmod has now revoked permission.
658	 */
659	err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
660	if (err)
661		goto out;
662
663	dentry = fhp->fh_dentry;
664	inode = dentry->d_inode;
665
666	/* Disallow write access to files with the append-only bit set
667	 * or any access when mandatory locking enabled
668	 */
669	err = nfserr_perm;
670	if (IS_APPEND(inode) && (access & MAY_WRITE))
671		goto out;
672	if (IS_ISMNDLK(inode))
673		goto out;
674
675	if (!inode->i_fop)
676		goto out;
677
678	/*
679	 * Check to see if there are any leases on this file.
680	 * This may block while leases are broken.
681	 */
682	host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
683	if (host_err == -EWOULDBLOCK)
684		host_err = -ETIMEDOUT;
685	if (host_err) /* NOMEM or WOULDBLOCK */
686		goto out_nfserr;
687
688	if (access & MAY_WRITE) {
689		if (access & MAY_READ)
690			flags = O_RDWR|O_LARGEFILE;
691		else
692			flags = O_WRONLY|O_LARGEFILE;
693
694		DQUOT_INIT(inode);
695	}
696	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
697	if (IS_ERR(*filp))
698		host_err = PTR_ERR(*filp);
699out_nfserr:
700	err = nfserrno(host_err);
701out:
702	return err;
703}
704
/*
 * Close a file by dropping the reference held on filp.
 */
void
nfsd_close(struct file *filp)
{
	fput(filp);
}
713
714/*
715 * Sync a file
716 * As this calls fsync (not fdatasync) there is no need for a write_inode
717 * after it.
718 */
719static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
720			      const struct file_operations *fop)
721{
722	struct inode *inode = dp->d_inode;
723	int (*fsync) (struct file *, struct dentry *, int);
724	int err;
725
726	err = filemap_fdatawrite(inode->i_mapping);
727	if (err == 0 && fop && (fsync = fop->fsync))
728		err = fsync(filp, dp, 0);
729	if (err == 0)
730		err = filemap_fdatawait(inode->i_mapping);
731
732	return err;
733}
734
735
736static int
737nfsd_sync(struct file *filp)
738{
739        int err;
740	struct inode *inode = filp->f_path.dentry->d_inode;
741	dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
742	mutex_lock(&inode->i_mutex);
743	err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
744	mutex_unlock(&inode->i_mutex);
745
746	return err;
747}
748
749int
750nfsd_sync_dir(struct dentry *dp)
751{
752	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
753}
754
755/*
756 * Obtain the readahead parameters for the file
757 * specified by (dev, ino).
758 */
759
760static inline struct raparms *
761nfsd_get_raparms(dev_t dev, ino_t ino)
762{
763	struct raparms	*ra, **rap, **frap = NULL;
764	int depth = 0;
765	unsigned int hash;
766	struct raparm_hbucket *rab;
767
768	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
769	rab = &raparm_hash[hash];
770
771	spin_lock(&rab->pb_lock);
772	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
773		if (ra->p_ino == ino && ra->p_dev == dev)
774			goto found;
775		depth++;
776		if (ra->p_count == 0)
777			frap = rap;
778	}
779	depth = nfsdstats.ra_size*11/10;
780	if (!frap) {
781		spin_unlock(&rab->pb_lock);
782		return NULL;
783	}
784	rap = frap;
785	ra = *frap;
786	ra->p_dev = dev;
787	ra->p_ino = ino;
788	ra->p_set = 0;
789	ra->p_hindex = hash;
790found:
791	if (rap != &rab->pb_head) {
792		*rap = ra->p_next;
793		ra->p_next   = rab->pb_head;
794		rab->pb_head = ra;
795	}
796	ra->p_count++;
797	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
798	spin_unlock(&rab->pb_lock);
799	return ra;
800}
801
802/*
803 * Grab and keep cached pages assosiated with a file in the svc_rqst
804 * so that they can be passed to the netowork sendmsg/sendpage routines
805 * directrly. They will be released after the sending has completed.
806 */
807static int
808nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size)
809{
810	unsigned long count = desc->count;
811	struct svc_rqst *rqstp = desc->arg.data;
812	struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
813
814	if (size > count)
815		size = count;
816
817	if (rqstp->rq_res.page_len == 0) {
818		get_page(page);
819		put_page(*pp);
820		*pp = page;
821		rqstp->rq_resused++;
822		rqstp->rq_res.page_base = offset;
823		rqstp->rq_res.page_len = size;
824	} else if (page != pp[-1]) {
825		get_page(page);
826		if (*pp)
827			put_page(*pp);
828		*pp = page;
829		rqstp->rq_resused++;
830		rqstp->rq_res.page_len += size;
831	} else
832		rqstp->rq_res.page_len += size;
833
834	desc->count = count - size;
835	desc->written += size;
836	return size;
837}
838
839static __be32
840nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
841              loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
842{
843	struct inode *inode;
844	struct raparms	*ra;
845	mm_segment_t	oldfs;
846	__be32		err;
847	int		host_err;
848
849	err = nfserr_perm;
850	inode = file->f_path.dentry->d_inode;
851#ifdef MSNFS
852	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
853		(!lock_may_read(inode, offset, *count)))
854		goto out;
855#endif
856
857	/* Get readahead parameters */
858	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
859
860	if (ra && ra->p_set)
861		file->f_ra = ra->p_ra;
862
863	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
864		rqstp->rq_resused = 1;
865		host_err = file->f_op->sendfile(file, &offset, *count,
866						 nfsd_read_actor, rqstp);
867	} else {
868		oldfs = get_fs();
869		set_fs(KERNEL_DS);
870		host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
871		set_fs(oldfs);
872	}
873
874	/* Write back readahead params */
875	if (ra) {
876		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
877		spin_lock(&rab->pb_lock);
878		ra->p_ra = file->f_ra;
879		ra->p_set = 1;
880		ra->p_count--;
881		spin_unlock(&rab->pb_lock);
882	}
883
884	if (host_err >= 0) {
885		nfsdstats.io_read += host_err;
886		*count = host_err;
887		err = 0;
888		fsnotify_access(file->f_path.dentry);
889	} else
890		err = nfserrno(host_err);
891out:
892	return err;
893}
894
895static void kill_suid(struct dentry *dentry)
896{
897	struct iattr	ia;
898	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
899
900	mutex_lock(&dentry->d_inode->i_mutex);
901	notify_change(dentry, &ia);
902	mutex_unlock(&dentry->d_inode->i_mutex);
903}
904
905static __be32
906nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
907				loff_t offset, struct kvec *vec, int vlen,
908	   			unsigned long cnt, int *stablep)
909{
910	struct svc_export	*exp;
911	struct dentry		*dentry;
912	struct inode		*inode;
913	mm_segment_t		oldfs;
914	__be32			err = 0;
915	int			host_err;
916	int			stable = *stablep;
917
918#ifdef MSNFS
919	err = nfserr_perm;
920
921	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
922		(!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
923		goto out;
924#endif
925
926	dentry = file->f_path.dentry;
927	inode = dentry->d_inode;
928	exp   = fhp->fh_export;
929
930	/*
931	 * Request sync writes if
932	 *  -	the sync export option has been set, or
933	 *  -	the client requested O_SYNC behavior (NFSv3 feature).
934	 *  -   The file system doesn't support fsync().
935	 * When gathered writes have been configured for this volume,
936	 * flushing the data to disk is handled separately below.
937	 */
938
939	if (file->f_op->fsync == 0) {/* COMMIT3 cannot work */
940	       stable = 2;
941	       *stablep = 2; /* FILE_SYNC */
942	}
943
944	if (!EX_ISSYNC(exp))
945		stable = 0;
946	if (stable && !EX_WGATHER(exp))
947		file->f_flags |= O_SYNC;
948
949	/* Write the data. */
950	oldfs = get_fs(); set_fs(KERNEL_DS);
951	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
952	set_fs(oldfs);
953	if (host_err >= 0) {
954		nfsdstats.io_write += cnt;
955		fsnotify_modify(file->f_path.dentry);
956	}
957
958	/* clear setuid/setgid flag after write */
959	if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
960		kill_suid(dentry);
961
962	if (host_err >= 0 && stable) {
963		static ino_t	last_ino;
964		static dev_t	last_dev;
965
966		/*
967		 * Gathered writes: If another process is currently
968		 * writing to the file, there's a high chance
969		 * this is another nfsd (triggered by a bulk write
970		 * from a client's biod). Rather than syncing the
971		 * file with each write request, we sleep for 10 msec.
972		 *
973		 * I don't know if this roughly approximates
974		 * C. Juszak's idea of gathered writes, but it's a
975		 * nice and simple solution (IMHO), and it seems to
976		 * work:-)
977		 */
978		if (EX_WGATHER(exp)) {
979			if (atomic_read(&inode->i_writecount) > 1
980			    || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
981				dprintk("nfsd: write defer %d\n", current->pid);
982				msleep(10);
983				dprintk("nfsd: write resume %d\n", current->pid);
984			}
985
986			if (inode->i_state & I_DIRTY) {
987				dprintk("nfsd: write sync %d\n", current->pid);
988				host_err=nfsd_sync(file);
989			}
990		}
991		last_ino = inode->i_ino;
992		last_dev = inode->i_sb->s_dev;
993	}
994
995	dprintk("nfsd: write complete host_err=%d\n", host_err);
996	if (host_err >= 0)
997		err = 0;
998	else
999		err = nfserrno(host_err);
1000out:
1001	return err;
1002}
1003
1004/*
1005 * Read data from a file. count must contain the requested read count
1006 * on entry. On return, *count contains the number of bytes actually read.
1007 * N.B. After this call fhp needs an fh_put
1008 */
1009__be32
1010nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1011		loff_t offset, struct kvec *vec, int vlen,
1012		unsigned long *count)
1013{
1014	__be32		err;
1015
1016	if (file) {
1017		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1018				MAY_READ|MAY_OWNER_OVERRIDE);
1019		if (err)
1020			goto out;
1021		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1022	} else {
1023		err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
1024		if (err)
1025			goto out;
1026		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1027		nfsd_close(file);
1028	}
1029out:
1030	return err;
1031}
1032
1033/*
1034 * Write data to a file.
1035 * The stable flag requests synchronous writes.
1036 * N.B. After this call fhp needs an fh_put
1037 */
1038__be32
1039nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1040		loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
1041		int *stablep)
1042{
1043	__be32			err = 0;
1044
1045	if (file) {
1046		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1047				MAY_WRITE|MAY_OWNER_OVERRIDE);
1048		if (err)
1049			goto out;
1050		err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
1051				stablep);
1052	} else {
1053		err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
1054		if (err)
1055			goto out;
1056
1057		if (cnt)
1058			err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
1059					     cnt, stablep);
1060		nfsd_close(file);
1061	}
1062out:
1063	return err;
1064}
1065
1066#ifdef CONFIG_NFSD_V3
1067/*
1068 * Commit all pending writes to stable storage.
1069 * Strictly speaking, we could sync just the indicated file region here,
1070 * but there's currently no way we can ask the VFS to do so.
1071 *
1072 * Unfortunately we cannot lock the file to make sure we return full WCC
1073 * data to the client, as locking happens lower down in the filesystem.
1074 */
1075__be32
1076nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1077               loff_t offset, unsigned long count)
1078{
1079	struct file	*file;
1080	__be32		err;
1081
1082	if ((u64)count > ~(u64)offset)
1083		return nfserr_inval;
1084
1085	if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
1086		return err;
1087	if (EX_ISSYNC(fhp->fh_export)) {
1088		if (file->f_op && file->f_op->fsync) {
1089			err = nfserrno(nfsd_sync(file));
1090		} else {
1091			err = nfserr_notsupp;
1092		}
1093	}
1094
1095	nfsd_close(file);
1096	return err;
1097}
1098#endif /* CONFIG_NFSD_V3 */
1099
/*
 * Create a file system object named fname in the directory fhp.
 * The type switch below handles regular files (vfs_create), directories
 * (vfs_mkdir) and char/block devices, fifos and sockets (vfs_mknod).
 *
 * If the response fh has been verified, the parent directory should
 * already be locked. Note that the parent directory is left locked.
 *
 * @rqstp:  request being served (passed to fh_verify and nfsd_setattr)
 * @fhp:    file handle of the parent directory
 * @fname:  name of the new entry, flen bytes long
 * @iap:    requested attributes; ia_mode and ia_valid are modified here
 * @type:   S_IF* file type bits of the object to create
 * @rdev:   device number, only passed on to vfs_mknod
 * @resfhp: response file handle; composed here unless it already carries
 *          a dentry
 *
 * Returns 0 or an nfserr_* status: nfserr_perm for an empty name,
 * nfserr_exist for "."/".." or an already positive child, nfserr_notdir
 * if the parent has no lookup operation, otherwise whatever the VFS,
 * sync, setattr or fh_update step reported.
 *
 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
 */
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		int type, dev_t rdev, struct svc_fh *resfhp)
{
	struct dentry	*dentry, *dchild = NULL;
	struct inode	*dirp;
	__be32		err;
	int		host_err;

	/* Reject empty names and the "." / ".." entries up front. */
	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;

	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = dentry->d_inode;

	/* A parent without a lookup operation cannot hold entries. */
	err = nfserr_notdir;
	if(!dirp->i_op || !dirp->i_op->lookup)
		goto out;
	/*
	 * Check whether the response file handle has been verified yet.
	 * If it has, the parent directory should already be locked.
	 */
	if (!resfhp->fh_dentry) {
		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
		fh_lock_nested(fhp, I_MUTEX_PARENT);
		dchild = lookup_one_len(fname, dentry, flen);
		/* On failure dchild is an ERR_PTR; "out" will not dput it. */
		host_err = PTR_ERR(dchild);
		if (IS_ERR(dchild))
			goto out_nfserr;
		err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
		if (err)
			goto out;
	} else {
		/* called from nfsd_proc_create */
		/* Take our own reference so the dput at "out" is balanced. */
		dchild = dget(resfhp->fh_dentry);
		if (!fhp->fh_locked) {
			/* not actually possible */
			printk(KERN_ERR
				"nfsd_create: parent %s/%s not locked!\n",
				dentry->d_parent->d_name.name,
				dentry->d_name.name);
			err = nfserr_io;
			goto out;
		}
	}
	/*
	 * Make sure the child dentry is still negative ...
	 */
	err = nfserr_exist;
	if (dchild->d_inode) {
		dprintk("nfsd_create: dentry %s/%s not negative!\n",
			dentry->d_name.name, dchild->d_name.name);
		goto out;
	}

	/*
	 * Build the creation mode: permission bits from the request (or
	 * none if the client sent no mode) combined with the file type.
	 */
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;

	/*
	 * Get the dir op function pointer.
	 */
	err = 0;
	switch (type) {
	case S_IFREG:
		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
		break;
	case S_IFDIR:
		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
		break;
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
		break;
	default:
	        printk("nfsd: bad file type %o in nfsd_create\n", type);
		host_err = -EINVAL;
	}
	if (host_err < 0)
		goto out_nfserr;

	/*
	 * On a sync export, flush the directory and the new inode.  A sync
	 * failure is kept in err but does not stop the setattr below.
	 */
	if (EX_ISSYNC(fhp->fh_export)) {
		err = nfserrno(nfsd_sync_dir(dentry));
		write_inode_now(dchild->d_inode, 1);
	}


	/* Set file attributes. Mode has already been set and
	 * setting uid/gid works only for root. Irix appears to
	 * send along the gid when it tries to implement setgid
	 * directories via NFS.
	 */
	/* Note: this strips UID/GID/MODE from iap->ia_valid in place. */
	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
		/* A setattr failure takes precedence over a sync failure. */
		if (err2)
			err = err2;
	}
	/*
	 * Update the file handle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);
out:
	/* Drop the reference from lookup_one_len() or dget() above. */
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
	return err;

out_nfserr:
	/* Translate the host errno in host_err into an NFS status. */
	err = nfserrno(host_err);
	goto out;
}
1230
1231#ifdef CONFIG_NFSD_V3
/*
 * NFSv3 version of nfsd_create
 *
 * Looks up fname in the directory fhp, composes resfhp for it and, if the
 * name does not exist yet, creates a regular file with vfs_create().
 * What happens when the name already exists depends on createmode (see
 * the switch below).
 *
 * @iap:        requested attributes; ia_mode/ia_valid may be rewritten here
 * @createmode: NFS3_CREATE_UNCHECKED, NFS3_CREATE_GUARDED or
 *              NFS3_CREATE_EXCLUSIVE
 * @verifier:   two 32-bit words, only read for NFS3_CREATE_EXCLUSIVE
 * @truncp:     optional; when non-NULL and an existing regular file is hit
 *              in UNCHECKED mode, receives whether a truncate to size 0
 *              was requested instead of the truncate being done here
 * @created:    optional; set to 1 once vfs_create() has succeeded
 *
 * The parent directory is locked here and unlocked again at "out".
 */
__be32
nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		struct svc_fh *resfhp, int createmode, u32 *verifier,
	        int *truncp, int *created)
{
	struct dentry	*dentry, *dchild = NULL;
	struct inode	*dirp;
	__be32		err;
	int		host_err;
	__u32		v_mtime=0, v_atime=0;

	/* Reject empty names and the "." / ".." entries up front. */
	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;
	/* No mode supplied by the client: create with no permission bits. */
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = dentry->d_inode;

	/* Get all the sanity checks out of the way before
	 * we lock the parent. */
	err = nfserr_notdir;
	if(!dirp->i_op || !dirp->i_op->lookup)
		goto out;
	fh_lock_nested(fhp, I_MUTEX_PARENT);

	/*
	 * Compose the response file handle.
	 */
	dchild = lookup_one_len(fname, dentry, flen);
	/* On failure dchild is an ERR_PTR; "out" will not dput it. */
	host_err = PTR_ERR(dchild);
	if (IS_ERR(dchild))
		goto out_nfserr;

	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
	if (err)
		goto out;

	if (createmode == NFS3_CREATE_EXCLUSIVE) {
		/* solaris7 gets confused (bugid 4218508) if these have
		 * the high bit set, so just clear the high bits.
		 */
		v_mtime = verifier[0]&0x7fffffff;
		v_atime = verifier[1]&0x7fffffff;
	}

	/* The name already exists: decide per create mode. */
	if (dchild->d_inode) {
		err = 0;

		switch (createmode) {
		case NFS3_CREATE_UNCHECKED:
			if (! S_ISREG(dchild->d_inode->i_mode))
				err = nfserr_exist;
			else if (truncp) {
				/* in nfsv4, we need to treat this case a little
				 * differently.  we don't want to truncate the
				 * file now; this would be wrong if the OPEN
				 * fails for some other reason.  furthermore,
				 * if the size is nonzero, we should ignore it
				 * according to spec!
				 */
				*truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
			}
			else {
				/* Only a requested size is applied to the
				 * existing file; all other attributes are
				 * dropped before jumping to the setattr.
				 */
				iap->ia_valid &= ATTR_SIZE;
				goto set_attr;
			}
			break;
		case NFS3_CREATE_EXCLUSIVE:
			/* An empty file whose mtime/atime hold this verifier
			 * is accepted as success (err stays 0) --
			 * presumably a replay of the request that created it.
			 */
			if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
			    && dchild->d_inode->i_atime.tv_sec == v_atime
			    && dchild->d_inode->i_size  == 0 )
				break;
			 /* fallthru */
		case NFS3_CREATE_GUARDED:
			err = nfserr_exist;
		}
		/* Existing-file paths (other than set_attr) skip fh_update. */
		goto out;
	}

	host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
	if (host_err < 0)
		goto out_nfserr;
	if (created)
		*created = 1;

	if (EX_ISSYNC(fhp->fh_export)) {
		err = nfserrno(nfsd_sync_dir(dentry));
		/* setattr will sync the child (or not) */
	}

	if (createmode == NFS3_CREATE_EXCLUSIVE) {
		/* Cram the verifier into atime/mtime */
		/* This replaces, rather than extends, the client's ia_valid. */
		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
			| ATTR_MTIME_SET|ATTR_ATIME_SET;
		iap->ia_mtime.tv_sec = v_mtime;
		iap->ia_atime.tv_sec = v_atime;
		iap->ia_mtime.tv_nsec = 0;
		iap->ia_atime.tv_nsec = 0;
	}

	/* Set file attributes.
	 * Irix appears to send along the gid when it tries to
	 * implement setgid directories via NFS. Clear out all that cruft.
	 */
 set_attr:
	/* Note: this strips UID/GID/MODE from iap->ia_valid in place. */
	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
 		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
		/* A setattr failure takes precedence over a sync failure. */
		if (err2)
			err = err2;
	}

	/*
	 * Update the filehandle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);

 out:
	/* Also reached by the early exits taken before fh_lock_nested(). */
	fh_unlock(fhp);
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
 	return err;

 out_nfserr:
	/* Translate the host errno in host_err into an NFS status. */
	err = nfserrno(host_err);
	goto out;
}
1371#endif /* CONFIG_NFSD_V3 */
1372
1373/*
1374 * Read a symlink. On entry, *lenp must contain the maximum path length that
1375 * fits into the buffer. On return, it contains the true length.
1376 * N.B. After this call fhp needs an fh_put
1377 */
1378__be32
1379nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1380{
1381	struct dentry	*dentry;
1382	struct inode	*inode;
1383	mm_segment_t	oldfs;
1384	__be32		err;
1385	int		host_err;
1386
1387	err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
1388	if (err)
1389		goto out;
1390
1391	dentry = fhp->fh_dentry;
1392	inode = dentry->d_inode;
1393
1394	err = nfserr_inval;
1395	if (!inode->i_op || !inode->i_op->readlink)
1396		goto out;
1397
1398	touch_atime(fhp->fh_export->ex_mnt, dentry);
1399	/* N.B. Why does this call need a get_fs()??
1400	 * Remove the set_fs and watch the fireworks:-) --okir
1401	 */
1402
1403	oldfs = get_fs(); set_fs(KERNEL_DS);
1404	host_err = inode->i_op->readlink(dentry, buf, *lenp);
1405	set_fs(oldfs);
1406
1407	if (host_err < 0)
1408		goto out_nfserr;
1409	*lenp = host_err;
1410	err = 0;
1411out:
1412	return err;
1413
1414out_nfserr:
1415	err = nfserrno(host_err);
1416	goto out;
1417}
1418
1419/*
1420 * Create a symlink and look up its inode
1421 * N.B. After this call _both_ fhp and resfhp need an fh_put
1422 */
1423__be32
1424nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1425				char *fname, int flen,
1426				char *path,  int plen,
1427				struct svc_fh *resfhp,
1428				struct iattr *iap)
1429{
1430	struct dentry	*dentry, *dnew;
1431	__be32		err, cerr;
1432	int		host_err;
1433	umode_t		mode;
1434
1435	err = nfserr_noent;
1436	if (!flen || !plen)
1437		goto out;
1438	err = nfserr_exist;
1439	if (isdotent(fname, flen))
1440		goto out;
1441
1442	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1443	if (err)
1444		goto out;
1445	fh_lock(fhp);
1446	dentry = fhp->fh_dentry;
1447	dnew = lookup_one_len(fname, dentry, flen);
1448	host_err = PTR_ERR(dnew);
1449	if (IS_ERR(dnew))
1450		goto out_nfserr;
1451
1452	mode = S_IALLUGO;
1453	/* Only the MODE ATTRibute is even vaguely meaningful */
1454	if (iap && (iap->ia_valid & ATTR_MODE))
1455		mode = iap->ia_mode & S_IALLUGO;
1456
1457	if (unlikely(path[plen] != 0)) {
1458		char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1459		if (path_alloced == NULL)
1460			host_err = -ENOMEM;
1461		else {
1462			strncpy(path_alloced, path, plen);
1463			path_alloced[plen] = 0;
1464			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
1465			kfree(path_alloced);
1466		}
1467	} else
1468		host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
1469
1470	if (!host_err) {
1471		if (EX_ISSYNC(fhp->fh_export))
1472			host_err = nfsd_sync_dir(dentry);
1473	}
1474	err = nfserrno(host_err);
1475	fh_unlock(fhp);
1476
1477	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
1478	dput(dnew);
1479	if (err==0) err = cerr;
1480out:
1481	return err;
1482
1483out_nfserr:
1484	err = nfserrno(host_err);
1485	goto out;
1486}
1487
1488/*
1489 * Create a hardlink
1490 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1491 */
1492__be32
1493nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1494				char *name, int len, struct svc_fh *tfhp)
1495{
1496	struct dentry	*ddir, *dnew, *dold;
1497	struct inode	*dirp, *dest;
1498	__be32		err;
1499	int		host_err;
1500
1501	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
1502	if (err)
1503		goto out;
1504	err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
1505	if (err)
1506		goto out;
1507
1508	err = nfserr_perm;
1509	if (!len)
1510		goto out;
1511	err = nfserr_exist;
1512	if (isdotent(name, len))
1513		goto out;
1514
1515	fh_lock_nested(ffhp, I_MUTEX_PARENT);
1516	ddir = ffhp->fh_dentry;
1517	dirp = ddir->d_inode;
1518
1519	dnew = lookup_one_len(name, ddir, len);
1520	host_err = PTR_ERR(dnew);
1521	if (IS_ERR(dnew))
1522		goto out_nfserr;
1523
1524	dold = tfhp->fh_dentry;
1525	dest = dold->d_inode;
1526
1527	host_err = vfs_link(dold, dirp, dnew);
1528	if (!host_err) {
1529		if (EX_ISSYNC(ffhp->fh_export)) {
1530			err = nfserrno(nfsd_sync_dir(ddir));
1531			write_inode_now(dest, 1);
1532		}
1533		err = 0;
1534	} else {
1535		if (host_err == -EXDEV && rqstp->rq_vers == 2)
1536			err = nfserr_acces;
1537		else
1538			err = nfserrno(host_err);
1539	}
1540
1541	dput(dnew);
1542out_unlock:
1543	fh_unlock(ffhp);
1544out:
1545	return err;
1546
1547out_nfserr:
1548	err = nfserrno(host_err);
1549	goto out_unlock;
1550}
1551
/*
 * Rename a file
 *
 * Moves the entry fname in directory ffhp to tname in directory tfhp.
 * Both handles must belong to the same export; otherwise the request
 * fails with nfserr_acces (NFSv2) or nfserr_xdev (other versions).
 * Empty names and "."/".." are rejected with nfserr_perm.
 *
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
			    struct svc_fh *tfhp, char *tname, int tlen)
{
	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
	struct inode	*fdir, *tdir;
	__be32		err;
	int		host_err;

	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
	if (err)
		goto out;
	err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	fdentry = ffhp->fh_dentry;
	fdir = fdentry->d_inode;

	tdentry = tfhp->fh_dentry;
	tdir = tdentry->d_inode;

	/* Renames across exports are refused. */
	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
	if (ffhp->fh_export != tfhp->fh_export)
		goto out;

	err = nfserr_perm;
	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
		goto out;

	/* cannot use fh_lock as we need deadlock protective ordering
	 * so do it by hand */
	trap = lock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = 1;
	fill_pre_wcc(ffhp);
	fill_pre_wcc(tfhp);

	odentry = lookup_one_len(fname, fdentry, flen);
	host_err = PTR_ERR(odentry);
	if (IS_ERR(odentry))
		goto out_nfserr;

	/* The source must exist ... */
	host_err = -ENOENT;
	if (!odentry->d_inode)
		goto out_dput_old;
	/* ... and must not be the dentry lock_rename() returned. */
	host_err = -EINVAL;
	if (odentry == trap)
		goto out_dput_old;

	ndentry = lookup_one_len(tname, tdentry, tlen);
	host_err = PTR_ERR(ndentry);
	if (IS_ERR(ndentry))
		goto out_dput_old;
	/* Likewise the target must not be the dentry lock_rename() returned. */
	host_err = -ENOTEMPTY;
	if (ndentry == trap)
		goto out_dput_new;

#ifdef MSNFS
	/* On NFSEXP_MSNFS exports refuse the rename with -EPERM while
	 * either dentry has a reference count above one. */
	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
		((atomic_read(&odentry->d_count) > 1)
		 || (atomic_read(&ndentry->d_count) > 1))) {
			host_err = -EPERM;
	} else
#endif
	/* With MSNFS defined this statement is the else branch above. */
	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
	/* On sync exports flush the target directory, then the source. */
	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
		host_err = nfsd_sync_dir(tdentry);
		if (!host_err)
			host_err = nfsd_sync_dir(fdentry);
	}

 out_dput_new:
	dput(ndentry);
 out_dput_old:
	dput(odentry);
 out_nfserr:
	/* Translate the host errno in host_err into an NFS status. */
	err = nfserrno(host_err);

	/* we cannot rely on fh_unlock on the two filehandles,
	 * as that would do the wrong thing if the two directories
	 * were the same, so again we do it by hand
	 */
	fill_post_wcc(ffhp);
	fill_post_wcc(tfhp);
	unlock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = 0;

out:
	return err;
}
1646
1647/*
1648 * Unlink a file or directory
1649 * N.B. After this call fhp needs an fh_put
1650 */
1651__be32
1652nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1653				char *fname, int flen)
1654{
1655	struct dentry	*dentry, *rdentry;
1656	struct inode	*dirp;
1657	__be32		err;
1658	int		host_err;
1659
1660	err = nfserr_acces;
1661	if (!flen || isdotent(fname, flen))
1662		goto out;
1663	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
1664	if (err)
1665		goto out;
1666
1667	fh_lock_nested(fhp, I_MUTEX_PARENT);
1668	dentry = fhp->fh_dentry;
1669	dirp = dentry->d_inode;
1670
1671	rdentry = lookup_one_len(fname, dentry, flen);
1672	host_err = PTR_ERR(rdentry);
1673	if (IS_ERR(rdentry))
1674		goto out_nfserr;
1675
1676	if (!rdentry->d_inode) {
1677		dput(rdentry);
1678		err = nfserr_noent;
1679		goto out;
1680	}
1681
1682	if (!type)
1683		type = rdentry->d_inode->i_mode & S_IFMT;
1684
1685	if (type != S_IFDIR) { /* It's UNLINK */
1686#ifdef MSNFS
1687		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1688			(atomic_read(&rdentry->d_count) > 1)) {
1689			host_err = -EPERM;
1690		} else
1691#endif
1692		host_err = vfs_unlink(dirp, rdentry);
1693	} else { /* It's RMDIR */
1694		host_err = vfs_rmdir(dirp, rdentry);
1695	}
1696
1697	dput(rdentry);
1698
1699	if (host_err)
1700		goto out_nfserr;
1701	if (EX_ISSYNC(fhp->fh_export))
1702		host_err = nfsd_sync_dir(dentry);
1703
1704out_nfserr:
1705	err = nfserrno(host_err);
1706out:
1707	return err;
1708}
1709
1710/*
1711 * Read entries from a directory.
1712 * The  NFSv3/4 verifier we ignore for now.
1713 */
1714__be32
1715nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1716	     struct readdir_cd *cdp, filldir_t func)
1717{
1718	__be32		err;
1719	int 		host_err;
1720	struct file	*file;
1721	loff_t		offset = *offsetp;
1722
1723	err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
1724	if (err)
1725		goto out;
1726
1727	offset = vfs_llseek(file, offset, 0);
1728	if (offset < 0) {
1729		err = nfserrno((int)offset);
1730		goto out_close;
1731	}
1732
1733	/*
1734	 * Read the directory entries. This silly loop is necessary because
1735	 * readdir() is not guaranteed to fill up the entire buffer, but
1736	 * may choose to do less.
1737	 */
1738
1739	do {
1740		cdp->err = nfserr_eof; /* will be cleared on successful read */
1741		host_err = vfs_readdir(file, func, cdp);
1742	} while (host_err >=0 && cdp->err == nfs_ok);
1743	if (host_err)
1744		err = nfserrno(host_err);
1745	else
1746		err = cdp->err;
1747	*offsetp = vfs_llseek(file, 0, 1);
1748
1749	if (err == nfserr_eof || err == nfserr_toosmall)
1750		err = nfs_ok; /* can still be found in ->err */
1751out_close:
1752	nfsd_close(file);
1753out:
1754	return err;
1755}
1756
1757/*
1758 * Get file system stats
1759 * N.B. After this call fhp needs an fh_put
1760 */
1761__be32
1762nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1763{
1764	__be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
1765	if (!err && vfs_statfs(fhp->fh_dentry,stat))
1766		err = nfserr_io;
1767	return err;
1768}
1769
1770/*
1771 * Check for a user's access permissions to this inode.
1772 */
1773__be32
1774nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1775{
1776	struct inode	*inode = dentry->d_inode;
1777	int		err;
1778
1779	if (acc == MAY_NOP)
1780		return 0;
1781
1782	/* Normally we reject any write/sattr etc access on a read-only file
1783	 * system.  But if it is IRIX doing check on write-access for a
1784	 * device special file, we ignore rofs.
1785	 */
1786	if (!(acc & MAY_LOCAL_ACCESS))
1787		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1788			if (EX_RDONLY(exp) || IS_RDONLY(inode))
1789				return nfserr_rofs;
1790			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1791				return nfserr_perm;
1792		}
1793	if ((acc & MAY_TRUNC) && IS_APPEND(inode))
1794		return nfserr_perm;
1795
1796	if (acc & MAY_LOCK) {
1797		/* If we cannot rely on authentication in NLM requests,
1798		 * just allow locks, otherwise require read permission, or
1799		 * ownership
1800		 */
1801		if (exp->ex_flags & NFSEXP_NOAUTHNLM)
1802			return 0;
1803		else
1804			acc = MAY_READ | MAY_OWNER_OVERRIDE;
1805	}
1806	/*
1807	 * The file owner always gets access permission for accesses that
1808	 * would normally be checked at open time. This is to make
1809	 * file access work even when the client has done a fchmod(fd, 0).
1810	 *
1811	 * However, `cp foo bar' should fail nevertheless when bar is
1812	 * readonly. A sensible way to do this might be to reject all
1813	 * attempts to truncate a read-only file, because a creat() call
1814	 * always implies file truncation.
1815	 * ... but this isn't really fair.  A process may reasonably call
1816	 * ftruncate on an open file descriptor on a file with perm 000.
1817	 * We must trust the client to do permission checking - using "ACCESS"
1818	 * with NFSv3.
1819	 */
1820	if ((acc & MAY_OWNER_OVERRIDE) &&
1821	    inode->i_uid == current->fsuid)
1822		return 0;
1823
1824	err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
1825
1826	/* Allow read access to binaries even when mode 111 */
1827	if (err == -EACCES && S_ISREG(inode->i_mode) &&
1828	    acc == (MAY_READ | MAY_OWNER_OVERRIDE))
1829		err = permission(inode, MAY_EXEC, NULL);
1830
1831	return err? nfserrno(err) : 0;
1832}
1833
1834void
1835nfsd_racache_shutdown(void)
1836{
1837	if (!raparml)
1838		return;
1839	dprintk("nfsd: freeing readahead buffers.\n");
1840	kfree(raparml);
1841	raparml = NULL;
1842}
1843/*
1844 * Initialize readahead param cache
1845 */
1846int
1847nfsd_racache_init(int cache_size)
1848{
1849	int	i;
1850	int	j = 0;
1851	int	nperbucket;
1852
1853
1854	if (raparml)
1855		return 0;
1856	if (cache_size < 2*RAPARM_HASH_SIZE)
1857		cache_size = 2*RAPARM_HASH_SIZE;
1858	raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
1859
1860	if (!raparml) {
1861		printk(KERN_WARNING
1862			"nfsd: Could not allocate memory read-ahead cache.\n");
1863		return -ENOMEM;
1864	}
1865
1866	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1867	for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
1868		raparm_hash[i].pb_head = NULL;
1869		spin_lock_init(&raparm_hash[i].pb_lock);
1870	}
1871	nperbucket = cache_size >> RAPARM_HASH_BITS;
1872	for (i = 0; i < cache_size - 1; i++) {
1873		if (i % nperbucket == 0)
1874			raparm_hash[j++].pb_head = raparml + i;
1875		if (i % nperbucket < nperbucket-1)
1876			raparml[i].p_next = raparml + i + 1;
1877	}
1878
1879	nfsdstats.ra_size = cache_size;
1880	return 0;
1881}
1882
1883#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
1884struct posix_acl *
1885nfsd_get_posix_acl(struct svc_fh *fhp, int type)
1886{
1887	struct inode *inode = fhp->fh_dentry->d_inode;
1888	char *name;
1889	void *value = NULL;
1890	ssize_t size;
1891	struct posix_acl *acl;
1892
1893	if (!IS_POSIXACL(inode))
1894		return ERR_PTR(-EOPNOTSUPP);
1895
1896	switch (type) {
1897	case ACL_TYPE_ACCESS:
1898		name = POSIX_ACL_XATTR_ACCESS;
1899		break;
1900	case ACL_TYPE_DEFAULT:
1901		name = POSIX_ACL_XATTR_DEFAULT;
1902		break;
1903	default:
1904		return ERR_PTR(-EOPNOTSUPP);
1905	}
1906
1907	size = nfsd_getxattr(fhp->fh_dentry, name, &value);
1908	if (size < 0)
1909		return ERR_PTR(size);
1910
1911	acl = posix_acl_from_xattr(value, size);
1912	kfree(value);
1913	return acl;
1914}
1915
1916int
1917nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
1918{
1919	struct inode *inode = fhp->fh_dentry->d_inode;
1920	char *name;
1921	void *value = NULL;
1922	size_t size;
1923	int error;
1924
1925	if (!IS_POSIXACL(inode) || !inode->i_op ||
1926	    !inode->i_op->setxattr || !inode->i_op->removexattr)
1927		return -EOPNOTSUPP;
1928	switch(type) {
1929		case ACL_TYPE_ACCESS:
1930			name = POSIX_ACL_XATTR_ACCESS;
1931			break;
1932		case ACL_TYPE_DEFAULT:
1933			name = POSIX_ACL_XATTR_DEFAULT;
1934			break;
1935		default:
1936			return -EOPNOTSUPP;
1937	}
1938
1939	if (acl && acl->a_count) {
1940		size = posix_acl_xattr_size(acl->a_count);
1941		value = kmalloc(size, GFP_KERNEL);
1942		if (!value)
1943			return -ENOMEM;
1944		error = posix_acl_to_xattr(acl, value, size);
1945		if (error < 0)
1946			goto getout;
1947		size = error;
1948	} else
1949		size = 0;
1950
1951	if (size)
1952		error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
1953	else {
1954		if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
1955			error = 0;
1956		else {
1957			error = vfs_removexattr(fhp->fh_dentry, name);
1958			if (error == -ENODATA)
1959				error = 0;
1960		}
1961	}
1962
1963getout:
1964	kfree(value);
1965	return error;
1966}
1967#endif  /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
1968