1/*
2 *  linux/fs/namei.c
3 *
4 *  Copyright (C) 1991, 1992  Linus Torvalds
5 */
6
7/*
8 * Some corrections by tytso.
9 */
10
11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12 * lookup logic.
13 */
14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15 */
16
17#include <linux/init.h>
18#include <linux/module.h>
19#include <linux/slab.h>
20#include <linux/fs.h>
21#include <linux/namei.h>
22#include <linux/pagemap.h>
23#include <linux/fsnotify.h>
24#include <linux/personality.h>
25#include <linux/security.h>
26#include <linux/ima.h>
27#include <linux/syscalls.h>
28#include <linux/mount.h>
29#include <linux/audit.h>
30#include <linux/capability.h>
31#include <linux/file.h>
32#include <linux/fcntl.h>
33#include <linux/device_cgroup.h>
34#include <linux/fs_struct.h>
35#include <asm/uaccess.h>
36
37#include "internal.h"
38
39/* [Feb-1997 T. Schoebel-Theuer]
40 * Fundamental changes in the pathname lookup mechanisms (namei)
41 * were necessary because of omirr.  The reason is that omirr needs
42 * to know the _real_ pathname, not the user-supplied one, in case
43 * of symlinks (and also when transname replacements occur).
44 *
45 * The new code replaces the old recursive symlink resolution with
46 * an iterative one (in case of non-nested symlink chains).  It does
47 * this with calls to <fs>_follow_link().
48 * As a side effect, dir_namei(), _namei() and follow_link() are now
49 * replaced with a single function lookup_dentry() that can handle all
50 * the special cases of the former code.
51 *
52 * With the new dcache, the pathname is stored at each inode, at least as
53 * long as the refcount of the inode is positive.  As a side effect, the
54 * size of the dcache depends on the inode cache and thus is dynamic.
55 *
56 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
57 * resolution to correspond with current state of the code.
58 *
59 * Note that the symlink resolution is not *completely* iterative.
60 * There is still a significant amount of tail- and mid- recursion in
61 * the algorithm.  Also, note that <fs>_readlink() is not used in
62 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
63 * may return different results than <fs>_follow_link().  Many virtual
64 * filesystems (including /proc) exhibit this behavior.
65 */
66
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics also affects mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
83
84/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
85 * semantics.  See the comments in "open_namei" and "do_link" below.
86 *
87 * [10-Sep-98 Alan Modra] Another symlink change.
88 */
89
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in the case where
 * flags==O_CREAT restored for 2.4. This is the last surviving part of the
 * old 4.2BSD bug. During 2.4 we need to fix the userland stuff depending
 * on it - hopefully we will be able to get rid of that wart in 2.5. So far
 * only XEmacs seems to be relying on it...
 */
104/*
105 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
106 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
107 * any extra contention...
108 */
109
110/* In order to reduce some races, while at the same time doing additional
111 * checking and hopefully speeding things up, we copy filenames to the
112 * kernel data space before using them..
113 *
114 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
115 * PATH_MAX includes the nul terminator --RR.
116 */
117static int do_getname(const char __user *filename, char *page)
118{
119	int retval;
120	unsigned long len = PATH_MAX;
121
122	if (!segment_eq(get_fs(), KERNEL_DS)) {
123		if ((unsigned long) filename >= TASK_SIZE)
124			return -EFAULT;
125		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
126			len = TASK_SIZE - (unsigned long) filename;
127	}
128
129	retval = strncpy_from_user(page, filename, len);
130	if (retval > 0) {
131		if (retval < len)
132			return 0;
133		return -ENAMETOOLONG;
134	} else if (!retval)
135		retval = -ENOENT;
136	return retval;
137}
138
139char * getname(const char __user * filename)
140{
141	char *tmp, *result;
142
143	result = ERR_PTR(-ENOMEM);
144	tmp = __getname();
145	if (tmp)  {
146		int retval = do_getname(filename, tmp);
147
148		result = tmp;
149		if (retval < 0) {
150			__putname(tmp);
151			result = ERR_PTR(retval);
152		}
153	}
154	audit_getname(result);
155	return result;
156}
157
#ifdef CONFIG_AUDITSYSCALL
/*
 * Dispose of a pathname buffer.  If the audit context is not a dummy one
 * the name is handed to audit_putname(); otherwise it is released directly
 * with __putname().
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context())) {
		audit_putname(name);
		return;
	}
	__putname(name);
}
EXPORT_SYMBOL(putname);
#endif
168
169/*
170 * This does basic POSIX ACL permission checking
171 */
172static int acl_permission_check(struct inode *inode, int mask,
173		int (*check_acl)(struct inode *inode, int mask))
174{
175	umode_t			mode = inode->i_mode;
176
177	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178
179	if (current_fsuid() == inode->i_uid)
180		mode >>= 6;
181	else {
182		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183			int error = check_acl(inode, mask);
184			if (error != -EAGAIN)
185				return error;
186		}
187
188		if (in_group_p(inode->i_gid))
189			mode >>= 3;
190	}
191
192	/*
193	 * If the DACs are ok we don't need any capability check.
194	 */
195	if ((mask & ~mode) == 0)
196		return 0;
197	return -EACCES;
198}
199
200/**
201 * generic_permission  -  check for access rights on a Posix-like filesystem
202 * @inode:	inode to check access rights for
203 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl:	optional callback to check for Posix ACLs
205 *
206 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which
209 * are used for other things..
210 */
211int generic_permission(struct inode *inode, int mask,
212		int (*check_acl)(struct inode *inode, int mask))
213{
214	int ret;
215
216	/*
217	 * Do the basic POSIX ACL permission checks.
218	 */
219	ret = acl_permission_check(inode, mask, check_acl);
220	if (ret != -EACCES)
221		return ret;
222
223	/*
224	 * Read/write DACs are always overridable.
225	 * Executable DACs are overridable if at least one exec bit is set.
226	 */
227	if (!(mask & MAY_EXEC) || execute_ok(inode))
228		if (capable(CAP_DAC_OVERRIDE))
229			return 0;
230
231	/*
232	 * Searching includes executable on directories, else just read.
233	 */
234	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
235	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
236		if (capable(CAP_DAC_READ_SEARCH))
237			return 0;
238
239	return -EACCES;
240}
241
242/**
243 * inode_permission  -  check for access rights to a given inode
244 * @inode:	inode to check permission on
245 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
246 *
247 * Used to check for read/write/execute permissions on an inode.
248 * We use "fsuid" for this, letting us set arbitrary permissions
249 * for filesystem access without changing the "normal" uids which
250 * are used for other things.
251 */
252int inode_permission(struct inode *inode, int mask)
253{
254	int retval;
255
256	if (mask & MAY_WRITE) {
257		umode_t mode = inode->i_mode;
258
259		/*
260		 * Nobody gets write access to a read-only fs.
261		 */
262		if (IS_RDONLY(inode) &&
263		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
264			return -EROFS;
265
266		/*
267		 * Nobody gets write access to an immutable file.
268		 */
269		if (IS_IMMUTABLE(inode))
270			return -EACCES;
271	}
272
273	if (inode->i_op->permission)
274		retval = inode->i_op->permission(inode, mask);
275	else
276		retval = generic_permission(inode, mask, inode->i_op->check_acl);
277
278	if (retval)
279		return retval;
280
281	retval = devcgroup_inode_permission(inode, mask);
282	if (retval)
283		return retval;
284
285	return security_inode_permission(inode, mask);
286}
287
288/**
289 * file_permission  -  check for additional access rights to a given file
290 * @file:	file to check access rights for
291 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
292 *
293 * Used to check for read/write/execute permissions on an already opened
294 * file.
295 *
296 * Note:
297 *	Do not use this function in new code.  All access checks should
298 *	be done using inode_permission().
299 */
300int file_permission(struct file *file, int mask)
301{
302	return inode_permission(file->f_path.dentry->d_inode, mask);
303}
304
305/*
306 * get_write_access() gets write permission for a file.
307 * put_write_access() releases this write permission.
308 * This is used for regular files.
309 * We cannot support write (and maybe mmap read-write shared) accesses and
310 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
311 * can have the following values:
312 * 0: no writers, no VM_DENYWRITE mappings
313 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
314 * > 0: (i_writecount) users are writing to the file.
315 *
316 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
317 * except for the cases where we don't hold i_writecount yet. Then we need to
318 * use {get,deny}_write_access() - these functions check the sign and refuse
319 * to do the change if sign is wrong. Exclusion between them is provided by
320 * the inode->i_lock spinlock.
321 */
322
323int get_write_access(struct inode * inode)
324{
325	spin_lock(&inode->i_lock);
326	if (atomic_read(&inode->i_writecount) < 0) {
327		spin_unlock(&inode->i_lock);
328		return -ETXTBSY;
329	}
330	atomic_inc(&inode->i_writecount);
331	spin_unlock(&inode->i_lock);
332
333	return 0;
334}
335
336int deny_write_access(struct file * file)
337{
338	struct inode *inode = file->f_path.dentry->d_inode;
339
340	spin_lock(&inode->i_lock);
341	if (atomic_read(&inode->i_writecount) > 0) {
342		spin_unlock(&inode->i_lock);
343		return -ETXTBSY;
344	}
345	atomic_dec(&inode->i_writecount);
346	spin_unlock(&inode->i_lock);
347
348	return 0;
349}
350
351/**
352 * path_get - get a reference to a path
353 * @path: path to get the reference to
354 *
355 * Given a path increment the reference count to the dentry and the vfsmount.
356 */
357void path_get(struct path *path)
358{
359	mntget(path->mnt);
360	dget(path->dentry);
361}
362EXPORT_SYMBOL(path_get);
363
364/**
365 * path_put - put a reference to a path
366 * @path: path to put the reference to
367 *
368 * Given a path decrement the reference count to the dentry and the vfsmount.
369 */
370void path_put(struct path *path)
371{
372	dput(path->dentry);
373	mntput(path->mnt);
374}
375EXPORT_SYMBOL(path_put);
376
377/**
378 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata
380 */
381void release_open_intent(struct nameidata *nd)
382{
383	if (nd->intent.open.file->f_path.dentry == NULL)
384		put_filp(nd->intent.open.file);
385	else
386		fput(nd->intent.open.file);
387}
388
389static inline struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{
392	int status = dentry->d_op->d_revalidate(dentry, nd);
393	if (unlikely(status <= 0)) {
394		/*
395		 * The dentry failed validation.
396		 * If d_revalidate returned 0 attempt to invalidate
397		 * the dentry otherwise d_revalidate is asking us
398		 * to return a fail status.
399		 */
400		if (!status) {
401			if (!d_invalidate(dentry)) {
402				dput(dentry);
403				dentry = NULL;
404			}
405		} else {
406			dput(dentry);
407			dentry = ERR_PTR(status);
408		}
409	}
410	return dentry;
411}
412
413/*
414 * force_reval_path - force revalidation of a dentry
415 *
416 * In some situations the path walking code will trust dentries without
417 * revalidating them. This causes problems for filesystems that depend on
418 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
419 * (which indicates that it's possible for the dentry to go stale), force
420 * a d_revalidate call before proceeding.
421 *
422 * Returns 0 if the revalidation was successful. If the revalidation fails,
423 * either return the error returned by d_revalidate or -ESTALE if the
424 * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
425 * invalidate the dentry. It's up to the caller to handle putting references
426 * to the path if necessary.
427 */
428static int
429force_reval_path(struct path *path, struct nameidata *nd)
430{
431	int status;
432	struct dentry *dentry = path->dentry;
433
434	/*
435	 * only check on filesystems where it's possible for the dentry to
436	 * become stale. It's assumed that if this flag is set then the
437	 * d_revalidate op will also be defined.
438	 */
439	if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
440		return 0;
441
442	status = dentry->d_op->d_revalidate(dentry, nd);
443	if (status > 0)
444		return 0;
445
446	if (!status) {
447		d_invalidate(dentry);
448		status = -ESTALE;
449	}
450	return status;
451}
452
453/*
454 * Short-cut version of permission(), for calling on directories
455 * during pathname resolution.  Combines parts of permission()
456 * and generic_permission(), and tests ONLY for MAY_EXEC permission.
457 *
458 * If appropriate, check DAC only.  If not appropriate, or
459 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check.
461 */
462static int exec_permission(struct inode *inode)
463{
464	int ret;
465
466	if (inode->i_op->permission) {
467		ret = inode->i_op->permission(inode, MAY_EXEC);
468		if (!ret)
469			goto ok;
470		return ret;
471	}
472	ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
473	if (!ret)
474		goto ok;
475
476	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477		goto ok;
478
479	return ret;
480ok:
481	return security_inode_permission(inode, MAY_EXEC);
482}
483
484static __always_inline void set_root(struct nameidata *nd)
485{
486	if (!nd->root.mnt)
487		get_fs_root(current->fs, &nd->root);
488}
489
490static int link_path_walk(const char *, struct nameidata *);
491
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{
494	if (IS_ERR(link))
495		goto fail;
496
497	if (*link == '/') {
498		set_root(nd);
499		path_put(&nd->path);
500		nd->path = nd->root;
501		path_get(&nd->root);
502	}
503
504	return link_path_walk(link, nd);
505fail:
506	path_put(&nd->path);
507	return PTR_ERR(link);
508}
509
510static void path_put_conditional(struct path *path, struct nameidata *nd)
511{
512	dput(path->dentry);
513	if (path->mnt != nd->path.mnt)
514		mntput(path->mnt);
515}
516
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
518{
519	dput(nd->path.dentry);
520	if (nd->path.mnt != path->mnt) {
521		mntput(nd->path.mnt);
522		nd->path.mnt = path->mnt;
523	}
524	nd->path.dentry = path->dentry;
525}
526
527static __always_inline int
528__do_follow_link(struct path *path, struct nameidata *nd, void **p)
529{
530	int error;
531	struct dentry *dentry = path->dentry;
532
533	touch_atime(path->mnt, dentry);
534	nd_set_link(nd, NULL);
535
536	if (path->mnt != nd->path.mnt) {
537		path_to_nameidata(path, nd);
538		dget(dentry);
539	}
540	mntget(path->mnt);
541	nd->last_type = LAST_BIND;
542	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
543	error = PTR_ERR(*p);
544	if (!IS_ERR(*p)) {
545		char *s = nd_get_link(nd);
546		error = 0;
547		if (s)
548			error = __vfs_follow_link(nd, s);
549		else if (nd->last_type == LAST_BIND) {
550			error = force_reval_path(&nd->path, nd);
551			if (error)
552				path_put(&nd->path);
553		}
554	}
555	return error;
556}
557
558/*
559 * This limits recursive symlink follows to 8, while
560 * limiting consecutive symlinks to 40.
561 *
562 * Without that kind of total limit, nasty chains of consecutive
563 * symlinks can cause almost arbitrarily long lookups.
564 */
565static inline int do_follow_link(struct path *path, struct nameidata *nd)
566{
567	void *cookie;
568	int err = -ELOOP;
569	if (current->link_count >= MAX_NESTED_LINKS)
570		goto loop;
571	if (current->total_link_count >= 40)
572		goto loop;
573	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
574	cond_resched();
575	err = security_inode_follow_link(path->dentry, nd);
576	if (err)
577		goto loop;
578	current->link_count++;
579	current->total_link_count++;
580	nd->depth++;
581	err = __do_follow_link(path, nd, &cookie);
582	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
583		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
584	path_put(path);
585	current->link_count--;
586	nd->depth--;
587	return err;
588loop:
589	path_put_conditional(path, nd);
590	path_put(&nd->path);
591	return err;
592}
593
594int follow_up(struct path *path)
595{
596	struct vfsmount *parent;
597	struct dentry *mountpoint;
598
599	br_read_lock(vfsmount_lock);
600	parent = path->mnt->mnt_parent;
601	if (parent == path->mnt) {
602		br_read_unlock(vfsmount_lock);
603		return 0;
604	}
605	mntget(parent);
606	mountpoint = dget(path->mnt->mnt_mountpoint);
607	br_read_unlock(vfsmount_lock);
608	dput(path->dentry);
609	path->dentry = mountpoint;
610	mntput(path->mnt);
611	path->mnt = parent;
612	return 1;
613}
614
615/* no need for dcache_lock, as serialization is taken care in
616 * namespace.c
617 */
618static int __follow_mount(struct path *path)
619{
620	int res = 0;
621	while (d_mountpoint(path->dentry)) {
622		struct vfsmount *mounted = lookup_mnt(path);
623		if (!mounted)
624			break;
625		dput(path->dentry);
626		if (res)
627			mntput(path->mnt);
628		path->mnt = mounted;
629		path->dentry = dget(mounted->mnt_root);
630		res = 1;
631	}
632	return res;
633}
634
635static void follow_mount(struct path *path)
636{
637	while (d_mountpoint(path->dentry)) {
638		struct vfsmount *mounted = lookup_mnt(path);
639		if (!mounted)
640			break;
641		dput(path->dentry);
642		mntput(path->mnt);
643		path->mnt = mounted;
644		path->dentry = dget(mounted->mnt_root);
645	}
646}
647
648/* no need for dcache_lock, as serialization is taken care in
649 * namespace.c
650 */
651int follow_down(struct path *path)
652{
653	struct vfsmount *mounted;
654
655	mounted = lookup_mnt(path);
656	if (mounted) {
657		dput(path->dentry);
658		mntput(path->mnt);
659		path->mnt = mounted;
660		path->dentry = dget(mounted->mnt_root);
661		return 1;
662	}
663	return 0;
664}
665
666static __always_inline void follow_dotdot(struct nameidata *nd)
667{
668	set_root(nd);
669
670	while(1) {
671		struct dentry *old = nd->path.dentry;
672
673		if (nd->path.dentry == nd->root.dentry &&
674		    nd->path.mnt == nd->root.mnt) {
675			break;
676		}
677		if (nd->path.dentry != nd->path.mnt->mnt_root) {
678			/* rare case of legitimate dget_parent()... */
679			nd->path.dentry = dget_parent(nd->path.dentry);
680			dput(old);
681			break;
682		}
683		if (!follow_up(&nd->path))
684			break;
685	}
686	follow_mount(&nd->path);
687}
688
689/*
690 * Allocate a dentry with name and parent, and perform a parent
691 * directory ->lookup on it. Returns the new dentry, or ERR_PTR
692 * on error. parent->d_inode->i_mutex must be held. d_lookup must
693 * have verified that no child exists while under i_mutex.
694 */
695static struct dentry *d_alloc_and_lookup(struct dentry *parent,
696				struct qstr *name, struct nameidata *nd)
697{
698	struct inode *inode = parent->d_inode;
699	struct dentry *dentry;
700	struct dentry *old;
701
702	/* Don't create child dentry for a dead directory. */
703	if (unlikely(IS_DEADDIR(inode)))
704		return ERR_PTR(-ENOENT);
705
706	dentry = d_alloc(parent, name);
707	if (unlikely(!dentry))
708		return ERR_PTR(-ENOMEM);
709
710	old = inode->i_op->lookup(inode, dentry, nd);
711	if (unlikely(old)) {
712		dput(dentry);
713		dentry = old;
714	}
715	return dentry;
716}
717
718/*
719 *  It's more convoluted than I'd like it to be, but... it's still fairly
720 *  small and for now I'd prefer to have fast path as straight as possible.
721 *  It _is_ time-critical.
722 */
723static int do_lookup(struct nameidata *nd, struct qstr *name,
724		     struct path *path)
725{
726	struct vfsmount *mnt = nd->path.mnt;
727	struct dentry *dentry, *parent;
728	struct inode *dir;
729	/*
730	 * See if the low-level filesystem might want
731	 * to use its own hash..
732	 */
733	if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
734		int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
735		if (err < 0)
736			return err;
737	}
738
739	/*
740	 * Rename seqlock is not required here because in the off chance
741	 * of a false negative due to a concurrent rename, we're going to
742	 * do the non-racy lookup, below.
743	 */
744	dentry = __d_lookup(nd->path.dentry, name);
745	if (!dentry)
746		goto need_lookup;
747found:
748	if (dentry->d_op && dentry->d_op->d_revalidate)
749		goto need_revalidate;
750done:
751	path->mnt = mnt;
752	path->dentry = dentry;
753	__follow_mount(path);
754	return 0;
755
756need_lookup:
757	parent = nd->path.dentry;
758	dir = parent->d_inode;
759
760	mutex_lock(&dir->i_mutex);
761	/*
762	 * First re-do the cached lookup just in case it was created
763	 * while we waited for the directory semaphore, or the first
764	 * lookup failed due to an unrelated rename.
765	 *
766	 * This could use version numbering or similar to avoid unnecessary
767	 * cache lookups, but then we'd have to do the first lookup in the
768	 * non-racy way. However in the common case here, everything should
769	 * be hot in cache, so would it be a big win?
770	 */
771	dentry = d_lookup(parent, name);
772	if (likely(!dentry)) {
773		dentry = d_alloc_and_lookup(parent, name, nd);
774		mutex_unlock(&dir->i_mutex);
775		if (IS_ERR(dentry))
776			goto fail;
777		goto done;
778	}
779	/*
780	 * Uhhuh! Nasty case: the cache was re-populated while
781	 * we waited on the semaphore. Need to revalidate.
782	 */
783	mutex_unlock(&dir->i_mutex);
784	goto found;
785
786need_revalidate:
787	dentry = do_revalidate(dentry, nd);
788	if (!dentry)
789		goto need_lookup;
790	if (IS_ERR(dentry))
791		goto fail;
792	goto done;
793
794fail:
795	return PTR_ERR(dentry);
796}
797
798/*
799 * This is a temporary kludge to deal with "automount" symlinks; proper
800 * solution is to trigger them on follow_mount(), so that do_lookup()
801 * would DTRT.  To be killed before 2.6.34-final.
802 */
803static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
804{
805	return inode && unlikely(inode->i_op->follow_link) &&
806		((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
807}
808
809/*
810 * Name resolution.
811 * This is the basic name resolution function, turning a pathname into
812 * the final dentry. We expect 'base' to be positive and a directory.
813 *
814 * Returns 0 and nd will have valid dentry and mnt on success.
815 * Returns error and drops reference to input namei data on failure.
816 */
817static int link_path_walk(const char *name, struct nameidata *nd)
818{
819	struct path next;
820	struct inode *inode;
821	int err;
822	unsigned int lookup_flags = nd->flags;
823
824	while (*name=='/')
825		name++;
826	if (!*name)
827		goto return_reval;
828
829	inode = nd->path.dentry->d_inode;
830	if (nd->depth)
831		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832
833	/* At this point we know we have a real path component. */
834	for(;;) {
835		unsigned long hash;
836		struct qstr this;
837		unsigned int c;
838
839		nd->flags |= LOOKUP_CONTINUE;
840		err = exec_permission(inode);
841 		if (err)
842			break;
843
844		this.name = name;
845		c = *(const unsigned char *)name;
846
847		hash = init_name_hash();
848		do {
849			name++;
850			hash = partial_name_hash(c, hash);
851			c = *(const unsigned char *)name;
852		} while (c && (c != '/'));
853		this.len = name - (const char *) this.name;
854		this.hash = end_name_hash(hash);
855
856		/* remove trailing slashes? */
857		if (!c)
858			goto last_component;
859		while (*++name == '/');
860		if (!*name)
861			goto last_with_slashes;
862
863		/*
864		 * "." and ".." are special - ".." especially so because it has
865		 * to be able to know about the current root directory and
866		 * parent relationships.
867		 */
868		if (this.name[0] == '.') switch (this.len) {
869			default:
870				break;
871			case 2:
872				if (this.name[1] != '.')
873					break;
874				follow_dotdot(nd);
875				inode = nd->path.dentry->d_inode;
876				/* fallthrough */
877			case 1:
878				continue;
879		}
880		/* This does the actual lookups.. */
881		err = do_lookup(nd, &this, &next);
882		if (err)
883			break;
884
885		err = -ENOENT;
886		inode = next.dentry->d_inode;
887		if (!inode)
888			goto out_dput;
889
890		if (inode->i_op->follow_link) {
891			err = do_follow_link(&next, nd);
892			if (err)
893				goto return_err;
894			err = -ENOENT;
895			inode = nd->path.dentry->d_inode;
896			if (!inode)
897				break;
898		} else
899			path_to_nameidata(&next, nd);
900		err = -ENOTDIR;
901		if (!inode->i_op->lookup)
902			break;
903		continue;
904		/* here ends the main loop */
905
906last_with_slashes:
907		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
908last_component:
909		/* Clear LOOKUP_CONTINUE iff it was previously unset */
910		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
911		if (lookup_flags & LOOKUP_PARENT)
912			goto lookup_parent;
913		if (this.name[0] == '.') switch (this.len) {
914			default:
915				break;
916			case 2:
917				if (this.name[1] != '.')
918					break;
919				follow_dotdot(nd);
920				inode = nd->path.dentry->d_inode;
921				/* fallthrough */
922			case 1:
923				goto return_reval;
924		}
925		err = do_lookup(nd, &this, &next);
926		if (err)
927			break;
928		inode = next.dentry->d_inode;
929		if (follow_on_final(inode, lookup_flags)) {
930			err = do_follow_link(&next, nd);
931			if (err)
932				goto return_err;
933			inode = nd->path.dentry->d_inode;
934		} else
935			path_to_nameidata(&next, nd);
936		err = -ENOENT;
937		if (!inode)
938			break;
939		if (lookup_flags & LOOKUP_DIRECTORY) {
940			err = -ENOTDIR;
941			if (!inode->i_op->lookup)
942				break;
943		}
944		goto return_base;
945lookup_parent:
946		nd->last = this;
947		nd->last_type = LAST_NORM;
948		if (this.name[0] != '.')
949			goto return_base;
950		if (this.len == 1)
951			nd->last_type = LAST_DOT;
952		else if (this.len == 2 && this.name[1] == '.')
953			nd->last_type = LAST_DOTDOT;
954		else
955			goto return_base;
956return_reval:
957		/*
958		 * We bypassed the ordinary revalidation routines.
959		 * We may need to check the cached dentry for staleness.
960		 */
961		if (nd->path.dentry && nd->path.dentry->d_sb &&
962		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963			err = -ESTALE;
964			/* Note: we do not d_invalidate() */
965			if (!nd->path.dentry->d_op->d_revalidate(
966					nd->path.dentry, nd))
967				break;
968		}
969return_base:
970		return 0;
971out_dput:
972		path_put_conditional(&next, nd);
973		break;
974	}
975	path_put(&nd->path);
976return_err:
977	return err;
978}
979
980static int path_walk(const char *name, struct nameidata *nd)
981{
982	struct path save = nd->path;
983	int result;
984
985	current->total_link_count = 0;
986
987	/* make sure the stuff we saved doesn't go away */
988	path_get(&save);
989
990	result = link_path_walk(name, nd);
991	if (result == -ESTALE) {
992		/* nd->path had been dropped */
993		current->total_link_count = 0;
994		nd->path = save;
995		path_get(&nd->path);
996		nd->flags |= LOOKUP_REVAL;
997		result = link_path_walk(name, nd);
998	}
999
1000	path_put(&save);
1001
1002	return result;
1003}
1004
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{
1007	int retval = 0;
1008	int fput_needed;
1009	struct file *file;
1010
1011	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1012	nd->flags = flags;
1013	nd->depth = 0;
1014	nd->root.mnt = NULL;
1015
1016	if (*name=='/') {
1017		set_root(nd);
1018		nd->path = nd->root;
1019		path_get(&nd->root);
1020	} else if (dfd == AT_FDCWD) {
1021		get_fs_pwd(current->fs, &nd->path);
1022	} else {
1023		struct dentry *dentry;
1024
1025		file = fget_light(dfd, &fput_needed);
1026		retval = -EBADF;
1027		if (!file)
1028			goto out_fail;
1029
1030		dentry = file->f_path.dentry;
1031
1032		retval = -ENOTDIR;
1033		if (!S_ISDIR(dentry->d_inode->i_mode))
1034			goto fput_fail;
1035
1036		retval = file_permission(file, MAY_EXEC);
1037		if (retval)
1038			goto fput_fail;
1039
1040		nd->path = file->f_path;
1041		path_get(&file->f_path);
1042
1043		fput_light(file, fput_needed);
1044	}
1045	return 0;
1046
1047fput_fail:
1048	fput_light(file, fput_needed);
1049out_fail:
1050	return retval;
1051}
1052
1053/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1054static int do_path_lookup(int dfd, const char *name,
1055				unsigned int flags, struct nameidata *nd)
1056{
1057	int retval = path_init(dfd, name, flags, nd);
1058	if (!retval)
1059		retval = path_walk(name, nd);
1060	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1061				nd->path.dentry->d_inode))
1062		audit_inode(name, nd->path.dentry);
1063	if (nd->root.mnt) {
1064		path_put(&nd->root);
1065		nd->root.mnt = NULL;
1066	}
1067	return retval;
1068}
1069
1070int path_lookup(const char *name, unsigned int flags,
1071			struct nameidata *nd)
1072{
1073	return do_path_lookup(AT_FDCWD, name, flags, nd);
1074}
1075
1076int kern_path(const char *name, unsigned int flags, struct path *path)
1077{
1078	struct nameidata nd;
1079	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
1080	if (!res)
1081		*path = nd.path;
1082	return res;
1083}
1084
1085/**
1086 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1087 * @dentry:  pointer to dentry of the base directory
1088 * @mnt: pointer to vfs mount of the base directory
1089 * @name: pointer to file name
1090 * @flags: lookup flags
1091 * @nd: pointer to nameidata
1092 */
1093int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1094		    const char *name, unsigned int flags,
1095		    struct nameidata *nd)
1096{
1097	int retval;
1098
1099	/* same as do_path_lookup */
1100	nd->last_type = LAST_ROOT;
1101	nd->flags = flags;
1102	nd->depth = 0;
1103
1104	nd->path.dentry = dentry;
1105	nd->path.mnt = mnt;
1106	path_get(&nd->path);
1107	nd->root = nd->path;
1108	path_get(&nd->root);
1109
1110	retval = path_walk(name, nd);
1111	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112				nd->path.dentry->d_inode))
1113		audit_inode(name, nd->path.dentry);
1114
1115	path_put(&nd->root);
1116	nd->root.mnt = NULL;
1117
1118	return retval;
1119}
1120
1121static struct dentry *__lookup_hash(struct qstr *name,
1122		struct dentry *base, struct nameidata *nd)
1123{
1124	struct dentry *dentry;
1125	struct inode *inode;
1126	int err;
1127
1128	inode = base->d_inode;
1129
1130	/*
1131	 * See if the low-level filesystem might want
1132	 * to use its own hash..
1133	 */
1134	if (base->d_op && base->d_op->d_hash) {
1135		err = base->d_op->d_hash(base, name);
1136		dentry = ERR_PTR(err);
1137		if (err < 0)
1138			goto out;
1139	}
1140
1141	/*
1142	 * Don't bother with __d_lookup: callers are for creat as
1143	 * well as unlink, so a lot of the time it would cost
1144	 * a double lookup.
1145	 */
1146	dentry = d_lookup(base, name);
1147
1148	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1149		dentry = do_revalidate(dentry, nd);
1150
1151	if (!dentry)
1152		dentry = d_alloc_and_lookup(base, name, nd);
1153out:
1154	return dentry;
1155}
1156
1157/*
1158 * Restricted form of lookup. Doesn't follow links, single-component only,
1159 * needs parent already locked. Doesn't follow mounts.
1160 * SMP-safe.
1161 */
1162static struct dentry *lookup_hash(struct nameidata *nd)
1163{
1164	int err;
1165
1166	err = exec_permission(nd->path.dentry->d_inode);
1167	if (err)
1168		return ERR_PTR(err);
1169	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1170}
1171
1172static int __lookup_one_len(const char *name, struct qstr *this,
1173		struct dentry *base, int len)
1174{
1175	unsigned long hash;
1176	unsigned int c;
1177
1178	this->name = name;
1179	this->len = len;
1180	if (!len)
1181		return -EACCES;
1182
1183	hash = init_name_hash();
1184	while (len--) {
1185		c = *(const unsigned char *)name++;
1186		if (c == '/' || c == '\0')
1187			return -EACCES;
1188		hash = partial_name_hash(c, hash);
1189	}
1190	this->hash = end_name_hash(hash);
1191	return 0;
1192}
1193
1194/**
1195 * lookup_one_len - filesystem helper to lookup single pathname component
1196 * @name:	pathname component to lookup
1197 * @base:	base directory to lookup from
1198 * @len:	maximum length @len should be interpreted to
1199 *
1200 * Note that this routine is purely a helper for filesystem usage and should
1201 * not be called by generic code.  Also note that by using this function the
1202 * nameidata argument is passed to the filesystem methods and a filesystem
1203 * using this helper needs to be prepared for that.
1204 */
1205struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1206{
1207	int err;
1208	struct qstr this;
1209
1210	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1211
1212	err = __lookup_one_len(name, &this, base, len);
1213	if (err)
1214		return ERR_PTR(err);
1215
1216	err = exec_permission(base->d_inode);
1217	if (err)
1218		return ERR_PTR(err);
1219	return __lookup_hash(&this, base, NULL);
1220}
1221
1222int user_path_at(int dfd, const char __user *name, unsigned flags,
1223		 struct path *path)
1224{
1225	struct nameidata nd;
1226	char *tmp = getname(name);
1227	int err = PTR_ERR(tmp);
1228	if (!IS_ERR(tmp)) {
1229
1230		BUG_ON(flags & LOOKUP_PARENT);
1231
1232		err = do_path_lookup(dfd, tmp, flags, &nd);
1233		putname(tmp);
1234		if (!err)
1235			*path = nd.path;
1236	}
1237	return err;
1238}
1239
1240static int user_path_parent(int dfd, const char __user *path,
1241			struct nameidata *nd, char **name)
1242{
1243	char *s = getname(path);
1244	int error;
1245
1246	if (IS_ERR(s))
1247		return PTR_ERR(s);
1248
1249	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
1250	if (error)
1251		putname(s);
1252	else
1253		*name = s;
1254
1255	return error;
1256}
1257
1258/*
1259 * It's inline, so penalty for filesystems that don't use sticky bit is
1260 * minimal.
1261 */
1262static inline int check_sticky(struct inode *dir, struct inode *inode)
1263{
1264	uid_t fsuid = current_fsuid();
1265
1266	if (!(dir->i_mode & S_ISVTX))
1267		return 0;
1268	if (inode->i_uid == fsuid)
1269		return 0;
1270	if (dir->i_uid == fsuid)
1271		return 0;
1272	return !capable(CAP_FOWNER);
1273}
1274
1275/*
1276 *	Check whether we can remove a link victim from directory dir, check
1277 *  whether the type of victim is right.
1278 *  1. We can't do it if dir is read-only (done in permission())
1279 *  2. We should have write and exec permissions on dir
1280 *  3. We can't remove anything from append-only dir
1281 *  4. We can't do anything with immutable dir (done in permission())
1282 *  5. If the sticky bit on dir is set we should either
1283 *	a. be owner of dir, or
1284 *	b. be owner of victim, or
1285 *	c. have CAP_FOWNER capability
1286 *  6. If the victim is append-only or immutable we can't do antyhing with
1287 *     links pointing to it.
1288 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1289 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1290 *  9. We can't remove a root or mountpoint.
1291 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1292 *     nfs_async_unlink().
1293 */
1294static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1295{
1296	int error;
1297
1298	if (!victim->d_inode)
1299		return -ENOENT;
1300
1301	BUG_ON(victim->d_parent->d_inode != dir);
1302	audit_inode_child(victim, dir);
1303
1304	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1305	if (error)
1306		return error;
1307	if (IS_APPEND(dir))
1308		return -EPERM;
1309	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1310	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
1311		return -EPERM;
1312	if (isdir) {
1313		if (!S_ISDIR(victim->d_inode->i_mode))
1314			return -ENOTDIR;
1315		if (IS_ROOT(victim))
1316			return -EBUSY;
1317	} else if (S_ISDIR(victim->d_inode->i_mode))
1318		return -EISDIR;
1319	if (IS_DEADDIR(dir))
1320		return -ENOENT;
1321	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1322		return -EBUSY;
1323	return 0;
1324}
1325
1326/*	Check whether we can create an object with dentry child in directory
1327 *  dir.
1328 *  1. We can't do it if child already exists (open has special treatment for
1329 *     this case, but since we are inlined it's OK)
1330 *  2. We can't do it if dir is read-only (done in permission())
1331 *  3. We should have write and exec permissions on dir
1332 *  4. We can't do it if dir is immutable (done in permission())
1333 */
1334static inline int may_create(struct inode *dir, struct dentry *child)
1335{
1336	if (child->d_inode)
1337		return -EEXIST;
1338	if (IS_DEADDIR(dir))
1339		return -ENOENT;
1340	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1341}
1342
1343/*
1344 * p1 and p2 should be directories on the same fs.
1345 */
1346struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1347{
1348	struct dentry *p;
1349
1350	if (p1 == p2) {
1351		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1352		return NULL;
1353	}
1354
1355	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1356
1357	p = d_ancestor(p2, p1);
1358	if (p) {
1359		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1360		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1361		return p;
1362	}
1363
1364	p = d_ancestor(p1, p2);
1365	if (p) {
1366		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1367		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1368		return p;
1369	}
1370
1371	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1372	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1373	return NULL;
1374}
1375
1376void unlock_rename(struct dentry *p1, struct dentry *p2)
1377{
1378	mutex_unlock(&p1->d_inode->i_mutex);
1379	if (p1 != p2) {
1380		mutex_unlock(&p2->d_inode->i_mutex);
1381		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1382	}
1383}
1384
1385int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1386		struct nameidata *nd)
1387{
1388	int error = may_create(dir, dentry);
1389
1390	if (error)
1391		return error;
1392
1393	if (!dir->i_op->create)
1394		return -EACCES;	/* shouldn't it be ENOSYS? */
1395	mode &= S_IALLUGO;
1396	mode |= S_IFREG;
1397	error = security_inode_create(dir, dentry, mode);
1398	if (error)
1399		return error;
1400	error = dir->i_op->create(dir, dentry, mode, nd);
1401	if (!error)
1402		fsnotify_create(dir, dentry);
1403	return error;
1404}
1405
1406int may_open(struct path *path, int acc_mode, int flag)
1407{
1408	struct dentry *dentry = path->dentry;
1409	struct inode *inode = dentry->d_inode;
1410	int error;
1411
1412	if (!inode)
1413		return -ENOENT;
1414
1415	switch (inode->i_mode & S_IFMT) {
1416	case S_IFLNK:
1417		return -ELOOP;
1418	case S_IFDIR:
1419		if (acc_mode & MAY_WRITE)
1420			return -EISDIR;
1421		break;
1422	case S_IFBLK:
1423	case S_IFCHR:
1424		if (path->mnt->mnt_flags & MNT_NODEV)
1425			return -EACCES;
1426		/*FALLTHRU*/
1427	case S_IFIFO:
1428	case S_IFSOCK:
1429		flag &= ~O_TRUNC;
1430		break;
1431	}
1432
1433	error = inode_permission(inode, acc_mode);
1434	if (error)
1435		return error;
1436
1437	/*
1438	 * An append-only file must be opened in append mode for writing.
1439	 */
1440	if (IS_APPEND(inode)) {
1441		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1442			return -EPERM;
1443		if (flag & O_TRUNC)
1444			return -EPERM;
1445	}
1446
1447	/* O_NOATIME can only be set by the owner or superuser */
1448	if (flag & O_NOATIME && !is_owner_or_cap(inode))
1449		return -EPERM;
1450
1451	/*
1452	 * Ensure there are no outstanding leases on the file.
1453	 */
1454	return break_lease(inode, flag);
1455}
1456
1457static int handle_truncate(struct path *path)
1458{
1459	struct inode *inode = path->dentry->d_inode;
1460	int error = get_write_access(inode);
1461	if (error)
1462		return error;
1463	/*
1464	 * Refuse to truncate files with mandatory locks held on them.
1465	 */
1466	error = locks_verify_locked(inode);
1467	if (!error)
1468		error = security_path_truncate(path);
1469	if (!error) {
1470		error = do_truncate(path->dentry, 0,
1471				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1472				    NULL);
1473	}
1474	put_write_access(inode);
1475	return error;
1476}
1477
1478/*
1479 * Be careful about ever adding any more callers of this
1480 * function.  Its flags must be in the namei format, not
1481 * what get passed to sys_open().
1482 */
1483static int __open_namei_create(struct nameidata *nd, struct path *path,
1484				int open_flag, int mode)
1485{
1486	int error;
1487	struct dentry *dir = nd->path.dentry;
1488
1489	if (!IS_POSIXACL(dir->d_inode))
1490		mode &= ~current_umask();
1491	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1492	if (error)
1493		goto out_unlock;
1494	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1495out_unlock:
1496	mutex_unlock(&dir->d_inode->i_mutex);
1497	dput(nd->path.dentry);
1498	nd->path.dentry = path->dentry;
1499	if (error)
1500		return error;
1501	/* Don't check for write permission, don't truncate */
1502	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1503}
1504
/*
 * Convert sys_open() access-mode bits to the internal (namei) encoding.
 *
 * For sys_open the low two bits of the flag value mean:
 *	00 - read-only
 *	01 - write-only
 *	10 - read-write
 *	11 - special
 * The internal routines (ie open_namei()/follow_link() etc) use
 *	00 - no permissions needed
 *	01 - read-permission
 *	10 - write-permission
 *	11 - read-write
 * instead.  This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked later).
 *
 * The conversion is an increment, skipped when the access bits are already
 * 11 so that the increment cannot carry out of them.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag + 1) & O_ACCMODE) ? flag + 1 : flag;
}
1528
1529static int open_will_truncate(int flag, struct inode *inode)
1530{
1531	/*
1532	 * We'll never write to the fs underlying
1533	 * a device file.
1534	 */
1535	if (special_file(inode->i_mode))
1536		return 0;
1537	return (flag & O_TRUNC);
1538}
1539
1540static struct file *finish_open(struct nameidata *nd,
1541				int open_flag, int acc_mode)
1542{
1543	struct file *filp;
1544	int will_truncate;
1545	int error;
1546
1547	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1548	if (will_truncate) {
1549		error = mnt_want_write(nd->path.mnt);
1550		if (error)
1551			goto exit;
1552	}
1553	error = may_open(&nd->path, acc_mode, open_flag);
1554	if (error) {
1555		if (will_truncate)
1556			mnt_drop_write(nd->path.mnt);
1557		goto exit;
1558	}
1559	filp = nameidata_to_filp(nd);
1560	if (!IS_ERR(filp)) {
1561		error = ima_file_check(filp, acc_mode);
1562		if (error) {
1563			fput(filp);
1564			filp = ERR_PTR(error);
1565		}
1566	}
1567	if (!IS_ERR(filp)) {
1568		if (will_truncate) {
1569			error = handle_truncate(&nd->path);
1570			if (error) {
1571				fput(filp);
1572				filp = ERR_PTR(error);
1573			}
1574		}
1575	}
1576	/*
1577	 * It is now safe to drop the mnt write
1578	 * because the filp has had a write taken
1579	 * on its behalf.
1580	 */
1581	if (will_truncate)
1582		mnt_drop_write(nd->path.mnt);
1583	return filp;
1584
1585exit:
1586	if (!IS_ERR(nd->intent.open.file))
1587		release_open_intent(nd);
1588	path_put(&nd->path);
1589	return ERR_PTR(error);
1590}
1591
/*
 * Handle the last component of an open: @nd holds the parent directory and
 * the last component (nd->last / nd->last_type), @path receives the result
 * of looking that component up.
 *
 * Returns:
 *   - a struct file on success;
 *   - an ERR_PTR() on failure; on the error exits below the open intent
 *     (if valid) is released and nd->path is dropped;
 *   - NULL when the last component has ->follow_link(), i.e. the caller
 *     has to follow a trailing symlink and call us again (@path is left
 *     pointing at the link).
 */
static struct file *do_last(struct nameidata *nd, struct path *path,
			    int open_flag, int acc_mode,
			    int mode, const char *pathname)
{
	struct dentry *dir = nd->path.dentry;
	struct file *filp;
	/* default error for the O_CREAT-on-a-directory exits below */
	int error = -EISDIR;

	/* Last components other than a normal name refer to an existing directory. */
	switch (nd->last_type) {
	case LAST_DOTDOT:
		follow_dotdot(nd);
		dir = nd->path.dentry;
		/* no break here: execution continues into the LAST_DOT case */
	case LAST_DOT:
		/*
		 * NOTE(review): only a zero return from ->d_revalidate() is
		 * treated as stale here; a negative return would be taken as
		 * "valid" - confirm that is intended.
		 */
		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
			if (!dir->d_op->d_revalidate(dir, nd)) {
				error = -ESTALE;
				goto exit;
			}
		}
		/* fallthrough */
	case LAST_ROOT:
		/* O_CREAT here fails with the -EISDIR set above */
		if (open_flag & O_CREAT)
			goto exit;
		/* fallthrough */
	case LAST_BIND:
		audit_inode(pathname, dir);
		goto ok;
	}

	/* trailing slashes? */
	if (nd->last.name[nd->last.len]) {
		/* error is still -EISDIR at this point */
		if (open_flag & O_CREAT)
			goto exit;
		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
	}

	/* just plain open? */
	if (!(open_flag & O_CREAT)) {
		error = do_lookup(nd, &nd->last, path);
		if (error)
			goto exit;
		error = -ENOENT;
		if (!path->dentry->d_inode)
			goto exit_dput;
		/* symlink: hand it back to the caller to follow */
		if (path->dentry->d_inode->i_op->follow_link)
			return NULL;
		error = -ENOTDIR;
		if (nd->flags & LOOKUP_DIRECTORY) {
			/* a directory was asked for, but this inode has no ->lookup() */
			if (!path->dentry->d_inode->i_op->lookup)
				goto exit_dput;
		}
		path_to_nameidata(path, nd);
		audit_inode(pathname, nd->path.dentry);
		goto ok;
	}

	/* OK, it's O_CREAT */
	mutex_lock(&dir->d_inode->i_mutex);

	path->dentry = lookup_hash(nd);
	path->mnt = nd->path.mnt;

	error = PTR_ERR(path->dentry);
	if (IS_ERR(path->dentry)) {
		/* no dentry to put: unlock by hand and skip exit_dput */
		mutex_unlock(&dir->d_inode->i_mutex);
		goto exit;
	}

	if (IS_ERR(nd->intent.open.file)) {
		error = PTR_ERR(nd->intent.open.file);
		goto exit_mutex_unlock;
	}

	/* Negative dentry, just create the file */
	if (!path->dentry->d_inode) {
		/*
		 * This write is needed to ensure that a
		 * ro->rw transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
		 * the 'struct file' in nameidata_to_filp().
		 */
		error = mnt_want_write(nd->path.mnt);
		if (error)
			goto exit_mutex_unlock;
		/* __open_namei_create() drops the directory's i_mutex itself */
		error = __open_namei_create(nd, path, open_flag, mode);
		if (error) {
			mnt_drop_write(nd->path.mnt);
			goto exit;
		}
		filp = nameidata_to_filp(nd);
		mnt_drop_write(nd->path.mnt);
		if (!IS_ERR(filp)) {
			error = ima_file_check(filp, acc_mode);
			if (error) {
				fput(filp);
				filp = ERR_PTR(error);
			}
		}
		return filp;
	}

	/*
	 * It already exists.
	 */
	mutex_unlock(&dir->d_inode->i_mutex);
	audit_inode(pathname, path->dentry);

	error = -EEXIST;
	if (open_flag & O_EXCL)
		goto exit_dput;

	if (__follow_mount(path)) {
		error = -ELOOP;
		if (open_flag & O_NOFOLLOW)
			goto exit_dput;
	}

	error = -ENOENT;
	if (!path->dentry->d_inode)
		goto exit_dput;

	/* symlink: hand it back to the caller to follow */
	if (path->dentry->d_inode->i_op->follow_link)
		return NULL;

	path_to_nameidata(path, nd);
	error = -EISDIR;
	if (S_ISDIR(path->dentry->d_inode->i_mode))
		goto exit;
ok:
	filp = finish_open(nd, open_flag, acc_mode);
	return filp;

	/* Error exits, from most to least state to undo. */
exit_mutex_unlock:
	mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
	path_put_conditional(path, nd);
exit:
	if (!IS_ERR(nd->intent.open.file))
		release_open_intent(nd);
	path_put(&nd->path);
	return ERR_PTR(error);
}
1735
/*
 * Open @pathname relative to @dfd and return the resulting struct file,
 * or an ERR_PTR() on failure.
 *
 * The parent directory is looked up first (LOOKUP_PARENT), then do_last()
 * deals with the final component; trailing symlinks are followed in a loop
 * here.  If the result is -ESTALE the whole lookup is retried once with
 * LOOKUP_REVAL set.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname,
		int open_flag, int mode, int acc_mode)
{
	struct file *filp;
	struct nameidata nd;
	int error;
	struct path path;
	/* number of trailing symlinks followed so far */
	int count = 0;
	int flag = open_to_namei_flags(open_flag);
	/* set once we have seen -ESTALE and are retrying with LOOKUP_REVAL */
	int force_reval = 0;

	/* the creation mode is only meaningful with O_CREAT */
	if (!(open_flag & O_CREAT))
		mode = 0;

	/*
	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
	 * check for O_DSYNC if they need any syncing at all, we enforce that
	 * it's always set instead of having to deal with possibly weird
	 * behaviour for malicious applications setting only __O_SYNC.
	 */
	if (open_flag & __O_SYNC)
		open_flag |= O_DSYNC;

	/* a zero acc_mode means "derive it from the open flags" */
	if (!acc_mode)
		acc_mode = MAY_OPEN | ACC_MODE(open_flag);

	/* O_TRUNC implies we need access checks for write permissions */
	if (open_flag & O_TRUNC)
		acc_mode |= MAY_WRITE;

	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
	if (open_flag & O_APPEND)
		acc_mode |= MAY_APPEND;

	/* find the parent */
reval:
	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
	if (error)
		return ERR_PTR(error);
	if (force_reval)
		nd.flags |= LOOKUP_REVAL;

	current->total_link_count = 0;
	error = link_path_walk(pathname, &nd);
	if (error) {
		filp = ERR_PTR(error);
		goto out;
	}
	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
		audit_inode(pathname, nd.path.dentry);

	/*
	 * We have the parent and last component.
	 */

	error = -ENFILE;
	filp = get_empty_filp();
	if (filp == NULL)
		goto exit_parent;
	/* set up the open intent and the lookup flags for the last component */
	nd.intent.open.file = filp;
	filp->f_flags = open_flag;
	nd.intent.open.flags = flag;
	nd.intent.open.create_mode = mode;
	nd.flags &= ~LOOKUP_PARENT;
	nd.flags |= LOOKUP_OPEN;
	if (open_flag & O_CREAT) {
		nd.flags |= LOOKUP_CREATE;
		if (open_flag & O_EXCL)
			nd.flags |= LOOKUP_EXCL;
	}
	if (open_flag & O_DIRECTORY)
		nd.flags |= LOOKUP_DIRECTORY;
	if (!(open_flag & O_NOFOLLOW))
		nd.flags |= LOOKUP_FOLLOW;
	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
	/* do_last() returns NULL when the last component is a symlink to follow */
	while (unlikely(!filp)) { /* trailing symlink */
		struct path holder;
		struct inode *inode = path.dentry->d_inode;
		void *cookie;
		error = -ELOOP;
		/* S_ISDIR part is a temporary automount kludge */
		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
			goto exit_dput;
		/* give up with -ELOOP after 32 trailing symlinks */
		if (count++ == 32)
			goto exit_dput;
		/*
		 * This is subtle. Instead of calling do_follow_link() we do
		 * the thing by hand. The reason is that this way we have zero
		 * link_count and path_walk() (called from ->follow_link)
		 * honoring LOOKUP_PARENT.  After that we have the parent and
		 * last component, i.e. we are in the same situation as after
		 * the first path_walk().  Well, almost - if the last component
		 * is normal we get its copy stored in nd->last.name and we will
		 * have to putname() it when we are done. Procfs-like symlinks
		 * just set LAST_BIND.
		 */
		nd.flags |= LOOKUP_PARENT;
		error = security_inode_follow_link(path.dentry, &nd);
		if (error)
			goto exit_dput;
		error = __do_follow_link(&path, &nd, &cookie);
		if (unlikely(error)) {
			/* nd.path had been dropped */
			if (!IS_ERR(cookie) && inode->i_op->put_link)
				inode->i_op->put_link(path.dentry, &nd, cookie);
			path_put(&path);
			release_open_intent(&nd);
			filp = ERR_PTR(error);
			goto out;
		}
		/* keep the link itself; do_last() will overwrite 'path' */
		holder = path;
		nd.flags &= ~LOOKUP_PARENT;
		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
		if (inode->i_op->put_link)
			inode->i_op->put_link(holder.dentry, &nd, cookie);
		path_put(&holder);
	}
out:
	if (nd.root.mnt)
		path_put(&nd.root);
	/* retry the whole lookup once, with LOOKUP_REVAL, on -ESTALE */
	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
		force_reval = 1;
		goto reval;
	}
	return filp;

exit_dput:
	path_put_conditional(&path, &nd);
	if (!IS_ERR(nd.intent.open.file))
		release_open_intent(&nd);
exit_parent:
	path_put(&nd.path);
	filp = ERR_PTR(error);
	goto out;
}
1877
1878/**
1879 * filp_open - open file and return file pointer
1880 *
1881 * @filename:	path to open
1882 * @flags:	open flags as per the open(2) second argument
1883 * @mode:	mode for the new file if O_CREAT is set, else ignored
1884 *
1885 * This is the helper to open a file from kernelspace if you really
1886 * have to.  But in generally you should not do this, so please move
1887 * along, nothing to see here..
1888 */
1889struct file *filp_open(const char *filename, int flags, int mode)
1890{
1891	return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
1892}
1893EXPORT_SYMBOL(filp_open);
1894
1895/**
1896 * lookup_create - lookup a dentry, creating it if it doesn't exist
1897 * @nd: nameidata info
1898 * @is_dir: directory flag
1899 *
1900 * Simple function to lookup and return a dentry and create it
1901 * if it doesn't exist.  Is SMP-safe.
1902 *
1903 * Returns with nd->path.dentry->d_inode->i_mutex locked.
1904 */
1905struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1906{
1907	struct dentry *dentry = ERR_PTR(-EEXIST);
1908
1909	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1910	/*
1911	 * Yucky last component or no last component at all?
1912	 * (foo/., foo/.., /////)
1913	 */
1914	if (nd->last_type != LAST_NORM)
1915		goto fail;
1916	nd->flags &= ~LOOKUP_PARENT;
1917	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
1918	nd->intent.open.flags = O_EXCL;
1919
1920	/*
1921	 * Do the final lookup.
1922	 */
1923	dentry = lookup_hash(nd);
1924	if (IS_ERR(dentry))
1925		goto fail;
1926
1927	if (dentry->d_inode)
1928		goto eexist;
1929	/*
1930	 * Special case - lookup gave negative, but... we had foo/bar/
1931	 * From the vfs_mknod() POV we just have a negative dentry -
1932	 * all is fine. Let's be bastards - you had / on the end, you've
1933	 * been asking for (non-existent) directory. -ENOENT for you.
1934	 */
1935	if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
1936		dput(dentry);
1937		dentry = ERR_PTR(-ENOENT);
1938	}
1939	return dentry;
1940eexist:
1941	dput(dentry);
1942	dentry = ERR_PTR(-EEXIST);
1943fail:
1944	return dentry;
1945}
1946EXPORT_SYMBOL_GPL(lookup_create);
1947
1948int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1949{
1950	int error = may_create(dir, dentry);
1951
1952	if (error)
1953		return error;
1954
1955	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1956		return -EPERM;
1957
1958	if (!dir->i_op->mknod)
1959		return -EPERM;
1960
1961	error = devcgroup_inode_mknod(mode, dev);
1962	if (error)
1963		return error;
1964
1965	error = security_inode_mknod(dir, dentry, mode, dev);
1966	if (error)
1967		return error;
1968
1969	error = dir->i_op->mknod(dir, dentry, mode, dev);
1970	if (!error)
1971		fsnotify_create(dir, dentry);
1972	return error;
1973}
1974
/*
 * Check the file type bits of @mode for mknod.
 *
 * Regular files, character/block devices, FIFOs and sockets are accepted,
 * as is a zero type (which translates to S_IFREG).  Directories give
 * -EPERM, anything else -EINVAL.
 */
static int may_mknod(mode_t mode)
{
	const mode_t type = mode & S_IFMT;

	if (type == S_IFDIR)
		return -EPERM;

	if (type == 0 || type == S_IFREG ||
	    type == S_IFCHR || type == S_IFBLK ||
	    type == S_IFIFO || type == S_IFSOCK)
		return 0;

	return -EINVAL;
}
1991
1992SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1993		unsigned, dev)
1994{
1995	int error;
1996	char *tmp;
1997	struct dentry *dentry;
1998	struct nameidata nd;
1999
2000	if (S_ISDIR(mode))
2001		return -EPERM;
2002
2003	error = user_path_parent(dfd, filename, &nd, &tmp);
2004	if (error)
2005		return error;
2006
2007	dentry = lookup_create(&nd, 0);
2008	if (IS_ERR(dentry)) {
2009		error = PTR_ERR(dentry);
2010		goto out_unlock;
2011	}
2012	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2013		mode &= ~current_umask();
2014	error = may_mknod(mode);
2015	if (error)
2016		goto out_dput;
2017	error = mnt_want_write(nd.path.mnt);
2018	if (error)
2019		goto out_dput;
2020	error = security_path_mknod(&nd.path, dentry, mode, dev);
2021	if (error)
2022		goto out_drop_write;
2023	switch (mode & S_IFMT) {
2024		case 0: case S_IFREG:
2025			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
2026			break;
2027		case S_IFCHR: case S_IFBLK:
2028			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
2029					new_decode_dev(dev));
2030			break;
2031		case S_IFIFO: case S_IFSOCK:
2032			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2033			break;
2034	}
2035out_drop_write:
2036	mnt_drop_write(nd.path.mnt);
2037out_dput:
2038	dput(dentry);
2039out_unlock:
2040	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2041	path_put(&nd.path);
2042	putname(tmp);
2043
2044	return error;
2045}
2046
2047SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
2048{
2049	return sys_mknodat(AT_FDCWD, filename, mode, dev);
2050}
2051
2052int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2053{
2054	int error = may_create(dir, dentry);
2055
2056	if (error)
2057		return error;
2058
2059	if (!dir->i_op->mkdir)
2060		return -EPERM;
2061
2062	mode &= (S_IRWXUGO|S_ISVTX);
2063	error = security_inode_mkdir(dir, dentry, mode);
2064	if (error)
2065		return error;
2066
2067	error = dir->i_op->mkdir(dir, dentry, mode);
2068	if (!error)
2069		fsnotify_mkdir(dir, dentry);
2070	return error;
2071}
2072
2073SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2074{
2075	int error = 0;
2076	char * tmp;
2077	struct dentry *dentry;
2078	struct nameidata nd;
2079
2080	error = user_path_parent(dfd, pathname, &nd, &tmp);
2081	if (error)
2082		goto out_err;
2083
2084	dentry = lookup_create(&nd, 1);
2085	error = PTR_ERR(dentry);
2086	if (IS_ERR(dentry))
2087		goto out_unlock;
2088
2089	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2090		mode &= ~current_umask();
2091	error = mnt_want_write(nd.path.mnt);
2092	if (error)
2093		goto out_dput;
2094	error = security_path_mkdir(&nd.path, dentry, mode);
2095	if (error)
2096		goto out_drop_write;
2097	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2098out_drop_write:
2099	mnt_drop_write(nd.path.mnt);
2100out_dput:
2101	dput(dentry);
2102out_unlock:
2103	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2104	path_put(&nd.path);
2105	putname(tmp);
2106out_err:
2107	return error;
2108}
2109
2110SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2111{
2112	return sys_mkdirat(AT_FDCWD, pathname, mode);
2113}
2114
2115/*
2116 * We try to drop the dentry early: we should have
2117 * a usage count of 2 if we're the only user of this
2118 * dentry, and if that is true (possibly after pruning
2119 * the dcache), then we drop the dentry now.
2120 *
2121 * A low-level filesystem can, if it choses, legally
2122 * do a
2123 *
2124 *	if (!d_unhashed(dentry))
2125 *		return -EBUSY;
2126 *
2127 * if it cannot handle the case of removing a directory
2128 * that is still in use by something else..
2129 */
2130void dentry_unhash(struct dentry *dentry)
2131{
2132	dget(dentry);
2133	shrink_dcache_parent(dentry);
2134	spin_lock(&dcache_lock);
2135	spin_lock(&dentry->d_lock);
2136	if (atomic_read(&dentry->d_count) == 2)
2137		__d_drop(dentry);
2138	spin_unlock(&dentry->d_lock);
2139	spin_unlock(&dcache_lock);
2140}
2141
2142int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2143{
2144	int error = may_delete(dir, dentry, 1);
2145
2146	if (error)
2147		return error;
2148
2149	if (!dir->i_op->rmdir)
2150		return -EPERM;
2151
2152	mutex_lock(&dentry->d_inode->i_mutex);
2153	dentry_unhash(dentry);
2154	if (d_mountpoint(dentry))
2155		error = -EBUSY;
2156	else {
2157		error = security_inode_rmdir(dir, dentry);
2158		if (!error) {
2159			error = dir->i_op->rmdir(dir, dentry);
2160			if (!error) {
2161				dentry->d_inode->i_flags |= S_DEAD;
2162				dont_mount(dentry);
2163			}
2164		}
2165	}
2166	mutex_unlock(&dentry->d_inode->i_mutex);
2167	if (!error) {
2168		d_delete(dentry);
2169	}
2170	dput(dentry);
2171
2172	return error;
2173}
2174
2175static long do_rmdir(int dfd, const char __user *pathname)
2176{
2177	int error = 0;
2178	char * name;
2179	struct dentry *dentry;
2180	struct nameidata nd;
2181
2182	error = user_path_parent(dfd, pathname, &nd, &name);
2183	if (error)
2184		return error;
2185
2186	switch(nd.last_type) {
2187	case LAST_DOTDOT:
2188		error = -ENOTEMPTY;
2189		goto exit1;
2190	case LAST_DOT:
2191		error = -EINVAL;
2192		goto exit1;
2193	case LAST_ROOT:
2194		error = -EBUSY;
2195		goto exit1;
2196	}
2197
2198	nd.flags &= ~LOOKUP_PARENT;
2199
2200	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2201	dentry = lookup_hash(&nd);
2202	error = PTR_ERR(dentry);
2203	if (IS_ERR(dentry))
2204		goto exit2;
2205	error = mnt_want_write(nd.path.mnt);
2206	if (error)
2207		goto exit3;
2208	error = security_path_rmdir(&nd.path, dentry);
2209	if (error)
2210		goto exit4;
2211	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2212exit4:
2213	mnt_drop_write(nd.path.mnt);
2214exit3:
2215	dput(dentry);
2216exit2:
2217	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2218exit1:
2219	path_put(&nd.path);
2220	putname(name);
2221	return error;
2222}
2223
2224SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
2225{
2226	return do_rmdir(AT_FDCWD, pathname);
2227}
2228
2229int vfs_unlink(struct inode *dir, struct dentry *dentry)
2230{
2231	int error = may_delete(dir, dentry, 0);
2232
2233	if (error)
2234		return error;
2235
2236	if (!dir->i_op->unlink)
2237		return -EPERM;
2238
2239	mutex_lock(&dentry->d_inode->i_mutex);
2240	if (d_mountpoint(dentry))
2241		error = -EBUSY;
2242	else {
2243		error = security_inode_unlink(dir, dentry);
2244		if (!error) {
2245			error = dir->i_op->unlink(dir, dentry);
2246			if (!error)
2247				dont_mount(dentry);
2248		}
2249	}
2250	mutex_unlock(&dentry->d_inode->i_mutex);
2251
2252	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
2253	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2254		fsnotify_link_count(dentry->d_inode);
2255		d_delete(dentry);
2256	}
2257
2258	return error;
2259}
2260
2261/*
2262 * Make sure that the actual truncation of the file will occur outside its
2263 * directory's i_mutex.  Truncate can take a long time if there is a lot of
2264 * writeout happening, and we don't want to prevent access to the directory
2265 * while waiting on the I/O.
2266 */
2267static long do_unlinkat(int dfd, const char __user *pathname)
2268{
2269	int error;
2270	char *name;
2271	struct dentry *dentry;
2272	struct nameidata nd;
2273	struct inode *inode = NULL;
2274
2275	error = user_path_parent(dfd, pathname, &nd, &name);
2276	if (error)
2277		return error;
2278
2279	error = -EISDIR;
2280	if (nd.last_type != LAST_NORM)
2281		goto exit1;
2282
2283	nd.flags &= ~LOOKUP_PARENT;
2284
2285	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2286	dentry = lookup_hash(&nd);
2287	error = PTR_ERR(dentry);
2288	if (!IS_ERR(dentry)) {
2289		/* Why not before? Because we want correct error value */
2290		if (nd.last.name[nd.last.len])
2291			goto slashes;
2292		inode = dentry->d_inode;
2293		if (inode)
2294			atomic_inc(&inode->i_count);
2295		error = mnt_want_write(nd.path.mnt);
2296		if (error)
2297			goto exit2;
2298		error = security_path_unlink(&nd.path, dentry);
2299		if (error)
2300			goto exit3;
2301		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2302exit3:
2303		mnt_drop_write(nd.path.mnt);
2304	exit2:
2305		dput(dentry);
2306	}
2307	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2308	if (inode)
2309		iput(inode);	/* truncate the inode here */
2310exit1:
2311	path_put(&nd.path);
2312	putname(name);
2313	return error;
2314
2315slashes:
2316	error = !dentry->d_inode ? -ENOENT :
2317		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2318	goto exit2;
2319}
2320
2321SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
2322{
2323	if ((flag & ~AT_REMOVEDIR) != 0)
2324		return -EINVAL;
2325
2326	if (flag & AT_REMOVEDIR)
2327		return do_rmdir(dfd, pathname);
2328
2329	return do_unlinkat(dfd, pathname);
2330}
2331
2332SYSCALL_DEFINE1(unlink, const char __user *, pathname)
2333{
2334	return do_unlinkat(AT_FDCWD, pathname);
2335}
2336
2337int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2338{
2339	int error = may_create(dir, dentry);
2340
2341	if (error)
2342		return error;
2343
2344	if (!dir->i_op->symlink)
2345		return -EPERM;
2346
2347	error = security_inode_symlink(dir, dentry, oldname);
2348	if (error)
2349		return error;
2350
2351	error = dir->i_op->symlink(dir, dentry, oldname);
2352	if (!error)
2353		fsnotify_create(dir, dentry);
2354	return error;
2355}
2356
2357SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
2358		int, newdfd, const char __user *, newname)
2359{
2360	int error;
2361	char *from;
2362	char *to;
2363	struct dentry *dentry;
2364	struct nameidata nd;
2365
2366	from = getname(oldname);
2367	if (IS_ERR(from))
2368		return PTR_ERR(from);
2369
2370	error = user_path_parent(newdfd, newname, &nd, &to);
2371	if (error)
2372		goto out_putname;
2373
2374	dentry = lookup_create(&nd, 0);
2375	error = PTR_ERR(dentry);
2376	if (IS_ERR(dentry))
2377		goto out_unlock;
2378
2379	error = mnt_want_write(nd.path.mnt);
2380	if (error)
2381		goto out_dput;
2382	error = security_path_symlink(&nd.path, dentry, from);
2383	if (error)
2384		goto out_drop_write;
2385	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2386out_drop_write:
2387	mnt_drop_write(nd.path.mnt);
2388out_dput:
2389	dput(dentry);
2390out_unlock:
2391	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2392	path_put(&nd.path);
2393	putname(to);
2394out_putname:
2395	putname(from);
2396	return error;
2397}
2398
2399SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
2400{
2401	return sys_symlinkat(oldname, AT_FDCWD, newname);
2402}
2403
2404int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2405{
2406	struct inode *inode = old_dentry->d_inode;
2407	int error;
2408
2409	if (!inode)
2410		return -ENOENT;
2411
2412	error = may_create(dir, new_dentry);
2413	if (error)
2414		return error;
2415
2416	if (dir->i_sb != inode->i_sb)
2417		return -EXDEV;
2418
2419	/*
2420	 * A link to an append-only or immutable file cannot be created.
2421	 */
2422	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2423		return -EPERM;
2424	if (!dir->i_op->link)
2425		return -EPERM;
2426	if (S_ISDIR(inode->i_mode))
2427		return -EPERM;
2428
2429	error = security_inode_link(old_dentry, dir, new_dentry);
2430	if (error)
2431		return error;
2432
2433	mutex_lock(&inode->i_mutex);
2434	error = dir->i_op->link(old_dentry, dir, new_dentry);
2435	mutex_unlock(&inode->i_mutex);
2436	if (!error)
2437		fsnotify_link(dir, inode, new_dentry);
2438	return error;
2439}
2440
2441/*
2442 * Hardlinks are often used in delicate situations.  We avoid
2443 * security-related surprises by not following symlinks on the
2444 * newname.  --KAB
2445 *
2446 * We don't follow them on the oldname either to be compatible
2447 * with linux 2.0, and to avoid hard-linking to directories
2448 * and other special files.  --ADM
2449 */
2450SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
2451		int, newdfd, const char __user *, newname, int, flags)
2452{
2453	struct dentry *new_dentry;
2454	struct nameidata nd;
2455	struct path old_path;
2456	int error;
2457	char *to;
2458
2459	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2460		return -EINVAL;
2461
2462	error = user_path_at(olddfd, oldname,
2463			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2464			     &old_path);
2465	if (error)
2466		return error;
2467
2468	error = user_path_parent(newdfd, newname, &nd, &to);
2469	if (error)
2470		goto out;
2471	error = -EXDEV;
2472	if (old_path.mnt != nd.path.mnt)
2473		goto out_release;
2474	new_dentry = lookup_create(&nd, 0);
2475	error = PTR_ERR(new_dentry);
2476	if (IS_ERR(new_dentry))
2477		goto out_unlock;
2478	error = mnt_want_write(nd.path.mnt);
2479	if (error)
2480		goto out_dput;
2481	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2482	if (error)
2483		goto out_drop_write;
2484	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2485out_drop_write:
2486	mnt_drop_write(nd.path.mnt);
2487out_dput:
2488	dput(new_dentry);
2489out_unlock:
2490	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2491out_release:
2492	path_put(&nd.path);
2493	putname(to);
2494out:
2495	path_put(&old_path);
2496
2497	return error;
2498}
2499
2500SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
2501{
2502	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2503}
2504
2505static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2506			  struct inode *new_dir, struct dentry *new_dentry)
2507{
2508	int error = 0;
2509	struct inode *target;
2510
2511	/*
2512	 * If we are going to change the parent - check write permissions,
2513	 * we'll need to flip '..'.
2514	 */
2515	if (new_dir != old_dir) {
2516		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
2517		if (error)
2518			return error;
2519	}
2520
2521	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2522	if (error)
2523		return error;
2524
2525	target = new_dentry->d_inode;
2526	if (target)
2527		mutex_lock(&target->i_mutex);
2528	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2529		error = -EBUSY;
2530	else {
2531		if (target)
2532			dentry_unhash(new_dentry);
2533		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2534	}
2535	if (target) {
2536		if (!error) {
2537			target->i_flags |= S_DEAD;
2538			dont_mount(new_dentry);
2539		}
2540		mutex_unlock(&target->i_mutex);
2541		if (d_unhashed(new_dentry))
2542			d_rehash(new_dentry);
2543		dput(new_dentry);
2544	}
2545	if (!error)
2546		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2547			d_move(old_dentry,new_dentry);
2548	return error;
2549}
2550
2551static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2552			    struct inode *new_dir, struct dentry *new_dentry)
2553{
2554	struct inode *target;
2555	int error;
2556
2557	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2558	if (error)
2559		return error;
2560
2561	dget(new_dentry);
2562	target = new_dentry->d_inode;
2563	if (target)
2564		mutex_lock(&target->i_mutex);
2565	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2566		error = -EBUSY;
2567	else
2568		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2569	if (!error) {
2570		if (target)
2571			dont_mount(new_dentry);
2572		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2573			d_move(old_dentry, new_dentry);
2574	}
2575	if (target)
2576		mutex_unlock(&target->i_mutex);
2577	dput(new_dentry);
2578	return error;
2579}
2580
2581int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2582	       struct inode *new_dir, struct dentry *new_dentry)
2583{
2584	int error;
2585	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2586	const unsigned char *old_name;
2587
2588	if (old_dentry->d_inode == new_dentry->d_inode)
2589 		return 0;
2590
2591	error = may_delete(old_dir, old_dentry, is_dir);
2592	if (error)
2593		return error;
2594
2595	if (!new_dentry->d_inode)
2596		error = may_create(new_dir, new_dentry);
2597	else
2598		error = may_delete(new_dir, new_dentry, is_dir);
2599	if (error)
2600		return error;
2601
2602	if (!old_dir->i_op->rename)
2603		return -EPERM;
2604
2605	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2606
2607	if (is_dir)
2608		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2609	else
2610		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2611	if (!error)
2612		fsnotify_move(old_dir, new_dir, old_name, is_dir,
2613			      new_dentry->d_inode, old_dentry);
2614	fsnotify_oldname_free(old_name);
2615
2616	return error;
2617}
2618
2619SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
2620		int, newdfd, const char __user *, newname)
2621{
2622	struct dentry *old_dir, *new_dir;
2623	struct dentry *old_dentry, *new_dentry;
2624	struct dentry *trap;
2625	struct nameidata oldnd, newnd;
2626	char *from;
2627	char *to;
2628	int error;
2629
2630	error = user_path_parent(olddfd, oldname, &oldnd, &from);
2631	if (error)
2632		goto exit;
2633
2634	error = user_path_parent(newdfd, newname, &newnd, &to);
2635	if (error)
2636		goto exit1;
2637
2638	error = -EXDEV;
2639	if (oldnd.path.mnt != newnd.path.mnt)
2640		goto exit2;
2641
2642	old_dir = oldnd.path.dentry;
2643	error = -EBUSY;
2644	if (oldnd.last_type != LAST_NORM)
2645		goto exit2;
2646
2647	new_dir = newnd.path.dentry;
2648	if (newnd.last_type != LAST_NORM)
2649		goto exit2;
2650
2651	oldnd.flags &= ~LOOKUP_PARENT;
2652	newnd.flags &= ~LOOKUP_PARENT;
2653	newnd.flags |= LOOKUP_RENAME_TARGET;
2654
2655	trap = lock_rename(new_dir, old_dir);
2656
2657	old_dentry = lookup_hash(&oldnd);
2658	error = PTR_ERR(old_dentry);
2659	if (IS_ERR(old_dentry))
2660		goto exit3;
2661	/* source must exist */
2662	error = -ENOENT;
2663	if (!old_dentry->d_inode)
2664		goto exit4;
2665	/* unless the source is a directory trailing slashes give -ENOTDIR */
2666	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2667		error = -ENOTDIR;
2668		if (oldnd.last.name[oldnd.last.len])
2669			goto exit4;
2670		if (newnd.last.name[newnd.last.len])
2671			goto exit4;
2672	}
2673	/* source should not be ancestor of target */
2674	error = -EINVAL;
2675	if (old_dentry == trap)
2676		goto exit4;
2677	new_dentry = lookup_hash(&newnd);
2678	error = PTR_ERR(new_dentry);
2679	if (IS_ERR(new_dentry))
2680		goto exit4;
2681	/* target should not be an ancestor of source */
2682	error = -ENOTEMPTY;
2683	if (new_dentry == trap)
2684		goto exit5;
2685
2686	error = mnt_want_write(oldnd.path.mnt);
2687	if (error)
2688		goto exit5;
2689	error = security_path_rename(&oldnd.path, old_dentry,
2690				     &newnd.path, new_dentry);
2691	if (error)
2692		goto exit6;
2693	error = vfs_rename(old_dir->d_inode, old_dentry,
2694				   new_dir->d_inode, new_dentry);
2695exit6:
2696	mnt_drop_write(oldnd.path.mnt);
2697exit5:
2698	dput(new_dentry);
2699exit4:
2700	dput(old_dentry);
2701exit3:
2702	unlock_rename(new_dir, old_dir);
2703exit2:
2704	path_put(&newnd.path);
2705	putname(to);
2706exit1:
2707	path_put(&oldnd.path);
2708	putname(from);
2709exit:
2710	return error;
2711}
2712
2713SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
2714{
2715	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2716}
2717
2718int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2719{
2720	int len;
2721
2722	len = PTR_ERR(link);
2723	if (IS_ERR(link))
2724		goto out;
2725
2726	len = strlen(link);
2727	if (len > (unsigned) buflen)
2728		len = buflen;
2729	if (copy_to_user(buffer, link, len))
2730		len = -EFAULT;
2731out:
2732	return len;
2733}
2734
2735/*
2736 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2737 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2738 * using) it for any given inode is up to filesystem.
2739 */
2740int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2741{
2742	struct nameidata nd;
2743	void *cookie;
2744	int res;
2745
2746	nd.depth = 0;
2747	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2748	if (IS_ERR(cookie))
2749		return PTR_ERR(cookie);
2750
2751	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2752	if (dentry->d_inode->i_op->put_link)
2753		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2754	return res;
2755}
2756
/* Exported wrapper around the file-local __vfs_follow_link(). */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret = __vfs_follow_link(nd, link);

	return ret;
}
2761
2762/* get the link contents into pagecache */
2763static char *page_getlink(struct dentry * dentry, struct page **ppage)
2764{
2765	char *kaddr;
2766	struct page *page;
2767	struct address_space *mapping = dentry->d_inode->i_mapping;
2768	page = read_mapping_page(mapping, 0, NULL);
2769	if (IS_ERR(page))
2770		return (char*)page;
2771	*ppage = page;
2772	kaddr = kmap(page);
2773	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2774	return kaddr;
2775}
2776
2777int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2778{
2779	struct page *page = NULL;
2780	char *s = page_getlink(dentry, &page);
2781	int res = vfs_readlink(dentry,buffer,buflen,s);
2782	if (page) {
2783		kunmap(page);
2784		page_cache_release(page);
2785	}
2786	return res;
2787}
2788
2789void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2790{
2791	struct page *page = NULL;
2792	nd_set_link(nd, page_getlink(dentry, &page));
2793	return page;
2794}
2795
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the
 * page returned there (possibly NULL); unmap and release it.  @dentry and
 * @nd are not used.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *pg = cookie;

	if (!pg)
		return;

	kunmap(pg);
	page_cache_release(pg);
}
2805
2806/*
2807 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2808 */
2809int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2810{
2811	struct address_space *mapping = inode->i_mapping;
2812	struct page *page;
2813	void *fsdata;
2814	int err;
2815	char *kaddr;
2816	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2817	if (nofs)
2818		flags |= AOP_FLAG_NOFS;
2819
2820retry:
2821	err = pagecache_write_begin(NULL, mapping, 0, len-1,
2822				flags, &page, &fsdata);
2823	if (err)
2824		goto fail;
2825
2826	kaddr = kmap_atomic(page, KM_USER0);
2827	memcpy(kaddr, symname, len-1);
2828	kunmap_atomic(kaddr, KM_USER0);
2829
2830	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
2831							page, fsdata);
2832	if (err < 0)
2833		goto fail;
2834	if (err < len-1)
2835		goto retry;
2836
2837	mark_inode_dirty(inode);
2838	return 0;
2839fail:
2840	return err;
2841}
2842
2843int page_symlink(struct inode *inode, const char *symname, int len)
2844{
2845	return __page_symlink(inode, symname, len,
2846			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2847}
2848
2849const struct inode_operations page_symlink_inode_operations = {
2850	.readlink	= generic_readlink,
2851	.follow_link	= page_follow_link_light,
2852	.put_link	= page_put_link,
2853};
2854
/* Pathname lookup and mount traversal. */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);

/* Permission checks and write access. */
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */

/* Directory-modifying VFS operations and their locking helpers. */
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_rename);

/* Symlink helpers. */
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
2887