1// SPDX-License-Identifier: GPL-2.0
2/*
3 *  linux/fs/namei.c
4 *
5 *  Copyright (C) 1991, 1992  Linus Torvalds
6 */
7
8/*
9 * Some corrections by tytso.
10 */
11
12/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
13 * lookup logic.
14 */
15/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
16 */
17
18#include <linux/init.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/wordpart.h>
22#include <linux/fs.h>
23#include <linux/filelock.h>
24#include <linux/namei.h>
25#include <linux/pagemap.h>
26#include <linux/sched/mm.h>
27#include <linux/fsnotify.h>
28#include <linux/personality.h>
29#include <linux/security.h>
30#include <linux/syscalls.h>
31#include <linux/mount.h>
32#include <linux/audit.h>
33#include <linux/capability.h>
34#include <linux/file.h>
35#include <linux/fcntl.h>
36#include <linux/device_cgroup.h>
37#include <linux/fs_struct.h>
38#include <linux/posix_acl.h>
39#include <linux/hash.h>
40#include <linux/bitops.h>
41#include <linux/init_task.h>
42#include <linux/uaccess.h>
43
44#include "internal.h"
45#include "mount.h"
46
47/* [Feb-1997 T. Schoebel-Theuer]
48 * Fundamental changes in the pathname lookup mechanisms (namei)
49 * were necessary because of omirr.  The reason is that omirr needs
50 * to know the _real_ pathname, not the user-supplied one, in case
51 * of symlinks (and also when transname replacements occur).
52 *
53 * The new code replaces the old recursive symlink resolution with
54 * an iterative one (in case of non-nested symlink chains).  It does
55 * this with calls to <fs>_follow_link().
56 * As a side effect, dir_namei(), _namei() and follow_link() are now
57 * replaced with a single function lookup_dentry() that can handle all
58 * the special cases of the former code.
59 *
60 * With the new dcache, the pathname is stored at each inode, at least as
61 * long as the refcount of the inode is positive.  As a side effect, the
62 * size of the dcache depends on the inode cache and thus is dynamic.
63 *
64 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
65 * resolution to correspond with current state of the code.
66 *
67 * Note that the symlink resolution is not *completely* iterative.
68 * There is still a significant amount of tail- and mid- recursion in
69 * the algorithm.  Also, note that <fs>_readlink() is not used in
70 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
71 * may return different results than <fs>_follow_link().  Many virtual
72 * filesystems (including /proc) exhibit this behavior.
73 */
74
75/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
76 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
77 * and the name already exists in form of a symlink, try to create the new
78 * name indicated by the symlink. The old code always complained that the
79 * name already exists, due to not following the symlink even if its target
80 * is nonexistent.  The new semantics affects also mknod() and link() when
81 * the name is a symlink pointing to a non-existent name.
82 *
83 * I don't know which semantics is the right one, since I have no access
84 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
85 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
86 * "old" one. Personally, I think the new semantics is much more logical.
87 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
89 * and in the old Linux semantics.
90 */
91
92/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
93 * semantics.  See the comments in "open_namei" and "do_link" below.
94 *
95 * [10-Sep-98 Alan Modra] Another symlink change.
96 */
97
98/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
99 *	inside the path - always follow.
100 *	in the last component in creation/removal/renaming - never follow.
101 *	if LOOKUP_FOLLOW passed - follow.
102 *	if the pathname has trailing slashes - follow.
103 *	otherwise - don't follow.
104 * (applied in that order).
105 *
106 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
107 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
108 * During the 2.4 we need to fix the userland stuff depending on it -
109 * hopefully we will be able to get rid of that wart in 2.5. So far only
110 * XEmacs seems to be relying on it...
111 */
112/*
113 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
115 * any extra contention...
116 */
117
118/* In order to reduce some races, while at the same time doing additional
119 * checking and hopefully speeding things up, we copy filenames to the
120 * kernel data space before using them..
121 *
122 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
123 * PATH_MAX includes the nul terminator --RR.
124 */
125
/* Longest pathname that still fits in the tail of a names_cache object. */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))

/**
 * getname_flags - copy a pathname from userspace into a kernel buffer
 * @filename:	userspace pointer to the NUL-terminated pathname
 * @flags:	lookup flags; only LOOKUP_EMPTY is examined here
 * @empty:	if non-NULL, set to 1 when the user supplied an empty string
 *
 * Returns a refcounted struct filename (release with putname()) or an
 * ERR_PTR().  An empty path yields -ENOENT unless LOOKUP_EMPTY is set.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
	struct filename *result;
	char *kname;
	int len;

	/* Audit may already hold a copy of this name; reuse it if so. */
	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result->iname;
	result->name = kname;

	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
	if (unlikely(len < 0)) {
		__putname(result);
		return ERR_PTR(len);
	}

	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
		const size_t size = offsetof(struct filename, iname[1]);
		kname = (char *)result;

		/*
		 * size is chosen so that result->iname[0] stays within the
		 * allocation and kname can never equal result->iname, no
		 * matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
		}
		result->name = kname;
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		/* Still no terminating NUL within PATH_MAX: reject. */
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
	}

	atomic_set(&result->refcnt, 1);
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
			*empty = 1;
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
	}

	result->uptr = filename;
	result->aname = NULL;
	audit_getname(result);
	return result;
}
206
207struct filename *
208getname_uflags(const char __user *filename, int uflags)
209{
210	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
211
212	return getname_flags(filename, flags, NULL);
213}
214
/* Plain getname_flags(): no lookup flags, no empty-path reporting. */
struct filename *
getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}
220
/**
 * getname_kernel - wrap a kernel-resident pathname in a struct filename
 * @filename:	NUL-terminated kernel string
 *
 * Counterpart of getname_flags() for names that originate in the kernel,
 * so they can flow through the same lookup/audit machinery.  Returns a
 * refcounted struct filename (release with putname()) or an ERR_PTR().
 */
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
	int len = strlen(filename) + 1;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	if (len <= EMBEDDED_NAME_MAX) {
		/* Short name: embed it in the tail of the same allocation. */
		result->name = (char *)result->iname;
	} else if (len <= PATH_MAX) {
		/* Long name: separate struct; names_cache page holds the string. */
		const size_t size = offsetof(struct filename, iname[1]);
		struct filename *tmp;

		tmp = kmalloc(size, GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
	result->uptr = NULL;
	result->aname = NULL;
	atomic_set(&result->refcnt, 1);
	audit_getname(result);

	return result;
}
EXPORT_SYMBOL(getname_kernel);
257
258void putname(struct filename *name)
259{
260	if (IS_ERR(name))
261		return;
262
263	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
264		return;
265
266	if (!atomic_dec_and_test(&name->refcnt))
267		return;
268
269	if (name->name != name->iname) {
270		__putname(name->name);
271		kfree(name);
272	} else
273		__putname(name);
274}
275EXPORT_SYMBOL(putname);
276
277/**
278 * check_acl - perform ACL permission checking
279 * @idmap:	idmap of the mount the inode was found from
280 * @inode:	inode to check permissions on
281 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
282 *
283 * This function performs the ACL permission checking. Since this function
284 * retrieve POSIX acls it needs to know whether it is called from a blocking or
285 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
286 *
287 * If the inode has been found through an idmapped mount the idmap of
288 * the vfsmount must be passed through @idmap. This function will then take
289 * care to map the inode according to @idmap before checking permissions.
290 * On non-idmapped mounts or if permission checking is to be performed on the
291 * raw inode simply pass @nop_mnt_idmap.
292 */
293static int check_acl(struct mnt_idmap *idmap,
294		     struct inode *inode, int mask)
295{
296#ifdef CONFIG_FS_POSIX_ACL
297	struct posix_acl *acl;
298
299	if (mask & MAY_NOT_BLOCK) {
300		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
301	        if (!acl)
302	                return -EAGAIN;
303		/* no ->get_inode_acl() calls in RCU mode... */
304		if (is_uncached_acl(acl))
305			return -ECHILD;
306	        return posix_acl_permission(idmap, inode, acl, mask);
307	}
308
309	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
310	if (IS_ERR(acl))
311		return PTR_ERR(acl);
312	if (acl) {
313	        int error = posix_acl_permission(idmap, inode, acl, mask);
314	        posix_acl_release(acl);
315	        return error;
316	}
317#endif
318
319	return -EAGAIN;
320}
321
322/**
323 * acl_permission_check - perform basic UNIX permission checking
324 * @idmap:	idmap of the mount the inode was found from
325 * @inode:	inode to check permissions on
326 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
327 *
328 * This function performs the basic UNIX permission checking. Since this
329 * function may retrieve POSIX acls it needs to know whether it is called from a
330 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
331 *
332 * If the inode has been found through an idmapped mount the idmap of
333 * the vfsmount must be passed through @idmap. This function will then take
334 * care to map the inode according to @idmap before checking permissions.
335 * On non-idmapped mounts or if permission checking is to be performed on the
336 * raw inode simply pass @nop_mnt_idmap.
337 */
static int acl_permission_check(struct mnt_idmap *idmap,
				struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;
	vfsuid_t vfsuid;

	/* Are we the owner? If so, ACL's don't matter */
	vfsuid = i_uid_into_vfsuid(idmap, inode);
	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
		mask &= 7;	/* only the rwx bits matter for mode checks */
		mode >>= 6;	/* shift the "owner" permission bits into place */
		return (mask & ~mode) ? -EACCES : 0;
	}

	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(idmap, inode, mask);
		/* -EAGAIN means "no ACL decision": fall back to mode bits. */
		if (error != -EAGAIN)
			return error;
	}

	/* Only RWX matters for group/other mode bits */
	mask &= 7;

	/*
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
	 */
	if (mask & (mode ^ (mode >> 3))) {
		vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
		if (vfsgid_in_group_p(vfsgid))
			mode >>= 3;	/* use the "group" permission bits */
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
}
376
377/**
378 * generic_permission -  check for access rights on a Posix-like filesystem
379 * @idmap:	idmap of the mount the inode was found from
380 * @inode:	inode to check access rights for
381 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
382 *		%MAY_NOT_BLOCK ...)
383 *
384 * Used to check for read/write/execute permissions on a file.
385 * We use "fsuid" for this, letting us set arbitrary permissions
386 * for filesystem access without changing the "normal" uids which
387 * are used for other things.
388 *
389 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
390 * request cannot be satisfied (eg. requires blocking or too much complexity).
391 * It would then be called again in ref-walk mode.
392 *
393 * If the inode has been found through an idmapped mount the idmap of
394 * the vfsmount must be passed through @idmap. This function will then take
395 * care to map the inode according to @idmap before checking permissions.
396 * On non-idmapped mounts or if permission checking is to be performed on the
397 * raw inode simply pass @nop_mnt_idmap.
398 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
		       int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(idmap, inode, mask);
	/* Capability overrides below apply only when mode/ACL said -EACCES. */
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
			if (capable_wrt_inode_uidgid(idmap, inode,
						     CAP_DAC_READ_SEARCH))
				return 0;
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;
		return -EACCES;
	}

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_READ_SEARCH))
			return 0;
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;

	return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
444
445/**
446 * do_inode_permission - UNIX permission checking
447 * @idmap:	idmap of the mount the inode was found from
448 * @inode:	inode to check permissions on
449 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
450 *
451 * We _really_ want to just do "generic_permission()" without
452 * even looking at the inode->i_op values. So we keep a cache
453 * flag in inode->i_opflags, that says "this has not special
454 * permission function, use the fast case".
455 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
				      struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(idmap, inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	/* Fast path: no ->permission() hook, use the generic checks directly. */
	return generic_permission(idmap, inode, mask);
}
470
471/**
472 * sb_permission - Check superblock-level permissions
473 * @sb: Superblock of inode to check permission on
474 * @inode: Inode to check permission on
475 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
476 *
477 * Separate out file-system wide checks from inode-specific permission checks.
478 */
479static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
480{
481	if (unlikely(mask & MAY_WRITE)) {
482		umode_t mode = inode->i_mode;
483
484		/* Nobody gets write access to a read-only fs. */
485		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
486			return -EROFS;
487	}
488	return 0;
489}
490
491/**
492 * inode_permission - Check for access rights to a given inode
493 * @idmap:	idmap of the mount the inode was found from
494 * @inode:	Inode to check permission on
495 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
496 *
497 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
498 * this, letting us set arbitrary permissions for filesystem access without
499 * changing the "normal" UIDs which are used for other things.
500 *
501 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
502 */
int inode_permission(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
	int retval;

	/* Filesystem-wide checks (e.g. read-only superblock) come first. */
	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(idmap, inode))
			return -EACCES;
	}

	retval = do_inode_permission(idmap, inode, mask);
	if (retval)
		return retval;

	/* Device cgroup may veto access to device nodes. */
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	/* Finally give the security modules a chance to object. */
	return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
539
540/**
541 * path_get - get a reference to a path
542 * @path: path to get the reference to
543 *
544 * Given a path increment the reference count to the dentry and the vfsmount.
545 */
void path_get(const struct path *path)
{
	/* Grab the mount before the dentry; path_put() releases in reverse. */
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
552
553/**
554 * path_put - put a reference to a path
555 * @path: path to put the reference to
556 *
557 * Given a path decrement the reference count to the dentry and the vfsmount.
558 */
void path_put(const struct path *path)
{
	/* Release in the reverse order of path_get(). */
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
565
/* Symlink-stack entries embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2
struct nameidata {
	struct path	path;		/* current position in the walk */
	struct qstr	last;		/* last component parsed */
	struct path	root;		/* root used for absolute lookups */
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags, state;	/* LOOKUP_* flags; ND_* state bits below */
	unsigned	seq, next_seq, m_seq, r_seq;	/* seqcount snapshots for rcu-walk */
	int		last_type;	/* kind of the final component */
	unsigned	depth;		/* number of entries on the symlink stack */
	int		total_link_count;	/* symlinks followed, shared across nested walks */
	struct saved {
		struct path link;	/* the symlink being traversed */
		struct delayed_call done;	/* deferred ->get_link() cleanup */
		const char *name;	/* body of the symlink */
		unsigned seq;		/* d_seq snapshot for the link dentry */
	} *stack, internal[EMBEDDED_LEVELS];
	struct filename	*name;		/* pathname being walked */
	struct nameidata *saved;	/* enclosing walk, restored on exit */
	unsigned	root_seq;	/* d_seq snapshot of root.dentry */
	int		dfd;		/* starting dirfd for relative lookups */
	vfsuid_t	dir_vfsuid;
	umode_t		dir_mode;
} __randomize_layout;

/* Bits for nameidata.state */
#define ND_ROOT_PRESET 1	/* nd->root was supplied by the caller */
#define ND_ROOT_GRABBED 2	/* we hold a reference on nd->root */
#define ND_JUMPED 4		/* walk jumped (nd_jump_root()/nd_jump_link()) */
594
595static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
596{
597	struct nameidata *old = current->nameidata;
598	p->stack = p->internal;
599	p->depth = 0;
600	p->dfd = dfd;
601	p->name = name;
602	p->path.mnt = NULL;
603	p->path.dentry = NULL;
604	p->total_link_count = old ? old->total_link_count : 0;
605	p->saved = old;
606	current->nameidata = p;
607}
608
609static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
610			  const struct path *root)
611{
612	__set_nameidata(p, dfd, name);
613	p->state = 0;
614	if (unlikely(root)) {
615		p->state = ND_ROOT_PRESET;
616		p->root = *root;
617	}
618}
619
620static void restore_nameidata(void)
621{
622	struct nameidata *now = current->nameidata, *old = now->saved;
623
624	current->nameidata = old;
625	if (old)
626		old->total_link_count = now->total_link_count;
627	if (now->stack != now->internal)
628		kfree(now->stack);
629}
630
631static bool nd_alloc_stack(struct nameidata *nd)
632{
633	struct saved *p;
634
635	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
636			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
637	if (unlikely(!p))
638		return false;
639	memcpy(p, nd->internal, sizeof(nd->internal));
640	nd->stack = p;
641	return true;
642}
643
644/**
645 * path_connected - Verify that a dentry is below mnt.mnt_root
646 * @mnt: The mountpoint to check.
647 * @dentry: The dentry to check.
648 *
649 * Rename can sometimes move a file or directory outside of a bind
650 * mount, path_connected allows those cases to be detected.
651 */
652static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
653{
654	struct super_block *sb = mnt->mnt_sb;
655
656	/* Bind mounts can have disconnected paths */
657	if (mnt->mnt_root == sb->s_root)
658		return true;
659
660	return is_subdir(dentry, mnt->mnt_root);
661}
662
663static void drop_links(struct nameidata *nd)
664{
665	int i = nd->depth;
666	while (i--) {
667		struct saved *last = nd->stack + i;
668		do_delayed_call(&last->done);
669		clear_delayed_call(&last->done);
670	}
671}
672
/*
 * Leave rcu-walk mode: clear LOOKUP_RCU and the sequence numbers that are
 * only meaningful under rcu_read_lock(), then drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}
679
/*
 * Release everything a (possibly failed) walk has accumulated: the saved
 * link stack, nd->path and - in ref-walk mode - any reference taken on
 * nd->root.  Leaves nd->path cleared and the stack empty.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->state & ND_ROOT_GRABBED) {
			path_put(&nd->root);
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		/* rcu-walk holds no references; just leave the read section. */
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
}
699
/* path_put is needed afterwards regardless of success or failure */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
	int res = __legitimize_mnt(path->mnt, mseq);
	if (unlikely(res)) {
		/*
		 * NOTE(review): res > 0 appears to mean "no mount reference
		 * was taken", so path->mnt is cleared to keep the later
		 * path_put() from dropping one - confirm against
		 * __legitimize_mnt()'s return contract.
		 */
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	/* Try to take a real reference on the dentry. */
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	/* Reference taken; validate it against the sampled sequence count. */
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
716
/* Legitimize @path using nd's mount seqcount snapshot. */
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	return __legitimize_path(path, seq, nd->m_seq);
}
722
/*
 * Take real references on every saved link on the stack so the walk can
 * continue in ref-walk mode.  On failure, nd->depth is left at the number
 * of entries that terminate_walk() still needs to path_put().
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		/* LOOKUP_CACHED: dropping out of rcu-walk is itself a failure. */
		drop_links(nd);
		nd->depth = 0;
		return false;
	}
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			/* entries 0..i now hold (possibly partial) references */
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}
741
static bool legitimize_root(struct nameidata *nd)
{
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
		return true;
	/* Mark first, so terminate_walk() drops the ref even on failure. */
	nd->state |= ND_ROOT_GRABBED;
	return legitimize_path(nd, &nd->root, nd->root_seq);
}
750
751/*
752 * Path walking has 2 modes, rcu-walk and ref-walk (see
753 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
754 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
755 * normal reference counts on dentries and vfsmounts to transition to ref-walk
756 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
757 * got stuck, so ref-walk may continue from there. If this is not successful
758 * (eg. a seqcount has changed), then failure is returned and it's up to caller
759 * to restart the path walk from the beginning in ref-walk mode.
760 */
761
762/**
763 * try_to_unlazy - try to switch to ref-walk mode.
764 * @nd: nameidata pathwalk data
765 * Returns: true on success, false on failure
766 *
767 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
768 * for ref-walk mode.
769 * Must be called from rcu-walk context.
770 * Nothing should touch nameidata between try_to_unlazy() failure and
771 * terminate_walk().
772 */
static bool try_to_unlazy(struct nameidata *nd)
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	if (unlikely(!legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

out1:
	/* Links failed: no reference was taken on nd->path, so clear it. */
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	/* terminate_walk() will release whatever did get legitimized. */
	leave_rcu(nd);
	return false;
}
796
797/**
798 * try_to_unlazy_next - try to switch to ref-walk mode.
799 * @nd: nameidata pathwalk data
800 * @dentry: next dentry to step into
801 * Returns: true on success, false on failure
802 *
803 * Similar to try_to_unlazy(), but here we have the next dentry already
804 * picked by rcu-walk and want to legitimize that in addition to the current
805 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
806 * Nothing should touch nameidata between try_to_unlazy_next() failure and
807 * terminate_walk().
808 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	if (unlikely(!legitimize_links(nd)))
		goto out2;
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			goto out2;
		goto out1;
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

	/*
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number
	 * still matches.
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	leave_rcu(nd);
	return true;

out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
out_dput:
	/* A reference on @dentry was taken; drop it outside the RCU section. */
	leave_rcu(nd);
	dput(dentry);
	return false;
}
857
858static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
859{
860	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
861		return dentry->d_op->d_revalidate(dentry, flags);
862	else
863		return 1;
864}
865
866/**
867 * complete_walk - successful completion of path walk
868 * @nd:  pointer nameidata
869 *
870 * If we had been in RCU mode, drop out of it and legitimize nd->path.
871 * Revalidate the final result, unless we'd already done that during
872 * the path walk or the filesystem doesn't ask for it.  Return 0 on
873 * success, -error on failure.  In case of failure caller does not
874 * need to drop nd->path.
875 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (!(nd->state & ND_ROOT_PRESET))
			if (!(nd->flags & LOOKUP_IS_SCOPED))
				nd->root.mnt = NULL;
		nd->flags &= ~LOOKUP_CACHED;
		if (!try_to_unlazy(nd))
			return -ECHILD;
	}

	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

	/* Weak revalidation is only needed when the walk jumped (ND_JUMPED). */
	if (likely(!(nd->state & ND_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
		return 0;

	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
	if (status > 0)
		return 0;	/* positive status: dentry is still good */

	/* Zero means "invalid but no specific error": report as stale. */
	if (!status)
		status = -ESTALE;

	return status;
}
930
/* Populate nd->root from the task's fs_struct. */
static int set_root(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		/* Sample fs->root and its d_seq consistently, retrying on change. */
		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
		/* Remember that terminate_walk() must drop this reference. */
		nd->state |= ND_ROOT_GRABBED;
	}
	return 0;
}
957
/* Move the walk to nd->root, setting it up first if necessary. */
static int nd_jump_root(struct nameidata *nd)
{
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		/* Root changed under us: caller must restart in ref-walk mode. */
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->state |= ND_JUMPED;
	return 0;
}
989
/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 * The reference is consumed on both success and failure.
 */
int nd_jump_link(const struct path *path)
{
	int error = -ELOOP;
	struct nameidata *nd = current->nameidata;

	/* LOOKUP_NO_MAGICLINKS forbids this kind of jump entirely. */
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;

	/* Swap in the new position; we inherit the caller's reference. */
	path_put(&nd->path);
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->state |= ND_JUMPED;
	return 0;

err:
	/* Failure still consumes the caller's reference. */
	path_put(path);
	return error;
}
1021
/* Pop the top saved link; in rcu-walk mode no reference is held on it. */
static inline void put_link(struct nameidata *nd)
{
	struct saved *last = nd->stack + --nd->depth;
	do_delayed_call(&last->done);
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
}
1029
1030static int sysctl_protected_symlinks __read_mostly;
1031static int sysctl_protected_hardlinks __read_mostly;
1032static int sysctl_protected_fifos __read_mostly;
1033static int sysctl_protected_regular __read_mostly;
1034
1035#ifdef CONFIG_SYSCTL
/* Sysctl table for the fs.protected_* knobs above. */
static struct ctl_table namei_sysctls[] = {
	{
		.procname	= "protected_symlinks",
		.data		= &sysctl_protected_symlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* boolean: clamped to [0, 1] */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_hardlinks",
		.data		= &sysctl_protected_hardlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* boolean: clamped to [0, 1] */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_fifos",
		.data		= &sysctl_protected_fifos,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* 0-2: level 2 also covers group-writable sticky dirs */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "protected_regular",
		.data		= &sysctl_protected_regular,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* 0-2: level 2 also covers group-writable sticky dirs */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
};
1074
/* Register the "fs" sysctl table above at fs-initcall time. */
static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);
1081
1082#endif /* CONFIG_SYSCTL */
1083
1084/**
1085 * may_follow_link - Check symlink following for unsafe situations
1086 * @nd: nameidata pathwalk data
1087 * @inode: Used for idmapping.
1088 *
1089 * In the case of the sysctl_protected_symlinks sysctl being enabled,
1090 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1091 * in a sticky world-writable directory. This is to protect privileged
1092 * processes from failing races against path names that may change out
1093 * from under them by way of other users creating malicious symlinks.
1094 * It will permit symlinks to be followed only when outside a sticky
1095 * world-writable directory, or when the uid of the symlink and follower
1096 * match, or when the directory owner matches the symlink's owner.
1097 *
1098 * Returns 0 if following the symlink is allowed, -ve on error.
1099 */
1100static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
1101{
1102	struct mnt_idmap *idmap;
1103	vfsuid_t vfsuid;
1104
1105	if (!sysctl_protected_symlinks)
1106		return 0;
1107
1108	idmap = mnt_idmap(nd->path.mnt);
1109	vfsuid = i_uid_into_vfsuid(idmap, inode);
1110	/* Allowed if owner and follower match. */
1111	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1112		return 0;
1113
1114	/* Allowed if parent directory not sticky and world-writable. */
1115	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
1116		return 0;
1117
1118	/* Allowed if parent directory and link owner match. */
1119	if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
1120		return 0;
1121
1122	if (nd->flags & LOOKUP_RCU)
1123		return -ECHILD;
1124
1125	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
1126	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
1127	return -EACCES;
1128}
1129
1130/**
1131 * safe_hardlink_source - Check for safe hardlink conditions
1132 * @idmap: idmap of the mount the inode was found from
1133 * @inode: the source inode to hardlink from
1134 *
1135 * Return false if at least one of the following conditions:
1136 *    - inode is not a regular file
1137 *    - inode is setuid
1138 *    - inode is setgid and group-exec
1139 *    - access failure for read and write
1140 *
1141 * Otherwise returns true.
1142 */
1143static bool safe_hardlink_source(struct mnt_idmap *idmap,
1144				 struct inode *inode)
1145{
1146	umode_t mode = inode->i_mode;
1147
1148	/* Special files should not get pinned to the filesystem. */
1149	if (!S_ISREG(mode))
1150		return false;
1151
1152	/* Setuid files should not get pinned to the filesystem. */
1153	if (mode & S_ISUID)
1154		return false;
1155
1156	/* Executable setgid files should not get pinned to the filesystem. */
1157	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1158		return false;
1159
1160	/* Hardlinking to unreadable or unwritable sources is dangerous. */
1161	if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE))
1162		return false;
1163
1164	return true;
1165}
1166
1167/**
1168 * may_linkat - Check permissions for creating a hardlink
1169 * @idmap: idmap of the mount the inode was found from
1170 * @link:  the source to hardlink from
1171 *
1172 * Block hardlink when all of:
1173 *  - sysctl_protected_hardlinks enabled
1174 *  - fsuid does not match inode
1175 *  - hardlink source is unsafe (see safe_hardlink_source() above)
1176 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
1177 *
1178 * If the inode has been found through an idmapped mount the idmap of
1179 * the vfsmount must be passed through @idmap. This function will then take
1180 * care to map the inode according to @idmap before checking permissions.
1181 * On non-idmapped mounts or if permission checking is to be performed on the
1182 * raw inode simply pass @nop_mnt_idmap.
1183 *
1184 * Returns 0 if successful, -ve on error.
1185 */
1186int may_linkat(struct mnt_idmap *idmap, const struct path *link)
1187{
1188	struct inode *inode = link->dentry->d_inode;
1189
1190	/* Inode writeback is not safe when the uid or gid are invalid. */
1191	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
1192	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
1193		return -EOVERFLOW;
1194
1195	if (!sysctl_protected_hardlinks)
1196		return 0;
1197
1198	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
1199	 * otherwise, it must be a safe source.
1200	 */
1201	if (safe_hardlink_source(idmap, inode) ||
1202	    inode_owner_or_capable(idmap, inode))
1203		return 0;
1204
1205	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
1206	return -EPERM;
1207}
1208
1209/**
1210 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1211 *			  should be allowed, or not, on files that already
1212 *			  exist.
1213 * @idmap: idmap of the mount the inode was found from
1214 * @nd: nameidata pathwalk data
1215 * @inode: the inode of the file to open
1216 *
1217 * Block an O_CREAT open of a FIFO (or a regular file) when:
1218 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1219 *   - the file already exists
1220 *   - we are in a sticky directory
1221 *   - we don't own the file
1222 *   - the owner of the directory doesn't own the file
1223 *   - the directory is world writable
1224 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1225 * the directory doesn't have to be world writable: being group writable will
1226 * be enough.
1227 *
1228 * If the inode has been found through an idmapped mount the idmap of
1229 * the vfsmount must be passed through @idmap. This function will then take
1230 * care to map the inode according to @idmap before checking permissions.
1231 * On non-idmapped mounts or if permission checking is to be performed on the
1232 * raw inode simply pass @nop_mnt_idmap.
1233 *
1234 * Returns 0 if the open is allowed, -ve on error.
1235 */
1236static int may_create_in_sticky(struct mnt_idmap *idmap,
1237				struct nameidata *nd, struct inode *const inode)
1238{
1239	umode_t dir_mode = nd->dir_mode;
1240	vfsuid_t dir_vfsuid = nd->dir_vfsuid;
1241
1242	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
1243	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
1244	    likely(!(dir_mode & S_ISVTX)) ||
1245	    vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) ||
1246	    vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
1247		return 0;
1248
1249	if (likely(dir_mode & 0002) ||
1250	    (dir_mode & 0020 &&
1251	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
1252	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
1253		const char *operation = S_ISFIFO(inode->i_mode) ?
1254					"sticky_create_fifo" :
1255					"sticky_create_regular";
1256		audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1257		return -EACCES;
1258	}
1259	return 0;
1260}
1261
1262/*
1263 * follow_up - Find the mountpoint of path's vfsmount
1264 *
1265 * Given a path, find the mountpoint of its source file system.
1266 * Replace @path with the path of the mountpoint in the parent mount.
1267 * Up is towards /.
1268 *
1269 * Return 1 if we went up a level and 0 if we were already at the
1270 * root.
1271 */
1272int follow_up(struct path *path)
1273{
1274	struct mount *mnt = real_mount(path->mnt);
1275	struct mount *parent;
1276	struct dentry *mountpoint;
1277
1278	read_seqlock_excl(&mount_lock);
1279	parent = mnt->mnt_parent;
1280	if (parent == mnt) {
1281		read_sequnlock_excl(&mount_lock);
1282		return 0;
1283	}
1284	mntget(&parent->mnt);
1285	mountpoint = dget(mnt->mnt_mountpoint);
1286	read_sequnlock_excl(&mount_lock);
1287	dput(path->dentry);
1288	path->dentry = mountpoint;
1289	mntput(path->mnt);
1290	path->mnt = &parent->mnt;
1291	return 1;
1292}
1293EXPORT_SYMBOL(follow_up);
1294
/*
 * Walk up the mount tree from @m, skipping mounts that sit directly on
 * their parent's root, until we find a mountpoint that is a real step
 * upwards, stopping short of crossing @root.  Runs under RCU: on
 * success *path is filled (no references taken) and the mountpoint's
 * d_seq is sampled into *seqp for the caller to validate.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* never climb out of the subtree rooted at @root */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}
1314
/*
 * Reference-taking wrapper around choose_mountpoint_rcu(): retry the
 * RCU walk until either "not found" is stable against mount_lock, or
 * the found path is successfully legitimized (counted references
 * grabbed).  On success *path holds references the caller must put.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* negative result valid only if no mount activity raced us */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* failed to grab refs; drop whatever we got and retry */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}
1339
1340/*
1341 * Perform an automount
1342 * - return -EISDIR to tell follow_managed() to stop and return the path we
1343 *   were called with.
1344 */
1345static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
1346{
1347	struct dentry *dentry = path->dentry;
1348
1349	/* We don't want to mount if someone's just doing a stat -
1350	 * unless they're stat'ing a directory and appended a '/' to
1351	 * the name.
1352	 *
1353	 * We do, however, want to mount if someone wants to open or
1354	 * create a file of any type under the mountpoint, wants to
1355	 * traverse through the mountpoint or wants to open the
1356	 * mounted directory.  Also, autofs may mark negative dentries
1357	 * as being automount points.  These will need the attentions
1358	 * of the daemon to instantiate them before they can be used.
1359	 */
1360	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1361			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
1362	    dentry->d_inode)
1363		return -EISDIR;
1364
1365	if (count && (*count)++ >= MAXSYMLINKS)
1366		return -ELOOP;
1367
1368	return finish_automount(dentry->d_op->d_automount(path), path);
1369}
1370
1371/*
1372 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
1373 * dentries are pinned but not locked here, so negative dentry can go
1374 * positive right under us.  Use of smp_load_acquire() provides a barrier
1375 * sufficient for ->d_inode and ->d_flags consistency.
1376 */
1377static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
1378			     int *count, unsigned lookup_flags)
1379{
1380	struct vfsmount *mnt = path->mnt;
1381	bool need_mntput = false;
1382	int ret = 0;
1383
1384	while (flags & DCACHE_MANAGED_DENTRY) {
1385		/* Allow the filesystem to manage the transit without i_mutex
1386		 * being held. */
1387		if (flags & DCACHE_MANAGE_TRANSIT) {
1388			ret = path->dentry->d_op->d_manage(path, false);
1389			flags = smp_load_acquire(&path->dentry->d_flags);
1390			if (ret < 0)
1391				break;
1392		}
1393
1394		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
1395			struct vfsmount *mounted = lookup_mnt(path);
1396			if (mounted) {		// ... in our namespace
1397				dput(path->dentry);
1398				if (need_mntput)
1399					mntput(path->mnt);
1400				path->mnt = mounted;
1401				path->dentry = dget(mounted->mnt_root);
1402				// here we know it's positive
1403				flags = path->dentry->d_flags;
1404				need_mntput = true;
1405				continue;
1406			}
1407		}
1408
1409		if (!(flags & DCACHE_NEED_AUTOMOUNT))
1410			break;
1411
1412		// uncovered automount point
1413		ret = follow_automount(path, count, lookup_flags);
1414		flags = smp_load_acquire(&path->dentry->d_flags);
1415		if (ret < 0)
1416			break;
1417	}
1418
1419	if (ret == -EISDIR)
1420		ret = 0;
1421	// possible if you race with several mount --move
1422	if (need_mntput && path->mnt == mnt)
1423		mntput(path->mnt);
1424	if (!ret && unlikely(d_flags_negative(flags)))
1425		ret = -ENOENT;
1426	*jumped = need_mntput;
1427	return ret;
1428}
1429
1430static inline int traverse_mounts(struct path *path, bool *jumped,
1431				  int *count, unsigned lookup_flags)
1432{
1433	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1434
1435	/* fastpath */
1436	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1437		*jumped = false;
1438		if (unlikely(d_flags_negative(flags)))
1439			return -ENOENT;
1440		return 0;
1441	}
1442	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1443}
1444
1445int follow_down_one(struct path *path)
1446{
1447	struct vfsmount *mounted;
1448
1449	mounted = lookup_mnt(path);
1450	if (mounted) {
1451		dput(path->dentry);
1452		mntput(path->mnt);
1453		path->mnt = mounted;
1454		path->dentry = dget(mounted->mnt_root);
1455		return 1;
1456	}
1457	return 0;
1458}
1459EXPORT_SYMBOL(follow_down_one);
1460
1461/*
1462 * Follow down to the covering mount currently visible to userspace.  At each
1463 * point, the filesystem owning that dentry may be queried as to whether the
1464 * caller is permitted to proceed or not.
1465 */
1466int follow_down(struct path *path, unsigned int flags)
1467{
1468	struct vfsmount *mnt = path->mnt;
1469	bool jumped;
1470	int ret = traverse_mounts(path, &jumped, NULL, flags);
1471
1472	if (path->mnt != mnt)
1473		mntput(mnt);
1474	return ret;
1475}
1476EXPORT_SYMBOL(follow_down);
1477
1478/*
1479 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
1480 * we meet a managed dentry that would need blocking.
1481 */
1482static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
1483{
1484	struct dentry *dentry = path->dentry;
1485	unsigned int flags = dentry->d_flags;
1486
1487	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1488		return true;
1489
1490	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1491		return false;
1492
1493	for (;;) {
1494		/*
1495		 * Don't forget we might have a non-mountpoint managed dentry
1496		 * that wants to block transit.
1497		 */
1498		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1499			int res = dentry->d_op->d_manage(path, true);
1500			if (res)
1501				return res == -EISDIR;
1502			flags = dentry->d_flags;
1503		}
1504
1505		if (flags & DCACHE_MOUNTED) {
1506			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1507			if (mounted) {
1508				path->mnt = &mounted->mnt;
1509				dentry = path->dentry = mounted->mnt.mnt_root;
1510				nd->state |= ND_JUMPED;
1511				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1512				flags = dentry->d_flags;
1513				// makes sure that non-RCU pathwalk could reach
1514				// this state.
1515				if (read_seqretry(&mount_lock, nd->m_seq))
1516					return false;
1517				continue;
1518			}
1519			if (read_seqretry(&mount_lock, nd->m_seq))
1520				return false;
1521		}
1522		return !(flags & DCACHE_NEED_AUTOMOUNT);
1523	}
1524}
1525
/*
 * Cross whatever is mounted on @dentry, leaving the final location in
 * *path.  In RCU mode __follow_mount_rcu() is tried first; on failure
 * we drop to ref-walk (try_to_unlazy_next) and redo the traversal by
 * reference.  A jump to a different mount sets ND_JUMPED, or fails
 * with -EXDEV under LOOKUP_NO_XDEV.  On error all references taken
 * into *path are dropped.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path)
{
	bool jumped;
	int ret;

	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = nd->next_seq;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (!try_to_unlazy_next(nd, dentry))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
			nd->state |= ND_JUMPED;
	}
	if (unlikely(ret)) {
		/* drop whatever traverse_mounts() left us holding */
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	}
	return ret;
}
1559
1560/*
1561 * This looks up the name in dcache and possibly revalidates the found dentry.
1562 * NULL is returned if the dentry does not exist in the cache.
1563 */
1564static struct dentry *lookup_dcache(const struct qstr *name,
1565				    struct dentry *dir,
1566				    unsigned int flags)
1567{
1568	struct dentry *dentry = d_lookup(dir, name);
1569	if (dentry) {
1570		int error = d_revalidate(dentry, flags);
1571		if (unlikely(error <= 0)) {
1572			if (!error)
1573				d_invalidate(dentry);
1574			dput(dentry);
1575			return ERR_PTR(error);
1576		}
1577	}
1578	return dentry;
1579}
1580
1581/*
1582 * Parent directory has inode locked exclusive.  This is one
1583 * and only case when ->lookup() gets called on non in-lookup
1584 * dentries - as the matter of fact, this only gets called
1585 * when directory is guaranteed to have no in-lookup children
1586 * at all.
1587 */
1588struct dentry *lookup_one_qstr_excl(const struct qstr *name,
1589				    struct dentry *base,
1590				    unsigned int flags)
1591{
1592	struct dentry *dentry = lookup_dcache(name, base, flags);
1593	struct dentry *old;
1594	struct inode *dir = base->d_inode;
1595
1596	if (dentry)
1597		return dentry;
1598
1599	/* Don't create child dentry for a dead directory. */
1600	if (unlikely(IS_DEADDIR(dir)))
1601		return ERR_PTR(-ENOENT);
1602
1603	dentry = d_alloc(base, name);
1604	if (unlikely(!dentry))
1605		return ERR_PTR(-ENOMEM);
1606
1607	old = dir->i_op->lookup(dir, dentry, flags);
1608	if (unlikely(old)) {
1609		dput(dentry);
1610		dentry = old;
1611	}
1612	return dentry;
1613}
1614EXPORT_SYMBOL(lookup_one_qstr_excl);
1615
/*
 * Dcache-only child lookup for the current component (nd->last).
 * Returns the dentry, NULL on a cache miss (caller falls back to
 * lookup_slow()), or an ERR_PTR - in particular -ECHILD when RCU mode
 * has to be abandoned.
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			/* miss: must leave RCU mode before the slow path */
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
			return NULL;
		}

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 */
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		status = d_revalidate(dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(dentry, nd->flags);
	} else {
		dentry = __d_lookup(parent, &nd->last);
		if (unlikely(!dentry))
			return NULL;
		status = d_revalidate(dentry, nd->flags);
	}
	if (unlikely(status <= 0)) {
		/* stale (0) or hard error (<0) from revalidation */
		if (!status)
			d_invalidate(dentry);
		dput(dentry);
		return ERR_PTR(status);
	}
	return dentry;
}
1663
/*
 * Fast lookup failed, do it the slow way: allocate (or find) a dentry
 * via d_alloc_parallel() and, if we won the race to be the in-lookup
 * one, call the filesystem's ->lookup().  Caller holds dir's inode
 * lock at least shared.
 */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* somebody else instantiated it first - revalidate their result */
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				/* stale: invalidate and retry from scratch */
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		/* we own the in-lookup dentry - ask the filesystem */
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			/* ->lookup() substituted a different dentry */
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}
1701
1702static struct dentry *lookup_slow(const struct qstr *name,
1703				  struct dentry *dir,
1704				  unsigned int flags)
1705{
1706	struct inode *inode = dir->d_inode;
1707	struct dentry *res;
1708	inode_lock_shared(inode);
1709	res = __lookup_slow(name, dir, flags);
1710	inode_unlock_shared(inode);
1711	return res;
1712}
1713
/*
 * Check exec ("search") permission on the directory we are about to
 * walk through.  In RCU mode a non-blocking check is tried first;
 * if that can't be decided without blocking (-ECHILD), drop out of
 * RCU mode and redo the check the blocking way.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
			     struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
		int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
		if (!err)		// success, keep going
			return 0;
		if (!try_to_unlazy(nd))
			return -ECHILD;	// redo it all non-lazy
		if (err != -ECHILD)	// hard error
			return err;
	}
	return inode_permission(idmap, nd->inode, MAY_EXEC);
}
1728
/*
 * Make sure there's room on nd->stack for one more symlink, enforcing
 * the MAXSYMLINKS nesting limit.  May have to leave RCU mode (after
 * pinning @link) in order to allocate a bigger stack.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;

	/* room left in the current stack? */
	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	/* already on a heap-allocated (larger) stack? */
	if (likely(nd->stack != nd->internal))
		return 0;
	if (likely(nd_alloc_stack(nd)))
		return 0;

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;

		/* now we can block on the allocation */
		if (nd_alloc_stack(nd))
			return 0;
	}
	return -ENOMEM;
}
1754
/* Flag bits for step_into()/walk_component()/pick_link(). */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1756
/*
 * Push the symlink at *link onto nd->stack and fetch its body.
 * Returns the remaining pathname to walk, NULL if the link was fully
 * consumed (pure jump), or an ERR_PTR on failure.  @flags carries the
 * WALK_* bits from the caller.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error = reserve_stack(nd, link);

	if (unlikely(error)) {
		/* in RCU mode no reference was taken that we'd have to drop */
		if (!(nd->flags & LOOKUP_RCU))
			path_put(link);
		return ERR_PTR(error);
	}
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

	/* symlink traversal may be forbidden per-lookup or per-mount */
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
		/* atime update can block - leave RCU mode first */
		if (!try_to_unlazy(nd))
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	/* fast path: body cached in ->i_link; otherwise ask ->get_link() */
	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
			/* -ECHILD from ->get_link(): retry by reference */
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		/* absolute symlink: restart from root, skip extra slashes */
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
}
1828
1829/*
1830 * Do we need to follow links? We _really_ want to be able
1831 * to do this check without having to look at inode->i_op,
1832 * so we keep a cache of "no, this doesn't need follow_link"
1833 * for the common case.
1834 *
1835 * NOTE: dentry must be what nd->next_seq had been sampled from.
1836 */
1837static const char *step_into(struct nameidata *nd, int flags,
1838		     struct dentry *dentry)
1839{
1840	struct path path;
1841	struct inode *inode;
1842	int err = handle_mounts(nd, dentry, &path);
1843
1844	if (err < 0)
1845		return ERR_PTR(err);
1846	inode = path.dentry->d_inode;
1847	if (likely(!d_is_symlink(path.dentry)) ||
1848	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
1849	   (flags & WALK_NOFOLLOW)) {
1850		/* not a symlink or should not follow */
1851		if (nd->flags & LOOKUP_RCU) {
1852			if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1853				return ERR_PTR(-ECHILD);
1854			if (unlikely(!inode))
1855				return ERR_PTR(-ENOENT);
1856		} else {
1857			dput(nd->path.dentry);
1858			if (nd->path.mnt != path.mnt)
1859				mntput(nd->path.mnt);
1860		}
1861		nd->path = path;
1862		nd->inode = inode;
1863		nd->seq = nd->next_seq;
1864		return NULL;
1865	}
1866	if (nd->flags & LOOKUP_RCU) {
1867		/* make sure that d_is_symlink above matches inode */
1868		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1869			return ERR_PTR(-ECHILD);
1870	} else {
1871		if (path.mnt == nd->path.mnt)
1872			mntget(path.mnt);
1873	}
1874	return pick_link(nd, &path, inode, flags);
1875}
1876
/*
 * RCU-mode ".." handling: find the parent to step into, crossing to
 * the mountpoint's parent when we sit on a mount root.  Samples
 * nd->next_seq for the returned dentry.  -ECHILD means "retry in
 * ref-walk mode" (also used where computing the right error would
 * need to block).
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		/* on a mount root: hop to the mountpoint in the parent mount */
		struct path path;
		unsigned seq;
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	/* ".." at the root stays at the root */
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}
1916
/*
 * Ref-walk counterpart of follow_dotdot_rcu(): returns a counted
 * reference to the parent to step into, crossing to the mountpoint's
 * parent mount when we sit on a mount root.
 */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		/* on a mount root: hop to the mountpoint in the parent mount */
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
		path_put(&nd->path);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
	}
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	return parent;

in_root:
	/* ".." at the root stays at the root */
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	return dget(nd->path.dentry);
}
1948
/*
 * Handle "." and ".." components.  "." is a no-op; ".." walks to the
 * parent via follow_dotdot{,_rcu}() and steps into it without
 * following links.  Returns NULL on success or an ERR_PTR.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		/* ".." needs a root to stop at; establish one if unset */
		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}
1986
/*
 * Walk one component of the pathname (nd->last): dcache fast path
 * first, filesystem lookup on a miss, then step_into().  Returns NULL
 * on success, a symlink body to walk, or an ERR_PTR.
 */
static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (!(flags & WALK_MORE) && nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		/* cache miss - go to the filesystem */
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	/* done with the link whose body we were walking, unless told otherwise */
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, dentry);
}
2012
2013/*
2014 * We can do the critical dentry name comparison and hashing
2015 * operations one word at a time, but we are limited to:
2016 *
2017 * - Architectures with fast unaligned word accesses. We could
2018 *   do a "get_unaligned()" if this helps and is sufficiently
2019 *   fast.
2020 *
2021 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
2022 *   do not trap on the (extremely unlikely) case of a page
2023 *   crossing operation.
2024 *
2025 * - Furthermore, we need an efficient 64-bit compile for the
2026 *   64-bit case in order to generate the "number of bytes in
2027 *   the final mask". Again, that could be replaced with a
2028 *   efficient population count instruction or similar.
2029 */
2030#ifdef CONFIG_DCACHE_WORD_ACCESS
2031
2032#include <asm/word-at-a-time.h>
2033
2034#ifdef HASH_MIX
2035
2036/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2037
2038#elif defined(CONFIG_64BIT)
2039/*
2040 * Register pressure in the mixing function is an issue, particularly
2041 * on 32-bit x86, but almost any function requires one state value and
2042 * one temporary.  Instead, use a function designed for two state values
2043 * and no temporaries.
2044 *
2045 * This function cannot create a collision in only two iterations, so
2046 * we have two iterations to achieve avalanche.  In those two iterations,
2047 * we have six layers of mixing, which is enough to spread one bit's
2048 * influence out to 2^6 = 64 state bits.
2049 *
2050 * Rotate constants are scored by considering either 64 one-bit input
2051 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
2052 * probability of that delta causing a change to each of the 128 output
2053 * bits, using a sample of random initial states.
2054 *
2055 * The Shannon entropy of the computed probabilities is then summed
2056 * to produce a score.  Ideally, any input change has a 50% chance of
2057 * toggling any given output bit.
2058 *
2059 * Mixing scores (in bits) for (12,45):
2060 * Input delta: 1-bit      2-bit
2061 * 1 round:     713.3    42542.6
2062 * 2 rounds:   2753.7   140389.8
2063 * 3 rounds:   5954.1   233458.2
2064 * 4 rounds:   7862.6   256672.2
2065 * Perfect:    8192     258048
2066 *            (64*128) (64*63/2 * 128)
2067 */
/* One mixing round: fold word 'a' into state (x,y) — two rotates, one multiply */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2073
2074/*
2075 * Fold two longs into one 32-bit hash value.  This must be fast, but
2076 * latency isn't quite as critical, as there is a fair bit of additional
2077 * work done before the hash value is used.
2078 */
2079static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2080{
2081	y ^= x * GOLDEN_RATIO_64;
2082	y *= GOLDEN_RATIO_64;
2083	return y >> 32;
2084}
2085
2086#else	/* 32-bit case */
2087
2088/*
2089 * Mixing scores (in bits) for (7,20):
2090 * Input delta: 1-bit      2-bit
2091 * 1 round:     330.3     9201.6
2092 * 2 rounds:   1246.4    25475.4
2093 * 3 rounds:   1907.1    31295.1
2094 * 4 rounds:   2042.3    31718.6
2095 * Perfect:    2048      31744
2096 *            (32*64)   (32*31/2 * 64)
2097 */
/* One mixing round: fold word 'a' into state (x,y), 32-bit rotate constants (7,20) */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2103
/* Fold the two state words to 32 bits; __hash_32() may be arch-optimized. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	unsigned int mixed = __hash_32(x) ^ y;

	return __hash_32(mixed);
}
2109
2110#endif
2111
2112/*
 * Return the hash of a string of known length.  This is carefully
2114 * designed to match hash_name(), which is the more critical function.
2115 * In particular, we must end by hashing a final word containing 0..7
2116 * payload bytes, to match the way that hash_name() iterates until it
2117 * finds the delimiter after the name.
2118 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		/* may read past 'name+len'; load_unaligned_zeropad() tolerates that */
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* final partial word: keep only the 'len' payload bytes */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2138
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	len = 0;
	goto inside;

	/* mix whole words until one contains the terminating NUL */
	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	/* keep only the payload bytes that precede the NUL */
	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
2163
2164/*
2165 * Calculate the length and hash of the path component, and
2166 * return the "hash_len" as the result.
2167 */
2168static inline u64 hash_name(const void *salt, const char *name)
2169{
2170	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
2171	unsigned long adata, bdata, mask, len;
2172	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2173
2174	len = 0;
2175	goto inside;
2176
2177	do {
2178		HASH_MIX(x, y, a);
2179		len += sizeof(unsigned long);
2180inside:
2181		a = load_unaligned_zeropad(name+len);
2182		b = a ^ REPEAT_BYTE('/');
2183	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
2184
2185	adata = prep_zero_mask(a, adata, &constants);
2186	bdata = prep_zero_mask(b, bdata, &constants);
2187	mask = create_zero_mask(adata | bdata);
2188	x ^= a & zero_bytemask(mask);
2189
2190	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2191}
2192
2193#else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2194
/* Return the hash of a string of known length */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	/* feed the component into the hash one byte at a time */
	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2204
2205/* Return the "hash_len" (hash and length) of a null-terminated string */
2206u64 hashlen_string(const void *salt, const char *name)
2207{
2208	unsigned long hash = init_name_hash(salt);
2209	unsigned long len = 0, c;
2210
2211	c = (unsigned char)*name;
2212	while (c) {
2213		len++;
2214		hash = partial_name_hash(c, hash);
2215		c = (unsigned char)name[len];
2216	}
2217	return hashlen_create(end_name_hash(hash), len);
2218}
2219EXPORT_SYMBOL(hashlen_string);
2220
2221/*
2222 * We know there's a real path component here of at least
2223 * one character.
2224 */
2225static inline u64 hash_name(const void *salt, const char *name)
2226{
2227	unsigned long hash = init_name_hash(salt);
2228	unsigned long len = 0, c;
2229
2230	c = (unsigned char)*name;
2231	do {
2232		len++;
2233		hash = partial_name_hash(c, hash);
2234		c = (unsigned char)name[len];
2235	} while (c && c != '/');
2236	return hashlen_create(end_name_hash(hash), len);
2237}
2238
2239#endif
2240
2241/*
2242 * Name resolution.
2243 * This is the basic name resolution function, turning a pathname into
2244 * the final dentry. We expect 'base' to be positive and a directory.
2245 *
2246 * Returns 0 and nd will have valid dentry and mnt on success.
2247 * Returns error and drops reference to input namei data on failure.
2248 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	int depth = 0; // depth <= nd->depth
	int err;

	nd->last_type = LAST_ROOT;
	nd->flags |= LOOKUP_PARENT;
	if (IS_ERR(name))
		return PTR_ERR(name);
	/* skip leading slashes; an empty remainder means "the root itself" */
	while (*name=='/')
		name++;
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
		return 0;
	}

	/* At this point we know we have a real path component. */
	for(;;) {
		struct mnt_idmap *idmap;
		const char *link;
		u64 hash_len;
		int type;

		/* may we search the current directory? */
		idmap = mnt_idmap(nd->path.mnt);
		err = may_lookup(idmap, nd);
		if (err)
			return err;

		/* length and hash of the next component, computed in one pass */
		hash_len = hash_name(nd->path.dentry, name);

		/* classify "." and ".." by length, without a string compare */
		type = LAST_NORM;
		if (name[0] == '.') switch (hashlen_len(hash_len)) {
			case 2:
				if (name[1] == '.') {
					type = LAST_DOTDOT;
					nd->state |= ND_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->state &= ~ND_JUMPED;
			/* the filesystem may want to supply its own hash */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				struct qstr this = { { .hash_len = hash_len }, .name = name };
				err = parent->d_op->d_hash(parent, &this);
				if (err < 0)
					return err;
				hash_len = this.hash_len;
				name = this.name;
			}
		}

		/* record the component for walk_component() and our caller */
		nd->last.hash_len = hash_len;
		nd->last.name = name;
		nd->last_type = type;

		name += hashlen_len(hash_len);
		if (!*name)
			goto OK;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			name++;
		} while (unlikely(*name == '/'));
		if (unlikely(!*name)) {
OK:
			/* pathname or trailing symlink, done */
			if (!depth) {
				/* remember parent's owner/mode for the
				 * 'hardening' checks (see dir_mode above) */
				nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
				nd->dir_mode = nd->inode->i_mode;
				nd->flags &= ~LOOKUP_PARENT;
				return 0;
			}
			/* last component of nested symlink */
			name = nd->stack[--depth].name;
			link = walk_component(nd, 0);
		} else {
			/* not the last component */
			link = walk_component(nd, WALK_MORE);
		}
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
			nd->stack[depth++].name = name;
			name = link;
			continue;
		}
		/* every non-final component must be a searchable directory */
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (!try_to_unlazy(nd))
					return -ECHILD;
			}
			return -ENOTDIR;
		}
	}
}
2350
2351/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
	int error;
	const char *s = nd->name->name;

	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
		return ERR_PTR(-EAGAIN);

	/* an empty path walks no components; RCU mode buys nothing */
	if (!*s)
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;

	/* snapshot mount/rename seqcounts so later changes can be detected */
	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

	/* caller pre-set nd->root: start the walk from there */
	if (nd->state & ND_ROOT_PRESET) {
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->path);
		}
		return s;
	}

	nd->root.mnt = NULL;

	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			/* seqcount loop: consistent pwd without taking refs */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->inode = nd->path.dentry->d_inode;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		struct fd f = fdget_raw(nd->dfd);
		struct dentry *dentry;

		if (!f.file)
			return ERR_PTR(-EBADF);

		dentry = f.file->f_path.dentry;

		/* a non-empty path must start from a directory */
		if (*s && unlikely(!d_can_lookup(dentry))) {
			fdput(f);
			return ERR_PTR(-ENOTDIR);
		}

		nd->path = f.file->f_path;
		if (flags & LOOKUP_RCU) {
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
		fdput(f);
	}

	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->state |= ND_ROOT_GRABBED;
		}
	}
	return s;
}
2455
2456static inline const char *lookup_last(struct nameidata *nd)
2457{
2458	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2459		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2460
2461	return walk_component(nd, WALK_TRAILING);
2462}
2463
2464static int handle_lookup_down(struct nameidata *nd)
2465{
2466	if (!(nd->flags & LOOKUP_RCU))
2467		dget(nd->path.dentry);
2468	nd->next_seq = nd->seq;
2469	return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
2470}
2471
2472/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	/* LOOKUP_DOWN: descend through whatever is mounted on the start point */
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
		err = handle_lookup_down(nd);
		if (unlikely(err < 0))
			s = ERR_PTR(err);
	}

	/* walk the components; loop again whenever the tail was a symlink */
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
	}
	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
			err = -ENOTDIR;
	if (!err) {
		/* transfer our references on the result to the caller */
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2505
2506int filename_lookup(int dfd, struct filename *name, unsigned flags,
2507		    struct path *path, struct path *root)
2508{
2509	int retval;
2510	struct nameidata nd;
2511	if (IS_ERR(name))
2512		return PTR_ERR(name);
2513	set_nameidata(&nd, dfd, name, root);
2514	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
2515	if (unlikely(retval == -ECHILD))
2516		retval = path_lookupat(&nd, flags, path);
2517	if (unlikely(retval == -ESTALE))
2518		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
2519
2520	if (likely(!retval))
2521		audit_inode(name, path->dentry,
2522			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
2523	restore_nameidata();
2524	return retval;
2525}
2526
2527/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
2528static int path_parentat(struct nameidata *nd, unsigned flags,
2529				struct path *parent)
2530{
2531	const char *s = path_init(nd, flags);
2532	int err = link_path_walk(s, nd);
2533	if (!err)
2534		err = complete_walk(nd);
2535	if (!err) {
2536		*parent = nd->path;
2537		nd->path.mnt = NULL;
2538		nd->path.dentry = NULL;
2539	}
2540	terminate_walk(nd);
2541	return err;
2542}
2543
2544/* Note: this does not consume "name" */
2545static int __filename_parentat(int dfd, struct filename *name,
2546			       unsigned int flags, struct path *parent,
2547			       struct qstr *last, int *type,
2548			       const struct path *root)
2549{
2550	int retval;
2551	struct nameidata nd;
2552
2553	if (IS_ERR(name))
2554		return PTR_ERR(name);
2555	set_nameidata(&nd, dfd, name, root);
2556	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
2557	if (unlikely(retval == -ECHILD))
2558		retval = path_parentat(&nd, flags, parent);
2559	if (unlikely(retval == -ESTALE))
2560		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
2561	if (likely(!retval)) {
2562		*last = nd.last;
2563		*type = nd.last_type;
2564		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
2565	}
2566	restore_nameidata();
2567	return retval;
2568}
2569
/* As __filename_parentat(), but with no pre-set lookup root. */
static int filename_parentat(int dfd, struct filename *name,
			     unsigned int flags, struct path *parent,
			     struct qstr *last, int *type)
{
	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
2576
2577/* does lookup, returns the object with parent locked */
2578static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
2579{
2580	struct dentry *d;
2581	struct qstr last;
2582	int type, error;
2583
2584	error = filename_parentat(dfd, name, 0, path, &last, &type);
2585	if (error)
2586		return ERR_PTR(error);
2587	if (unlikely(type != LAST_NORM)) {
2588		path_put(path);
2589		return ERR_PTR(-EINVAL);
2590	}
2591	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
2592	d = lookup_one_qstr_excl(&last, path->dentry, 0);
2593	if (IS_ERR(d)) {
2594		inode_unlock(path->dentry->d_inode);
2595		path_put(path);
2596	}
2597	return d;
2598}
2599
2600struct dentry *kern_path_locked(const char *name, struct path *path)
2601{
2602	struct filename *filename = getname_kernel(name);
2603	struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
2604
2605	putname(filename);
2606	return res;
2607}
2608
2609struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
2610{
2611	struct filename *filename = getname(name);
2612	struct dentry *res = __kern_path_locked(dfd, filename, path);
2613
2614	putname(filename);
2615	return res;
2616}
2617EXPORT_SYMBOL(user_path_locked_at);
2618
2619int kern_path(const char *name, unsigned int flags, struct path *path)
2620{
2621	struct filename *filename = getname_kernel(name);
2622	int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
2623
2624	putname(filename);
2625	return ret;
2626
2627}
2628EXPORT_SYMBOL(kern_path);
2629
2630/**
2631 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
2632 * @filename: filename structure
2633 * @flags: lookup flags
2634 * @parent: pointer to struct path to fill
2635 * @last: last component
2636 * @type: type of the last component
2637 * @root: pointer to struct path of the base directory
2638 */
2639int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
2640			   struct path *parent, struct qstr *last, int *type,
2641			   const struct path *root)
2642{
2643	return  __filename_parentat(AT_FDCWD, filename, flags, parent, last,
2644				    type, root);
2645}
2646EXPORT_SYMBOL(vfs_path_parent_lookup);
2647
2648/**
2649 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2650 * @dentry:  pointer to dentry of the base directory
2651 * @mnt: pointer to vfs mount of the base directory
2652 * @name: pointer to file name
2653 * @flags: lookup flags
2654 * @path: pointer to struct path to fill
2655 */
2656int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2657		    const char *name, unsigned int flags,
2658		    struct path *path)
2659{
2660	struct filename *filename;
2661	struct path root = {.mnt = mnt, .dentry = dentry};
2662	int ret;
2663
2664	filename = getname_kernel(name);
2665	/* the first argument of filename_lookup() is ignored with root */
2666	ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
2667	putname(filename);
2668	return ret;
2669}
2670EXPORT_SYMBOL(vfs_path_lookup);
2671
2672static int lookup_one_common(struct mnt_idmap *idmap,
2673			     const char *name, struct dentry *base, int len,
2674			     struct qstr *this)
2675{
2676	this->name = name;
2677	this->len = len;
2678	this->hash = full_name_hash(base, name, len);
2679	if (!len)
2680		return -EACCES;
2681
2682	if (is_dot_dotdot(name, len))
2683		return -EACCES;
2684
2685	while (len--) {
2686		unsigned int c = *(const unsigned char *)name++;
2687		if (c == '/' || c == '\0')
2688			return -EACCES;
2689	}
2690	/*
2691	 * See if the low-level filesystem might want
2692	 * to use its own hash..
2693	 */
2694	if (base->d_flags & DCACHE_OP_HASH) {
2695		int err = base->d_op->d_hash(base, this);
2696		if (err < 0)
2697			return err;
2698	}
2699
2700	return inode_permission(idmap, base->d_inode, MAY_EXEC);
2701}
2702
2703/**
2704 * try_lookup_one_len - filesystem helper to lookup single pathname component
2705 * @name:	pathname component to lookup
2706 * @base:	base directory to lookup from
2707 * @len:	maximum length @len should be interpreted to
2708 *
2709 * Look up a dentry by name in the dcache, returning NULL if it does not
2710 * currently exist.  The function does not try to create a dentry.
2711 *
2712 * Note that this routine is purely a helper for filesystem usage and should
2713 * not be called by generic code.
2714 *
2715 * The caller must hold base->i_mutex.
2716 */
2717struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
2718{
2719	struct qstr this;
2720	int err;
2721
2722	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2723
2724	err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
2725	if (err)
2726		return ERR_PTR(err);
2727
2728	return lookup_dcache(&this, base, 0);
2729}
2730EXPORT_SYMBOL(try_lookup_one_len);
2731
2732/**
2733 * lookup_one_len - filesystem helper to lookup single pathname component
2734 * @name:	pathname component to lookup
2735 * @base:	base directory to lookup from
2736 * @len:	maximum length @len should be interpreted to
2737 *
2738 * Note that this routine is purely a helper for filesystem usage and should
2739 * not be called by generic code.
2740 *
2741 * The caller must hold base->i_mutex.
2742 */
2743struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2744{
2745	struct dentry *dentry;
2746	struct qstr this;
2747	int err;
2748
2749	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2750
2751	err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
2752	if (err)
2753		return ERR_PTR(err);
2754
2755	dentry = lookup_dcache(&this, base, 0);
2756	return dentry ? dentry : __lookup_slow(&this, base, 0);
2757}
2758EXPORT_SYMBOL(lookup_one_len);
2759
2760/**
2761 * lookup_one - filesystem helper to lookup single pathname component
2762 * @idmap:	idmap of the mount the lookup is performed from
2763 * @name:	pathname component to lookup
2764 * @base:	base directory to lookup from
2765 * @len:	maximum length @len should be interpreted to
2766 *
2767 * Note that this routine is purely a helper for filesystem usage and should
2768 * not be called by generic code.
2769 *
2770 * The caller must hold base->i_mutex.
2771 */
2772struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
2773			  struct dentry *base, int len)
2774{
2775	struct dentry *dentry;
2776	struct qstr this;
2777	int err;
2778
2779	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2780
2781	err = lookup_one_common(idmap, name, base, len, &this);
2782	if (err)
2783		return ERR_PTR(err);
2784
2785	dentry = lookup_dcache(&this, base, 0);
2786	return dentry ? dentry : __lookup_slow(&this, base, 0);
2787}
2788EXPORT_SYMBOL(lookup_one);
2789
2790/**
2791 * lookup_one_unlocked - filesystem helper to lookup single pathname component
2792 * @idmap:	idmap of the mount the lookup is performed from
2793 * @name:	pathname component to lookup
2794 * @base:	base directory to lookup from
2795 * @len:	maximum length @len should be interpreted to
2796 *
2797 * Note that this routine is purely a helper for filesystem usage and should
2798 * not be called by generic code.
2799 *
2800 * Unlike lookup_one_len, it should be called without the parent
2801 * i_mutex held, and will take the i_mutex itself if necessary.
2802 */
2803struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
2804				   const char *name, struct dentry *base,
2805				   int len)
2806{
2807	struct qstr this;
2808	int err;
2809	struct dentry *ret;
2810
2811	err = lookup_one_common(idmap, name, base, len, &this);
2812	if (err)
2813		return ERR_PTR(err);
2814
2815	ret = lookup_dcache(&this, base, 0);
2816	if (!ret)
2817		ret = lookup_slow(&this, base, 0);
2818	return ret;
2819}
2820EXPORT_SYMBOL(lookup_one_unlocked);
2821
2822/**
2823 * lookup_one_positive_unlocked - filesystem helper to lookup single
2824 *				  pathname component
2825 * @idmap:	idmap of the mount the lookup is performed from
2826 * @name:	pathname component to lookup
2827 * @base:	base directory to lookup from
2828 * @len:	maximum length @len should be interpreted to
2829 *
2830 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
2831 * known positive or ERR_PTR(). This is what most of the users want.
2832 *
2833 * Note that pinned negative with unlocked parent _can_ become positive at any
2834 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
2835 * positives have >d_inode stable, so this one avoids such problems.
2836 *
2837 * Note that this routine is purely a helper for filesystem usage and should
2838 * not be called by generic code.
2839 *
2840 * The helper should be called without i_mutex held.
2841 */
2842struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
2843					    const char *name,
2844					    struct dentry *base, int len)
2845{
2846	struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);
2847
2848	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2849		dput(ret);
2850		ret = ERR_PTR(-ENOENT);
2851	}
2852	return ret;
2853}
2854EXPORT_SYMBOL(lookup_one_positive_unlocked);
2855
2856/**
2857 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2858 * @name:	pathname component to lookup
2859 * @base:	base directory to lookup from
2860 * @len:	maximum length @len should be interpreted to
2861 *
2862 * Note that this routine is purely a helper for filesystem usage and should
2863 * not be called by generic code.
2864 *
2865 * Unlike lookup_one_len, it should be called without the parent
2866 * i_mutex held, and will take the i_mutex itself if necessary.
2867 */
2868struct dentry *lookup_one_len_unlocked(const char *name,
2869				       struct dentry *base, int len)
2870{
2871	return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
2872}
2873EXPORT_SYMBOL(lookup_one_len_unlocked);
2874
2875/*
2876 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2877 * on negatives.  Returns known positive or ERR_PTR(); that's what
2878 * most of the users want.  Note that pinned negative with unlocked parent
2879 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2880 * need to be very careful; pinned positives have ->d_inode stable, so
2881 * this one avoids such problems.
2882 */
2883struct dentry *lookup_positive_unlocked(const char *name,
2884				       struct dentry *base, int len)
2885{
2886	return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
2887}
2888EXPORT_SYMBOL(lookup_positive_unlocked);
2889
2890#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	/* the parent must still be reachable within this mount */
	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/* swap path's dentry reference for the parent's */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	/* NOTE(review): on failure *path still points at the parent; the
	 * caller's path_put() presumably drops that reference — confirm. */
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
2915#endif
2916
2917int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2918		 struct path *path, int *empty)
2919{
2920	struct filename *filename = getname_flags(name, flags, empty);
2921	int ret = filename_lookup(dfd, filename, flags, path, NULL);
2922
2923	putname(filename);
2924	return ret;
2925}
2926EXPORT_SYMBOL(user_path_at_empty);
2927
2928int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
2929		   struct inode *inode)
2930{
2931	kuid_t fsuid = current_fsuid();
2932
2933	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid))
2934		return 0;
2935	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
2936		return 0;
2937	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
2938}
2939EXPORT_SYMBOL(__check_sticky);
2940
2941/*
2942 *	Check whether we can remove a link victim from directory dir, check
2943 *  whether the type of victim is right.
2944 *  1. We can't do it if dir is read-only (done in permission())
2945 *  2. We should have write and exec permissions on dir
2946 *  3. We can't remove anything from append-only dir
2947 *  4. We can't do anything with immutable dir (done in permission())
2948 *  5. If the sticky bit on dir is set we should either
2949 *	a. be owner of dir, or
2950 *	b. be owner of victim, or
2951 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
2953 *     links pointing to it.
2954 *  7. If the victim has an unknown uid or gid we can't change the inode.
2955 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2956 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2957 * 10. We can't remove a root or mountpoint.
2958 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2959 *     nfs_async_unlink().
2960 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	BUG_ON(victim->d_parent->d_inode != dir);

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	/* rule 2: write+exec permission on the parent directory */
	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	/* rule 3: nothing can be removed from an append-only directory */
	if (IS_APPEND(dir))
		return -EPERM;

	/* rules 5-7: sticky bit, victim append-only/immutable, unmapped ids */
	if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	/* rules 8-10: directory-ness must match what the caller asked for */
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	/* rule 11: NFS sillyrenamed files are removed by nfs_async_unlink() */
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}
3003
3004/*	Check whether we can create an object with dentry child in directory
3005 *  dir.
3006 *  1. We can't do it if child already exists (open has special treatment for
3007 *     this case, but since we are inlined it's OK)
3008 *  2. We can't do it if dir is read-only (done in permission())
3009 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
3010 *  4. We should have write and exec permissions on dir
3011 *  5. We can't do it if dir is immutable (done in permission())
3012 */
static inline int may_create(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *child)
{
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
	/* rule 1: target must not already exist */
	if (child->d_inode)
		return -EEXIST;
	/* creating in a removed (dead) directory is pointless */
	if (IS_DEADDIR(dir))
		return -ENOENT;
	/* rule 3: fs must be able to represent the current fsuid/fsgid */
	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		return -EOVERFLOW;

	/* rule 4: write+exec permission on the parent directory */
	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
3026
// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p = p1, *q = p2, *r;

	/* walk up from p1 looking for p2, stopping at the component root */
	while ((r = p->d_parent) != p2 && r != p)
		p = r;
	if (r == p2) {
		// p is a child of p2 and an ancestor of p1 or p1 itself
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
		return p;
	}
	// p is the root of connected component that contains p1
	// p2 does not occur on the path from p to p1
	/* walk up from p2 looking for p1 or p (the shared component root) */
	while ((r = q->d_parent) != p1 && r != p && r != q)
		q = r;
	if (r == p1) {
		// q is a child of p1 and an ancestor of p2 or p2 itself
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return q;
	} else if (likely(r == p)) {
		// both p2 and p1 are descendents of p
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return NULL;
	} else { // no common ancestor at the time we'd been called
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
		return ERR_PTR(-EXDEV);
	}
}
3059
3060/*
3061 * p1 and p2 should be directories on the same fs.
3062 */
3063struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
3064{
3065	if (p1 == p2) {
3066		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
3067		return NULL;
3068	}
3069
3070	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3071	return lock_two_directories(p1, p2);
3072}
3073EXPORT_SYMBOL(lock_rename);
3074
3075/*
3076 * c1 and p2 should be on the same fs.
3077 */
3078struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
3079{
3080	if (READ_ONCE(c1->d_parent) == p2) {
3081		/*
3082		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
3083		 */
3084		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
3085		/*
3086		 * now that p2 is locked, nobody can move in or out of it,
3087		 * so the test below is safe.
3088		 */
3089		if (likely(c1->d_parent == p2))
3090			return NULL;
3091
3092		/*
3093		 * c1 got moved out of p2 while we'd been taking locks;
3094		 * unlock and fall back to slow case.
3095		 */
3096		inode_unlock(p2->d_inode);
3097	}
3098
3099	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
3100	/*
3101	 * nobody can move out of any directories on this fs.
3102	 */
3103	if (likely(c1->d_parent != p2))
3104		return lock_two_directories(c1->d_parent, p2);
3105
3106	/*
3107	 * c1 got moved into p2 while we were taking locks;
3108	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
3109	 * for consistency with lock_rename().
3110	 */
3111	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
3112	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
3113	return NULL;
3114}
3115EXPORT_SYMBOL(lock_rename_child);
3116
/*
 * Undo lock_rename()/lock_rename_child(): unlock the parent(s) and, when
 * two distinct directories were involved, drop ->s_vfs_rename_mutex too.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
	inode_unlock(p1->d_inode);
	if (p1 != p2) {
		inode_unlock(p2->d_inode);
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
	}
}
EXPORT_SYMBOL(unlock_rename);
3126
3127/**
3128 * vfs_prepare_mode - prepare the mode to be used for a new inode
3129 * @idmap:	idmap of the mount the inode was found from
3130 * @dir:	parent directory of the new inode
3131 * @mode:	mode of the new inode
3132 * @mask_perms:	allowed permission by the vfs
3133 * @type:	type of file to be created
3134 *
3135 * This helper consolidates and enforces vfs restrictions on the @mode of a new
3136 * object to be created.
3137 *
3138 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
3139 * the kernel documentation for mode_strip_umask()). Moving umask stripping
3140 * after setgid stripping allows the same ordering for both non-POSIX ACL and
3141 * POSIX ACL supporting filesystems.
3142 *
3143 * Note that it's currently valid for @type to be 0 if a directory is created.
3144 * Filesystems raise that flag individually and we need to check whether each
3145 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
3146 * non-zero type.
3147 *
3148 * Returns: mode to be passed to the filesystem
3149 */
static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
				       const struct inode *dir, umode_t mode,
				       umode_t mask_perms, umode_t type)
{
	/*
	 * Strip the setgid bit before the umask; the kernel-doc above
	 * explains why this ordering works for both POSIX ACL and
	 * non-POSIX ACL filesystems.
	 */
	mode = mode_strip_sgid(idmap, dir, mode);
	mode = mode_strip_umask(dir, mode);

	/*
	 * Apply the vfs mandated allowed permission mask and set the type of
	 * file to be created before we call into the filesystem.
	 */
	mode &= (mask_perms & ~S_IFMT);
	mode |= (type & S_IFMT);

	return mode;
}
3166
3167/**
3168 * vfs_create - create new file
3169 * @idmap:	idmap of the mount the inode was found from
3170 * @dir:	inode of @dentry
3171 * @dentry:	pointer to dentry of the base directory
3172 * @mode:	mode of the new file
3173 * @want_excl:	whether the file must not yet exist
3174 *
3175 * Create a new file.
3176 *
3177 * If the inode has been found through an idmapped mount the idmap of
3178 * the vfsmount must be passed through @idmap. This function will then take
3179 * care to map the inode according to @idmap before checking permissions.
3180 * On non-idmapped mounts or if permission checking is to be performed on the
3181 * raw inode simply pass @nop_mnt_idmap.
3182 */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
	       struct dentry *dentry, umode_t mode, bool want_excl)
{
	int error;

	error = may_create(idmap, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */

	/* Force a regular file; only permission bits of @mode survive. */
	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
3205
/*
 * vfs_mkobj - create an object via a caller-supplied callback
 *
 * Like vfs_create(), but instead of ->create() the object is made by
 * @f(dentry, mode, arg).  The parent directory is taken from @dentry and
 * permission checking uses the no-op idmap.  The mode is forced to a
 * regular file (S_IFREG) with only the permission bits of @mode kept.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&nop_mnt_idmap, dir, dentry);
	if (error)
		return error;

	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);
3226
3227bool may_open_dev(const struct path *path)
3228{
3229	return !(path->mnt->mnt_flags & MNT_NODEV) &&
3230		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
3231}
3232
/*
 * may_open - check whether the object at @path may be opened
 * @acc_mode: MAY_* access being requested
 * @flag: open(2) flags (O_TRUNC/O_APPEND/O_NOATIME/accmode are examined)
 *
 * Performs the per-filetype restrictions, then the regular permission
 * check, then the append-only and O_NOATIME restrictions.
 */
static int may_open(struct mnt_idmap *idmap, const struct path *path,
		    int acc_mode, int flag)
{
	struct dentry *dentry = path->dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		/* symlinks are never opened directly */
		return -ELOOP;
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		break;
	case S_IFBLK:
	case S_IFCHR:
		if (!may_open_dev(path))
			return -EACCES;
		fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		/* O_TRUNC is meaningless for devices, fifos and sockets */
		flag &= ~O_TRUNC;
		break;
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
	}

	error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
	if (error)
		return error;

	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
		return -EPERM;

	return 0;
}
3289
/*
 * handle_truncate - truncate @filp to length 0 for an O_TRUNC open
 *
 * Takes (and drops) write access on the inode around the security check
 * and do_truncate(); ATTR_OPEN tells setattr this is an open-time truncate.
 */
static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
	const struct path *path = &filp->f_path;
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;

	error = security_file_truncate(filp);
	if (!error) {
		error = do_truncate(idmap, path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
				    filp);
	}
	put_write_access(inode);
	return error;
}
3307
/*
 * Convert open(2) flags to the convention ->atomic_open() expects:
 * an access-mode field of 3 (O_WRONLY|O_RDWR both set) is folded down
 * to 2; all other flag combinations pass through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}
3314
/*
 * may_o_create - may the O_CREAT path create @dentry in @dir?
 *
 * Mirrors may_create() for the open(O_CREAT) path, with the addition of
 * the security_path_mknod() hook; the existence check is done by the
 * caller instead.
 */
static int may_o_create(struct mnt_idmap *idmap,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	/* Caller's fs{u,g}id must be representable on this filesystem. */
	if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
		return -EOVERFLOW;

	error = inode_permission(idmap, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}
3333
3334/*
3335 * Attempt to atomically look up, create and open a file from a negative
3336 * dentry.
3337 *
3338 * Returns 0 if successful.  The file will have been created and attached to
3339 * @file by the filesystem calling finish_open().
3340 *
3341 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
3342 * be set.  The caller will need to perform the open themselves.  @path will
3343 * have been updated to point to the new dentry.  This may be negative.
3344 *
3345 * Returns an error code otherwise.
3346 */
3347static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
3348				  struct file *file,
3349				  int open_flag, umode_t mode)
3350{
3351	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
3352	struct inode *dir =  nd->path.dentry->d_inode;
3353	int error;
3354
3355	if (nd->flags & LOOKUP_DIRECTORY)
3356		open_flag |= O_DIRECTORY;
3357
3358	file->f_path.dentry = DENTRY_NOT_SET;
3359	file->f_path.mnt = nd->path.mnt;
3360	error = dir->i_op->atomic_open(dir, dentry, file,
3361				       open_to_namei_flags(open_flag), mode);
3362	d_lookup_done(dentry);
3363	if (!error) {
3364		if (file->f_mode & FMODE_OPENED) {
3365			if (unlikely(dentry != file->f_path.dentry)) {
3366				dput(dentry);
3367				dentry = dget(file->f_path.dentry);
3368			}
3369		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3370			error = -EIO;
3371		} else {
3372			if (file->f_path.dentry) {
3373				dput(dentry);
3374				dentry = file->f_path.dentry;
3375			}
3376			if (unlikely(d_is_negative(dentry)))
3377				error = -ENOENT;
3378		}
3379	}
3380	if (error) {
3381		dput(dentry);
3382		dentry = ERR_PTR(error);
3383	}
3384	return dentry;
3385}
3386
3387/*
3388 * Look up and maybe create and open the last component.
3389 *
3390 * Must be called with parent locked (exclusive in O_CREAT case).
3391 *
3392 * Returns 0 on success, that is, if
3393 *  the file was successfully atomically created (if necessary) and opened, or
3394 *  the file was not completely opened at this time, though lookups and
3395 *  creations were performed.
3396 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3397 * In the latter case dentry returned in @path might be negative if O_CREAT
3398 * hadn't been specified.
3399 *
3400 * An error code is returned on failure.
3401 */
3402static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
3403				  const struct open_flags *op,
3404				  bool got_write)
3405{
3406	struct mnt_idmap *idmap;
3407	struct dentry *dir = nd->path.dentry;
3408	struct inode *dir_inode = dir->d_inode;
3409	int open_flag = op->open_flag;
3410	struct dentry *dentry;
3411	int error, create_error = 0;
3412	umode_t mode = op->mode;
3413	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3414
3415	if (unlikely(IS_DEADDIR(dir_inode)))
3416		return ERR_PTR(-ENOENT);
3417
3418	file->f_mode &= ~FMODE_CREATED;
3419	dentry = d_lookup(dir, &nd->last);
3420	for (;;) {
3421		if (!dentry) {
3422			dentry = d_alloc_parallel(dir, &nd->last, &wq);
3423			if (IS_ERR(dentry))
3424				return dentry;
3425		}
3426		if (d_in_lookup(dentry))
3427			break;
3428
3429		error = d_revalidate(dentry, nd->flags);
3430		if (likely(error > 0))
3431			break;
3432		if (error)
3433			goto out_dput;
3434		d_invalidate(dentry);
3435		dput(dentry);
3436		dentry = NULL;
3437	}
3438	if (dentry->d_inode) {
3439		/* Cached positive dentry: will open in f_op->open */
3440		return dentry;
3441	}
3442
3443	/*
3444	 * Checking write permission is tricky, bacuse we don't know if we are
3445	 * going to actually need it: O_CREAT opens should work as long as the
3446	 * file exists.  But checking existence breaks atomicity.  The trick is
3447	 * to check access and if not granted clear O_CREAT from the flags.
3448	 *
3449	 * Another problem is returing the "right" error value (e.g. for an
3450	 * O_EXCL open we want to return EEXIST not EROFS).
3451	 */
3452	if (unlikely(!got_write))
3453		open_flag &= ~O_TRUNC;
3454	idmap = mnt_idmap(nd->path.mnt);
3455	if (open_flag & O_CREAT) {
3456		if (open_flag & O_EXCL)
3457			open_flag &= ~O_TRUNC;
3458		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
3459		if (likely(got_write))
3460			create_error = may_o_create(idmap, &nd->path,
3461						    dentry, mode);
3462		else
3463			create_error = -EROFS;
3464	}
3465	if (create_error)
3466		open_flag &= ~O_CREAT;
3467	if (dir_inode->i_op->atomic_open) {
3468		dentry = atomic_open(nd, dentry, file, open_flag, mode);
3469		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
3470			dentry = ERR_PTR(create_error);
3471		return dentry;
3472	}
3473
3474	if (d_in_lookup(dentry)) {
3475		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3476							     nd->flags);
3477		d_lookup_done(dentry);
3478		if (unlikely(res)) {
3479			if (IS_ERR(res)) {
3480				error = PTR_ERR(res);
3481				goto out_dput;
3482			}
3483			dput(dentry);
3484			dentry = res;
3485		}
3486	}
3487
3488	/* Negative dentry, just create the file */
3489	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3490		file->f_mode |= FMODE_CREATED;
3491		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3492		if (!dir_inode->i_op->create) {
3493			error = -EACCES;
3494			goto out_dput;
3495		}
3496
3497		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
3498						mode, open_flag & O_EXCL);
3499		if (error)
3500			goto out_dput;
3501	}
3502	if (unlikely(create_error) && !dentry->d_inode) {
3503		error = create_error;
3504		goto out_dput;
3505	}
3506	return dentry;
3507
3508out_dput:
3509	dput(dentry);
3510	return ERR_PTR(error);
3511}
3512
/*
 * open_last_lookups - handle the last component of an open()
 *
 * Handles ".."/"." trailing components, the lockless fast path for
 * non-O_CREAT opens, and the locked lookup_open() path otherwise.
 * Returns NULL when the walk is finished, a symlink body to continue
 * walking, or an ERR_PTR on failure (same convention as step_into()).
 */
static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;

	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	if (!(open_flag & O_CREAT)) {
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		dentry = lookup_fast(nd);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
			goto finish_lookup;

		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
			return ERR_PTR(-ECHILD);
	} else {
		/* create side of things */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
		/* trailing slashes? */
		if (unlikely(nd->last.name[nd->last.len]))
			return ERR_PTR(-EISDIR);
	}

	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	/* O_CREAT needs the parent locked exclusive; plain lookup shared */
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	dentry = lookup_open(nd, file, op, got_write);
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}
3594
3595/*
3596 * Handle the last step of open()
3597 */
3598static int do_open(struct nameidata *nd,
3599		   struct file *file, const struct open_flags *op)
3600{
3601	struct mnt_idmap *idmap;
3602	int open_flag = op->open_flag;
3603	bool do_truncate;
3604	int acc_mode;
3605	int error;
3606
3607	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
3608		error = complete_walk(nd);
3609		if (error)
3610			return error;
3611	}
3612	if (!(file->f_mode & FMODE_CREATED))
3613		audit_inode(nd->name, nd->path.dentry, 0);
3614	idmap = mnt_idmap(nd->path.mnt);
3615	if (open_flag & O_CREAT) {
3616		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3617			return -EEXIST;
3618		if (d_is_dir(nd->path.dentry))
3619			return -EISDIR;
3620		error = may_create_in_sticky(idmap, nd,
3621					     d_backing_inode(nd->path.dentry));
3622		if (unlikely(error))
3623			return error;
3624	}
3625	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3626		return -ENOTDIR;
3627
3628	do_truncate = false;
3629	acc_mode = op->acc_mode;
3630	if (file->f_mode & FMODE_CREATED) {
3631		/* Don't check for write permission, don't truncate */
3632		open_flag &= ~O_TRUNC;
3633		acc_mode = 0;
3634	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3635		error = mnt_want_write(nd->path.mnt);
3636		if (error)
3637			return error;
3638		do_truncate = true;
3639	}
3640	error = may_open(idmap, &nd->path, acc_mode, open_flag);
3641	if (!error && !(file->f_mode & FMODE_OPENED))
3642		error = vfs_open(&nd->path, file);
3643	if (!error)
3644		error = security_file_post_open(file, op->acc_mode);
3645	if (!error && do_truncate)
3646		error = handle_truncate(idmap, file);
3647	if (unlikely(error > 0)) {
3648		WARN_ON(1);
3649		error = -EINVAL;
3650	}
3651	if (do_truncate)
3652		mnt_drop_write(nd->path.mnt);
3653	return error;
3654}
3655
3656/**
3657 * vfs_tmpfile - create tmpfile
3658 * @idmap:	idmap of the mount the inode was found from
3659 * @parentpath:	pointer to the path of the base directory
3660 * @file:	file descriptor of the new tmpfile
3661 * @mode:	mode of the new tmpfile
3662 *
3663 * Create a temporary file.
3664 *
3665 * If the inode has been found through an idmapped mount the idmap of
3666 * the vfsmount must be passed through @idmap. This function will then take
3667 * care to map the inode according to @idmap before checking permissions.
3668 * On non-idmapped mounts or if permission checking is to be performed on the
3669 * raw inode simply pass @nop_mnt_idmap.
3670 */
3671static int vfs_tmpfile(struct mnt_idmap *idmap,
3672		       const struct path *parentpath,
3673		       struct file *file, umode_t mode)
3674{
3675	struct dentry *child;
3676	struct inode *dir = d_inode(parentpath->dentry);
3677	struct inode *inode;
3678	int error;
3679	int open_flag = file->f_flags;
3680
3681	/* we want directory to be writable */
3682	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
3683	if (error)
3684		return error;
3685	if (!dir->i_op->tmpfile)
3686		return -EOPNOTSUPP;
3687	child = d_alloc(parentpath->dentry, &slash_name);
3688	if (unlikely(!child))
3689		return -ENOMEM;
3690	file->f_path.mnt = parentpath->mnt;
3691	file->f_path.dentry = child;
3692	mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
3693	error = dir->i_op->tmpfile(idmap, dir, file, mode);
3694	dput(child);
3695	if (error)
3696		return error;
3697	/* Don't check for other permissions, the inode was just created */
3698	error = may_open(idmap, &file->f_path, 0, file->f_flags);
3699	if (error)
3700		return error;
3701	inode = file_inode(file);
3702	if (!(open_flag & O_EXCL)) {
3703		spin_lock(&inode->i_lock);
3704		inode->i_state |= I_LINKABLE;
3705		spin_unlock(&inode->i_lock);
3706	}
3707	security_inode_post_create_tmpfile(idmap, inode);
3708	return 0;
3709}
3710
3711/**
3712 * kernel_tmpfile_open - open a tmpfile for kernel internal use
3713 * @idmap:	idmap of the mount the inode was found from
3714 * @parentpath:	path of the base directory
3715 * @mode:	mode of the new tmpfile
3716 * @open_flag:	flags
3717 * @cred:	credentials for open
3718 *
3719 * Create and open a temporary file.  The file is not accounted in nr_files,
3720 * hence this is only for kernel internal use, and must not be installed into
3721 * file tables or such.
3722 */
3723struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
3724				 const struct path *parentpath,
3725				 umode_t mode, int open_flag,
3726				 const struct cred *cred)
3727{
3728	struct file *file;
3729	int error;
3730
3731	file = alloc_empty_file_noaccount(open_flag, cred);
3732	if (IS_ERR(file))
3733		return file;
3734
3735	error = vfs_tmpfile(idmap, parentpath, file, mode);
3736	if (error) {
3737		fput(file);
3738		file = ERR_PTR(error);
3739	}
3740	return file;
3741}
3742EXPORT_SYMBOL(kernel_tmpfile_open);
3743
/*
 * do_tmpfile - O_TMPFILE branch of path_openat()
 *
 * Resolve the path to a directory, take write access on its mount and
 * create an unnamed temporary file there via vfs_tmpfile().
 */
static int do_tmpfile(struct nameidata *nd, unsigned flags,
		const struct open_flags *op,
		struct file *file)
{
	struct path path;
	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

	if (unlikely(error))
		return error;
	error = mnt_want_write(path.mnt);
	if (unlikely(error))
		goto out;
	error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
	if (error)
		goto out2;
	audit_inode(nd->name, file->f_path.dentry, 0);
out2:
	mnt_drop_write(path.mnt);
out:
	path_put(&path);
	return error;
}
3766
3767static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3768{
3769	struct path path;
3770	int error = path_lookupat(nd, flags, &path);
3771	if (!error) {
3772		audit_inode(nd->name, path.dentry, 0);
3773		error = vfs_open(&path, file);
3774		path_put(&path);
3775	}
3776	return error;
3777}
3778
/*
 * path_openat - core of open(): allocate a file and walk the path
 *
 * Dispatches to the O_TMPFILE and O_PATH special cases; otherwise walks
 * the path, repeating the last-component step while symlinks are returned
 * by open_last_lookups(), then finishes via do_open().  Returns the opened
 * file or an ERR_PTR; -EOPENSTALE is translated for the caller's retry
 * logic (-ECHILD in RCU mode, -ESTALE otherwise).
 */
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);
		/* keep walking as long as trailing symlinks are returned */
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);
		terminate_walk(nd);
	}
	if (likely(!error)) {
		if (likely(file->f_mode & FMODE_OPENED))
			return file;
		WARN_ON(1);
		error = -EINVAL;
	}
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
	}
	return ERR_PTR(error);
}
3817
/*
 * do_filp_open - open @pathname relative to @dfd
 *
 * Tries the lockless RCU walk first, retries in ref-walk mode on -ECHILD,
 * and retries once more with LOOKUP_REVAL on -ESTALE.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	int flags = op->lookup_flags;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(&nd, op, flags);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	return filp;
}
3834
/*
 * do_file_open_root - open a kernel-supplied @name relative to @root
 *
 * Same RCU/ref-walk/REVAL retry ladder as do_filp_open(), but the walk is
 * anchored at @root.  A symlink root is rejected for LOOKUP_OPEN intents.
 */
struct file *do_file_open_root(const struct path *root,
		const char *name, const struct open_flags *op)
{
	struct nameidata nd;
	struct file *file;
	struct filename *filename;
	int flags = op->lookup_flags;

	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
		return ERR_PTR(-ELOOP);

	filename = getname_kernel(name);
	if (IS_ERR(filename))
		return ERR_CAST(filename);

	set_nameidata(&nd, -1, filename, root);
	file = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(&nd, op, flags);
	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	putname(filename);
	return file;
}
3860
/*
 * filename_create - resolve @name and prepare its last component for creation
 *
 * On success, returns a negative dentry for the last component with the
 * parent directory locked and write access held on @path->mnt; the caller
 * must release all of that with done_path_create().  On failure, the path
 * has been put and an ERR_PTR is returned.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
				      struct path *path, unsigned int lookup_flags)
{
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct qstr last;
	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
	unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
	int type;
	int err2;
	int error;

	error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
	if (error)
		return ERR_PTR(error);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (unlikely(type != LAST_NORM))
		goto out;

	/* don't fail immediately if it's r/o, at least try to report other errors */
	err2 = mnt_want_write(path->mnt);
	/*
	 * Do the final lookup.  Suppress 'create' if there is a trailing
	 * '/', and a directory wasn't requested.
	 */
	if (last.name[last.len] && !want_dir)
		create_flags = 0;
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
	dentry = lookup_one_qstr_excl(&last, path->dentry,
				      reval_flag | create_flags);
	if (IS_ERR(dentry))
		goto unlock;

	error = -EEXIST;
	if (d_is_positive(dentry))
		goto fail;

	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
	if (unlikely(!create_flags)) {
		error = -ENOENT;
		goto fail;
	}
	/* only now report the deferred read-only error from above */
	if (unlikely(err2)) {
		error = err2;
		goto fail;
	}
	return dentry;
fail:
	dput(dentry);
	dentry = ERR_PTR(error);
unlock:
	inode_unlock(path->dentry->d_inode);
	if (!err2)
		mnt_drop_write(path->mnt);
out:
	path_put(path);
	return dentry;
}
3928
/*
 * kern_path_create - filename_create() for a kernel-space pathname.
 * See done_path_create() for releasing the result.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *name;
	struct dentry *dentry;

	name = getname_kernel(pathname);
	dentry = filename_create(dfd, name, path, lookup_flags);
	putname(name);
	return dentry;
}
EXPORT_SYMBOL(kern_path_create);
3939
/*
 * done_path_create - release everything acquired by filename_create()
 *
 * Drops the child dentry, unlocks the parent, drops mount write access
 * and puts the path — the reverse of the acquisition order.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
	dput(dentry);
	inode_unlock(path->dentry->d_inode);
	mnt_drop_write(path->mnt);
	path_put(path);
}
EXPORT_SYMBOL(done_path_create);
3948
3949inline struct dentry *user_path_create(int dfd, const char __user *pathname,
3950				struct path *path, unsigned int lookup_flags)
3951{
3952	struct filename *filename = getname(pathname);
3953	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
3954
3955	putname(filename);
3956	return res;
3957}
3958EXPORT_SYMBOL(user_path_create);
3959
3960/**
3961 * vfs_mknod - create device node or file
3962 * @idmap:	idmap of the mount the inode was found from
3963 * @dir:	inode of @dentry
3964 * @dentry:	pointer to dentry of the base directory
3965 * @mode:	mode of the new device node or file
3966 * @dev:	device number of device to create
3967 *
3968 * Create a device node or file.
3969 *
3970 * If the inode has been found through an idmapped mount the idmap of
3971 * the vfsmount must be passed through @idmap. This function will then take
3972 * care to map the inode according to @idmap before checking permissions.
3973 * On non-idmapped mounts or if permission checking is to be performed on the
3974 * raw inode simply pass @nop_mnt_idmap.
3975 */
3976int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
3977	      struct dentry *dentry, umode_t mode, dev_t dev)
3978{
3979	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
3980	int error = may_create(idmap, dir, dentry);
3981
3982	if (error)
3983		return error;
3984
3985	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
3986	    !capable(CAP_MKNOD))
3987		return -EPERM;
3988
3989	if (!dir->i_op->mknod)
3990		return -EPERM;
3991
3992	mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
3993	error = devcgroup_inode_mknod(mode, dev);
3994	if (error)
3995		return error;
3996
3997	error = security_inode_mknod(dir, dentry, mode, dev);
3998	if (error)
3999		return error;
4000
4001	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4002	if (!error)
4003		fsnotify_create(dir, dentry);
4004	return error;
4005}
4006EXPORT_SYMBOL(vfs_mknod);
4007
4008static int may_mknod(umode_t mode)
4009{
4010	switch (mode & S_IFMT) {
4011	case S_IFREG:
4012	case S_IFCHR:
4013	case S_IFBLK:
4014	case S_IFIFO:
4015	case S_IFSOCK:
4016	case 0: /* zero mode translates to S_IFREG */
4017		return 0;
4018	case S_IFDIR:
4019		return -EPERM;
4020	default:
4021		return -EINVAL;
4022	}
4023}
4024
/*
 * do_mknodat - implementation of mknodat(2); consumes @name.
 *
 * Validates the requested type, prepares the target dentry via
 * filename_create() and dispatches to vfs_create() or vfs_mknod()
 * according to the type.  Retries with LOOKUP_REVAL on -ESTALE.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
		unsigned int dev)
{
	struct mnt_idmap *idmap;
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = 0;

	error = may_mknod(mode);
	if (error)
		goto out1;
retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out1;

	/* the security hook sees the umask-stripped mode */
	error = security_path_mknod(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode), dev);
	if (error)
		goto out2;

	idmap = mnt_idmap(path.mnt);
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(idmap, path.dentry->d_inode,
					   dentry, mode, true);
			if (!error)
				security_path_post_mknod(idmap, dentry);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(idmap, path.dentry->d_inode,
					  dentry, mode, 0);
			break;
	}
out2:
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out1:
	putname(name);
	return error;
}
4075
/* mknodat(2): create a filesystem node relative to @dfd. */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned int, dev)
{
	return do_mknodat(dfd, getname(filename), mode, dev);
}
4081
/* mknod(2): same as mknodat(2) relative to the current working directory. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
4086
4087/**
4088 * vfs_mkdir - create directory
4089 * @idmap:	idmap of the mount the inode was found from
4090 * @dir:	inode of @dentry
4091 * @dentry:	pointer to dentry of the base directory
4092 * @mode:	mode of the new directory
4093 *
4094 * Create a directory.
4095 *
4096 * If the inode has been found through an idmapped mount the idmap of
4097 * the vfsmount must be passed through @idmap. This function will then take
4098 * care to map the inode according to @idmap before checking permissions.
4099 * On non-idmapped mounts or if permission checking is to be performed on the
4100 * raw inode simply pass @nop_mnt_idmap.
4101 */
4102int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
4103	      struct dentry *dentry, umode_t mode)
4104{
4105	int error;
4106	unsigned max_links = dir->i_sb->s_max_links;
4107
4108	error = may_create(idmap, dir, dentry);
4109	if (error)
4110		return error;
4111
4112	if (!dir->i_op->mkdir)
4113		return -EPERM;
4114
4115	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
4116	error = security_inode_mkdir(dir, dentry, mode);
4117	if (error)
4118		return error;
4119
4120	if (max_links && dir->i_nlink >= max_links)
4121		return -EMLINK;
4122
4123	error = dir->i_op->mkdir(idmap, dir, dentry, mode);
4124	if (!error)
4125		fsnotify_mkdir(dir, dentry);
4126	return error;
4127}
4128EXPORT_SYMBOL(vfs_mkdir);
4129
/*
 * Create a directory at @name relative to @dfd, retrying once with
 * LOOKUP_REVAL if a stale file handle (ESTALE) is detected.
 * Consumes @name.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
	struct dentry *dentry;
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_DIRECTORY;

retry:
	dentry = filename_create(dfd, name, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	/* The security hook sees the mode with the umask already applied. */
	error = security_path_mkdir(&path, dentry,
			mode_strip_umask(path.dentry->d_inode, mode));
	if (!error) {
		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
				  dentry, mode);
	}
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		/* Stale handle: retry the lookup with revalidation forced. */
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putname:
	putname(name);
	return error;
}
4158
4159SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
4160{
4161	return do_mkdirat(dfd, getname(pathname), mode);
4162}
4163
4164SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
4165{
4166	return do_mkdirat(AT_FDCWD, getname(pathname), mode);
4167}
4168
4169/**
4170 * vfs_rmdir - remove directory
4171 * @idmap:	idmap of the mount the inode was found from
4172 * @dir:	inode of @dentry
4173 * @dentry:	pointer to dentry of the base directory
4174 *
4175 * Remove a directory.
4176 *
4177 * If the inode has been found through an idmapped mount the idmap of
4178 * the vfsmount must be passed through @idmap. This function will then take
4179 * care to map the inode according to @idmap before checking permissions.
4180 * On non-idmapped mounts or if permission checking is to be performed on the
4181 * raw inode simply pass @nop_mnt_idmap.
4182 */
4183int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
4184		     struct dentry *dentry)
4185{
4186	int error = may_delete(idmap, dir, dentry, 1);
4187
4188	if (error)
4189		return error;
4190
4191	if (!dir->i_op->rmdir)
4192		return -EPERM;
4193
4194	dget(dentry);
4195	inode_lock(dentry->d_inode);
4196
4197	error = -EBUSY;
4198	if (is_local_mountpoint(dentry) ||
4199	    (dentry->d_inode->i_flags & S_KERNEL_FILE))
4200		goto out;
4201
4202	error = security_inode_rmdir(dir, dentry);
4203	if (error)
4204		goto out;
4205
4206	error = dir->i_op->rmdir(dir, dentry);
4207	if (error)
4208		goto out;
4209
4210	shrink_dcache_parent(dentry);
4211	dentry->d_inode->i_flags |= S_DEAD;
4212	dont_mount(dentry);
4213	detach_mounts(dentry);
4214
4215out:
4216	inode_unlock(dentry->d_inode);
4217	dput(dentry);
4218	if (!error)
4219		d_delete_notify(dir, dentry);
4220	return error;
4221}
4222EXPORT_SYMBOL(vfs_rmdir);
4223
/*
 * Remove the directory at @name relative to @dfd, retrying once with
 * LOOKUP_REVAL on a stale file handle.  Consumes @name.
 */
int do_rmdir(int dfd, struct filename *name)
{
	int error;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	unsigned int lookup_flags = 0;
retry:
	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
	if (error)
		goto exit1;

	/* Only a normal last component may be removed; map the rest to errors. */
	switch (type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit2;
	case LAST_DOT:
		error = -EINVAL;
		goto exit2;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit2;
	}

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit2;

	/* Lock the parent, then look up the victim exclusively under it. */
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit3;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit4;
	}
	error = security_path_rmdir(&path, dentry);
	if (error)
		goto exit4;
	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
exit4:
	dput(dentry);
exit3:
	inode_unlock(path.dentry->d_inode);
	mnt_drop_write(path.mnt);
exit2:
	path_put(&path);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
exit1:
	putname(name);
	return error;
}
4281
4282SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
4283{
4284	return do_rmdir(AT_FDCWD, getname(pathname));
4285}
4286
4287/**
4288 * vfs_unlink - unlink a filesystem object
4289 * @idmap:	idmap of the mount the inode was found from
4290 * @dir:	parent directory
4291 * @dentry:	victim
4292 * @delegated_inode: returns victim inode, if the inode is delegated.
4293 *
4294 * The caller must hold dir->i_mutex.
4295 *
4296 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4297 * return a reference to the inode in delegated_inode.  The caller
4298 * should then break the delegation on that inode and retry.  Because
4299 * breaking a delegation may take a long time, the caller should drop
4300 * dir->i_mutex before doing so.
4301 *
4302 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4303 * be appropriate for callers that expect the underlying filesystem not
4304 * to be NFS exported.
4305 *
4306 * If the inode has been found through an idmapped mount the idmap of
4307 * the vfsmount must be passed through @idmap. This function will then take
4308 * care to map the inode according to @idmap before checking permissions.
4309 * On non-idmapped mounts or if permission checking is to be performed on the
4310 * raw inode simply pass @nop_mnt_idmap.
4311 */
4312int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
4313	       struct dentry *dentry, struct inode **delegated_inode)
4314{
4315	struct inode *target = dentry->d_inode;
4316	int error = may_delete(idmap, dir, dentry, 0);
4317
4318	if (error)
4319		return error;
4320
4321	if (!dir->i_op->unlink)
4322		return -EPERM;
4323
4324	inode_lock(target);
4325	if (IS_SWAPFILE(target))
4326		error = -EPERM;
4327	else if (is_local_mountpoint(dentry))
4328		error = -EBUSY;
4329	else {
4330		error = security_inode_unlink(dir, dentry);
4331		if (!error) {
4332			error = try_break_deleg(target, delegated_inode);
4333			if (error)
4334				goto out;
4335			error = dir->i_op->unlink(dir, dentry);
4336			if (!error) {
4337				dont_mount(dentry);
4338				detach_mounts(dentry);
4339			}
4340		}
4341	}
4342out:
4343	inode_unlock(target);
4344
4345	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
4346	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
4347		fsnotify_unlink(dir, dentry);
4348	} else if (!error) {
4349		fsnotify_link_count(target);
4350		d_delete_notify(dir, dentry);
4351	}
4352
4353	return error;
4354}
4355EXPORT_SYMBOL(vfs_unlink);
4356
4357/*
4358 * Make sure that the actual truncation of the file will occur outside its
4359 * directory's i_mutex.  Truncate can take a long time if there is a lot of
4360 * writeout happening, and we don't want to prevent access to the directory
4361 * while waiting on the I/O.
4362 */
4363int do_unlinkat(int dfd, struct filename *name)
4364{
4365	int error;
4366	struct dentry *dentry;
4367	struct path path;
4368	struct qstr last;
4369	int type;
4370	struct inode *inode = NULL;
4371	struct inode *delegated_inode = NULL;
4372	unsigned int lookup_flags = 0;
4373retry:
4374	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4375	if (error)
4376		goto exit1;
4377
4378	error = -EISDIR;
4379	if (type != LAST_NORM)
4380		goto exit2;
4381
4382	error = mnt_want_write(path.mnt);
4383	if (error)
4384		goto exit2;
4385retry_deleg:
4386	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4387	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4388	error = PTR_ERR(dentry);
4389	if (!IS_ERR(dentry)) {
4390
4391		/* Why not before? Because we want correct error value */
4392		if (last.name[last.len] || d_is_negative(dentry))
4393			goto slashes;
4394		inode = dentry->d_inode;
4395		ihold(inode);
4396		error = security_path_unlink(&path, dentry);
4397		if (error)
4398			goto exit3;
4399		error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
4400				   dentry, &delegated_inode);
4401exit3:
4402		dput(dentry);
4403	}
4404	inode_unlock(path.dentry->d_inode);
4405	if (inode)
4406		iput(inode);	/* truncate the inode here */
4407	inode = NULL;
4408	if (delegated_inode) {
4409		error = break_deleg_wait(&delegated_inode);
4410		if (!error)
4411			goto retry_deleg;
4412	}
4413	mnt_drop_write(path.mnt);
4414exit2:
4415	path_put(&path);
4416	if (retry_estale(error, lookup_flags)) {
4417		lookup_flags |= LOOKUP_REVAL;
4418		inode = NULL;
4419		goto retry;
4420	}
4421exit1:
4422	putname(name);
4423	return error;
4424
4425slashes:
4426	if (d_is_negative(dentry))
4427		error = -ENOENT;
4428	else if (d_is_dir(dentry))
4429		error = -EISDIR;
4430	else
4431		error = -ENOTDIR;
4432	goto exit3;
4433}
4434
4435SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4436{
4437	if ((flag & ~AT_REMOVEDIR) != 0)
4438		return -EINVAL;
4439
4440	if (flag & AT_REMOVEDIR)
4441		return do_rmdir(dfd, getname(pathname));
4442	return do_unlinkat(dfd, getname(pathname));
4443}
4444
4445SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4446{
4447	return do_unlinkat(AT_FDCWD, getname(pathname));
4448}
4449
4450/**
4451 * vfs_symlink - create symlink
4452 * @idmap:	idmap of the mount the inode was found from
4453 * @dir:	inode of @dentry
4454 * @dentry:	pointer to dentry of the base directory
4455 * @oldname:	name of the file to link to
4456 *
4457 * Create a symlink.
4458 *
4459 * If the inode has been found through an idmapped mount the idmap of
4460 * the vfsmount must be passed through @idmap. This function will then take
4461 * care to map the inode according to @idmap before checking permissions.
4462 * On non-idmapped mounts or if permission checking is to be performed on the
4463 * raw inode simply pass @nop_mnt_idmap.
4464 */
4465int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
4466		struct dentry *dentry, const char *oldname)
4467{
4468	int error;
4469
4470	error = may_create(idmap, dir, dentry);
4471	if (error)
4472		return error;
4473
4474	if (!dir->i_op->symlink)
4475		return -EPERM;
4476
4477	error = security_inode_symlink(dir, dentry, oldname);
4478	if (error)
4479		return error;
4480
4481	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
4482	if (!error)
4483		fsnotify_create(dir, dentry);
4484	return error;
4485}
4486EXPORT_SYMBOL(vfs_symlink);
4487
/*
 * Create a symlink at @to (relative to @newdfd) pointing at @from,
 * retrying once with LOOKUP_REVAL on a stale file handle.
 * Consumes both @from and @to; @from may already be an ERR_PTR
 * from getname(), which is propagated as the error.
 */
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
	int error;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;

	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto out_putnames;
	}
retry:
	dentry = filename_create(newdfd, to, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putnames;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				    dentry, from->name);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putnames:
	putname(to);
	putname(from);
	return error;
}
4519
4520SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4521		int, newdfd, const char __user *, newname)
4522{
4523	return do_symlinkat(getname(oldname), newdfd, getname(newname));
4524}
4525
4526SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
4527{
4528	return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
4529}
4530
4531/**
4532 * vfs_link - create a new link
4533 * @old_dentry:	object to be linked
4534 * @idmap:	idmap of the mount
4535 * @dir:	new parent
4536 * @new_dentry:	where to create the new link
4537 * @delegated_inode: returns inode needing a delegation break
4538 *
4539 * The caller must hold dir->i_mutex
4540 *
4541 * If vfs_link discovers a delegation on the to-be-linked file in need
4542 * of breaking, it will return -EWOULDBLOCK and return a reference to the
4543 * inode in delegated_inode.  The caller should then break the delegation
4544 * and retry.  Because breaking a delegation may take a long time, the
4545 * caller should drop the i_mutex before doing so.
4546 *
4547 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4548 * be appropriate for callers that expect the underlying filesystem not
4549 * to be NFS exported.
4550 *
4551 * If the inode has been found through an idmapped mount the idmap of
4552 * the vfsmount must be passed through @idmap. This function will then take
4553 * care to map the inode according to @idmap before checking permissions.
4554 * On non-idmapped mounts or if permission checking is to be performed on the
4555 * raw inode simply pass @nop_mnt_idmap.
4556 */
4557int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
4558	     struct inode *dir, struct dentry *new_dentry,
4559	     struct inode **delegated_inode)
4560{
4561	struct inode *inode = old_dentry->d_inode;
4562	unsigned max_links = dir->i_sb->s_max_links;
4563	int error;
4564
4565	if (!inode)
4566		return -ENOENT;
4567
4568	error = may_create(idmap, dir, new_dentry);
4569	if (error)
4570		return error;
4571
4572	if (dir->i_sb != inode->i_sb)
4573		return -EXDEV;
4574
4575	/*
4576	 * A link to an append-only or immutable file cannot be created.
4577	 */
4578	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4579		return -EPERM;
4580	/*
4581	 * Updating the link count will likely cause i_uid and i_gid to
4582	 * be writen back improperly if their true value is unknown to
4583	 * the vfs.
4584	 */
4585	if (HAS_UNMAPPED_ID(idmap, inode))
4586		return -EPERM;
4587	if (!dir->i_op->link)
4588		return -EPERM;
4589	if (S_ISDIR(inode->i_mode))
4590		return -EPERM;
4591
4592	error = security_inode_link(old_dentry, dir, new_dentry);
4593	if (error)
4594		return error;
4595
4596	inode_lock(inode);
4597	/* Make sure we don't allow creating hardlink to an unlinked file */
4598	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
4599		error =  -ENOENT;
4600	else if (max_links && inode->i_nlink >= max_links)
4601		error = -EMLINK;
4602	else {
4603		error = try_break_deleg(inode, delegated_inode);
4604		if (!error)
4605			error = dir->i_op->link(old_dentry, dir, new_dentry);
4606	}
4607
4608	if (!error && (inode->i_state & I_LINKABLE)) {
4609		spin_lock(&inode->i_lock);
4610		inode->i_state &= ~I_LINKABLE;
4611		spin_unlock(&inode->i_lock);
4612	}
4613	inode_unlock(inode);
4614	if (!error)
4615		fsnotify_link(dir, inode, new_dentry);
4616	return error;
4617}
4618EXPORT_SYMBOL(vfs_link);
4619
4620/*
4621 * Hardlinks are often used in delicate situations.  We avoid
4622 * security-related surprises by not following symlinks on the
4623 * newname.  --KAB
4624 *
4625 * We don't follow them on the oldname either to be compatible
4626 * with linux 2.0, and to avoid hard-linking to directories
4627 * and other special files.  --ADM
4628 */
4629int do_linkat(int olddfd, struct filename *old, int newdfd,
4630	      struct filename *new, int flags)
4631{
4632	struct mnt_idmap *idmap;
4633	struct dentry *new_dentry;
4634	struct path old_path, new_path;
4635	struct inode *delegated_inode = NULL;
4636	int how = 0;
4637	int error;
4638
4639	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
4640		error = -EINVAL;
4641		goto out_putnames;
4642	}
4643	/*
4644	 * To use null names we require CAP_DAC_READ_SEARCH
4645	 * This ensures that not everyone will be able to create
4646	 * handlink using the passed filedescriptor.
4647	 */
4648	if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
4649		error = -ENOENT;
4650		goto out_putnames;
4651	}
4652
4653	if (flags & AT_SYMLINK_FOLLOW)
4654		how |= LOOKUP_FOLLOW;
4655retry:
4656	error = filename_lookup(olddfd, old, how, &old_path, NULL);
4657	if (error)
4658		goto out_putnames;
4659
4660	new_dentry = filename_create(newdfd, new, &new_path,
4661					(how & LOOKUP_REVAL));
4662	error = PTR_ERR(new_dentry);
4663	if (IS_ERR(new_dentry))
4664		goto out_putpath;
4665
4666	error = -EXDEV;
4667	if (old_path.mnt != new_path.mnt)
4668		goto out_dput;
4669	idmap = mnt_idmap(new_path.mnt);
4670	error = may_linkat(idmap, &old_path);
4671	if (unlikely(error))
4672		goto out_dput;
4673	error = security_path_link(old_path.dentry, &new_path, new_dentry);
4674	if (error)
4675		goto out_dput;
4676	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
4677			 new_dentry, &delegated_inode);
4678out_dput:
4679	done_path_create(&new_path, new_dentry);
4680	if (delegated_inode) {
4681		error = break_deleg_wait(&delegated_inode);
4682		if (!error) {
4683			path_put(&old_path);
4684			goto retry;
4685		}
4686	}
4687	if (retry_estale(error, how)) {
4688		path_put(&old_path);
4689		how |= LOOKUP_REVAL;
4690		goto retry;
4691	}
4692out_putpath:
4693	path_put(&old_path);
4694out_putnames:
4695	putname(old);
4696	putname(new);
4697
4698	return error;
4699}
4700
4701SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4702		int, newdfd, const char __user *, newname, int, flags)
4703{
4704	return do_linkat(olddfd, getname_uflags(oldname, flags),
4705		newdfd, getname(newname), flags);
4706}
4707
4708SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
4709{
4710	return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
4711}
4712
4713/**
4714 * vfs_rename - rename a filesystem object
4715 * @rd:		pointer to &struct renamedata info
4716 *
4717 * The caller must hold multiple mutexes--see lock_rename()).
4718 *
4719 * If vfs_rename discovers a delegation in need of breaking at either
4720 * the source or destination, it will return -EWOULDBLOCK and return a
4721 * reference to the inode in delegated_inode.  The caller should then
4722 * break the delegation and retry.  Because breaking a delegation may
4723 * take a long time, the caller should drop all locks before doing
4724 * so.
4725 *
4726 * Alternatively, a caller may pass NULL for delegated_inode.  This may
4727 * be appropriate for callers that expect the underlying filesystem not
4728 * to be NFS exported.
4729 *
4730 * The worst of all namespace operations - renaming directory. "Perverted"
4731 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4732 * Problems:
4733 *
4734 *	a) we can get into loop creation.
4735 *	b) race potential - two innocent renames can create a loop together.
4736 *	   That's where 4.4BSD screws up. Current fix: serialization on
4737 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4738 *	   story.
4739 *	c) we may have to lock up to _four_ objects - parents and victim (if it exists),
4740 *	   and source (if it's a non-directory or a subdirectory that moves to
4741 *	   different parent).
4742 *	   And that - after we got ->i_mutex on parents (until then we don't know
4743 *	   whether the target exists).  Solution: try to be smart with locking
4744 *	   order for inodes.  We rely on the fact that tree topology may change
4745 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
4746 *	   move will be locked.  Thus we can rank directories by the tree
4747 *	   (ancestors first) and rank all non-directories after them.
4748 *	   That works since everybody except rename does "lock parent, lookup,
4749 *	   lock child" and rename is under ->s_vfs_rename_mutex.
4750 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
4751 *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
4752 *	   we'd better make sure that there's no link(2) for them.
4753 *	d) conversion from fhandle to dentry may come in the wrong moment - when
4754 *	   we are removing the target. Solution: we will have to grab ->i_mutex
4755 *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4756 *	   ->i_mutex on parents, which works but leads to some truly excessive
4757 *	   locking].
4758 */
4759int vfs_rename(struct renamedata *rd)
4760{
4761	int error;
4762	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
4763	struct dentry *old_dentry = rd->old_dentry;
4764	struct dentry *new_dentry = rd->new_dentry;
4765	struct inode **delegated_inode = rd->delegated_inode;
4766	unsigned int flags = rd->flags;
4767	bool is_dir = d_is_dir(old_dentry);
4768	struct inode *source = old_dentry->d_inode;
4769	struct inode *target = new_dentry->d_inode;
4770	bool new_is_dir = false;
4771	unsigned max_links = new_dir->i_sb->s_max_links;
4772	struct name_snapshot old_name;
4773	bool lock_old_subdir, lock_new_subdir;
4774
4775	if (source == target)
4776		return 0;
4777
4778	error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
4779	if (error)
4780		return error;
4781
4782	if (!target) {
4783		error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
4784	} else {
4785		new_is_dir = d_is_dir(new_dentry);
4786
4787		if (!(flags & RENAME_EXCHANGE))
4788			error = may_delete(rd->new_mnt_idmap, new_dir,
4789					   new_dentry, is_dir);
4790		else
4791			error = may_delete(rd->new_mnt_idmap, new_dir,
4792					   new_dentry, new_is_dir);
4793	}
4794	if (error)
4795		return error;
4796
4797	if (!old_dir->i_op->rename)
4798		return -EPERM;
4799
4800	/*
4801	 * If we are going to change the parent - check write permissions,
4802	 * we'll need to flip '..'.
4803	 */
4804	if (new_dir != old_dir) {
4805		if (is_dir) {
4806			error = inode_permission(rd->old_mnt_idmap, source,
4807						 MAY_WRITE);
4808			if (error)
4809				return error;
4810		}
4811		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4812			error = inode_permission(rd->new_mnt_idmap, target,
4813						 MAY_WRITE);
4814			if (error)
4815				return error;
4816		}
4817	}
4818
4819	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4820				      flags);
4821	if (error)
4822		return error;
4823
4824	take_dentry_name_snapshot(&old_name, old_dentry);
4825	dget(new_dentry);
4826	/*
4827	 * Lock children.
4828	 * The source subdirectory needs to be locked on cross-directory
4829	 * rename or cross-directory exchange since its parent changes.
4830	 * The target subdirectory needs to be locked on cross-directory
4831	 * exchange due to parent change and on any rename due to becoming
4832	 * a victim.
4833	 * Non-directories need locking in all cases (for NFS reasons);
4834	 * they get locked after any subdirectories (in inode address order).
4835	 *
4836	 * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
4837	 * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
4838	 */
4839	lock_old_subdir = new_dir != old_dir;
4840	lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
4841	if (is_dir) {
4842		if (lock_old_subdir)
4843			inode_lock_nested(source, I_MUTEX_CHILD);
4844		if (target && (!new_is_dir || lock_new_subdir))
4845			inode_lock(target);
4846	} else if (new_is_dir) {
4847		if (lock_new_subdir)
4848			inode_lock_nested(target, I_MUTEX_CHILD);
4849		inode_lock(source);
4850	} else {
4851		lock_two_nondirectories(source, target);
4852	}
4853
4854	error = -EPERM;
4855	if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
4856		goto out;
4857
4858	error = -EBUSY;
4859	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4860		goto out;
4861
4862	if (max_links && new_dir != old_dir) {
4863		error = -EMLINK;
4864		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4865			goto out;
4866		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4867		    old_dir->i_nlink >= max_links)
4868			goto out;
4869	}
4870	if (!is_dir) {
4871		error = try_break_deleg(source, delegated_inode);
4872		if (error)
4873			goto out;
4874	}
4875	if (target && !new_is_dir) {
4876		error = try_break_deleg(target, delegated_inode);
4877		if (error)
4878			goto out;
4879	}
4880	error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
4881				      new_dir, new_dentry, flags);
4882	if (error)
4883		goto out;
4884
4885	if (!(flags & RENAME_EXCHANGE) && target) {
4886		if (is_dir) {
4887			shrink_dcache_parent(new_dentry);
4888			target->i_flags |= S_DEAD;
4889		}
4890		dont_mount(new_dentry);
4891		detach_mounts(new_dentry);
4892	}
4893	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4894		if (!(flags & RENAME_EXCHANGE))
4895			d_move(old_dentry, new_dentry);
4896		else
4897			d_exchange(old_dentry, new_dentry);
4898	}
4899out:
4900	if (!is_dir || lock_old_subdir)
4901		inode_unlock(source);
4902	if (target && (!new_is_dir || lock_new_subdir))
4903		inode_unlock(target);
4904	dput(new_dentry);
4905	if (!error) {
4906		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
4907			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
4908		if (flags & RENAME_EXCHANGE) {
4909			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
4910				      new_is_dir, NULL, new_dentry);
4911		}
4912	}
4913	release_dentry_name_snapshot(&old_name);
4914
4915	return error;
4916}
4917EXPORT_SYMBOL(vfs_rename);
4918
/*
 * Rename @from (relative to @olddfd) to @to (relative to @newdfd)
 * honouring the renameat2(2) @flags, retrying on broken delegations
 * and stale file handles.  Consumes both filenames.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
{
	struct renamedata rd;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct inode *delegated_inode = NULL;
	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
	bool should_retry = false;
	int error = -EINVAL;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		goto put_names;

	/* RENAME_EXCHANGE is mutually exclusive with the other two flags. */
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		goto put_names;

	/* For an exchange, the target must exist, so it isn't a "target". */
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;

retry:
	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
				  &old_last, &old_type);
	if (error)
		goto put_names;

	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				  &new_type);
	if (error)
		goto exit1;

	/* Renames cannot cross mounts. */
	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	/* Staged-error idiom: set the errno first, branch on the check after. */
	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

retry_deleg:
	/* Lock both parents (and s_vfs_rename_mutex if cross-directory). */
	trap = lock_rename(new_path.dentry, old_path.dentry);
	if (IS_ERR(trap)) {
		error = PTR_ERR(trap);
		goto exit_lock_rename;
	}

	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
					  lookup_flags);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (d_is_negative(old_dentry))
		goto exit4;
	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
					  lookup_flags | target_flags);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	error = -EEXIST;
	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
		goto exit5;
	if (flags & RENAME_EXCHANGE) {
		error = -ENOENT;
		if (d_is_negative(new_dentry))
			goto exit5;

		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit5;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit5;
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit5;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit5;
	/* target should not be an ancestor of source */
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
	if (error)
		goto exit5;

	rd.old_dir	   = old_path.dentry->d_inode;
	rd.old_dentry	   = old_dentry;
	rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
	rd.new_dir	   = new_path.dentry->d_inode;
	rd.new_dentry	   = new_dentry;
	rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;
	error = vfs_rename(&rd);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_path.dentry, old_path.dentry);
exit_lock_rename:
	if (delegated_inode) {
		/* Delegation found: wait for the break, then redo the lookups. */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
exit1:
	path_put(&old_path);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
put_names:
	putname(from);
	putname(to);
	return error;
}
5067
5068SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
5069		int, newdfd, const char __user *, newname, unsigned int, flags)
5070{
5071	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
5072				flags);
5073}
5074
5075SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
5076		int, newdfd, const char __user *, newname)
5077{
5078	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
5079				0);
5080}
5081
5082SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
5083{
5084	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
5085				getname(newname), 0);
5086}
5087
5088int readlink_copy(char __user *buffer, int buflen, const char *link)
5089{
5090	int len = PTR_ERR(link);
5091	if (IS_ERR(link))
5092		goto out;
5093
5094	len = strlen(link);
5095	if (len > (unsigned) buflen)
5096		len = buflen;
5097	if (copy_to_user(buffer, link, len))
5098		len = -EFAULT;
5099out:
5100	return len;
5101}
5102
5103/**
5104 * vfs_readlink - copy symlink body into userspace buffer
5105 * @dentry: dentry on which to get symbolic link
5106 * @buffer: user memory pointer
5107 * @buflen: size of buffer
5108 *
5109 * Does not touch atime.  That's up to the caller if necessary
5110 *
5111 * Does not call security hook.
5112 */
5113int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
5114{
5115	struct inode *inode = d_inode(dentry);
5116	DEFINE_DELAYED_CALL(done);
5117	const char *link;
5118	int res;
5119
5120	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
5121		if (unlikely(inode->i_op->readlink))
5122			return inode->i_op->readlink(dentry, buffer, buflen);
5123
5124		if (!d_is_symlink(dentry))
5125			return -EINVAL;
5126
5127		spin_lock(&inode->i_lock);
5128		inode->i_opflags |= IOP_DEFAULT_READLINK;
5129		spin_unlock(&inode->i_lock);
5130	}
5131
5132	link = READ_ONCE(inode->i_link);
5133	if (!link) {
5134		link = inode->i_op->get_link(dentry, inode, &done);
5135		if (IS_ERR(link))
5136			return PTR_ERR(link);
5137	}
5138	res = readlink_copy(buffer, buflen, link);
5139	do_delayed_call(&done);
5140	return res;
5141}
5142EXPORT_SYMBOL(vfs_readlink);
5143
5144/**
5145 * vfs_get_link - get symlink body
5146 * @dentry: dentry on which to get symbolic link
5147 * @done: caller needs to free returned data with this
5148 *
5149 * Calls security hook and i_op->get_link() on the supplied inode.
5150 *
5151 * It does not touch atime.  That's up to the caller if necessary.
5152 *
5153 * Does not work on "special" symlinks like /proc/$$/fd/N
5154 */
5155const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
5156{
5157	const char *res = ERR_PTR(-EINVAL);
5158	struct inode *inode = d_inode(dentry);
5159
5160	if (d_is_symlink(dentry)) {
5161		res = ERR_PTR(security_inode_readlink(dentry));
5162		if (!res)
5163			res = inode->i_op->get_link(dentry, inode, done);
5164	}
5165	return res;
5166}
5167EXPORT_SYMBOL(vfs_get_link);
5168
5169/* get the link contents into pagecache */
5170const char *page_get_link(struct dentry *dentry, struct inode *inode,
5171			  struct delayed_call *callback)
5172{
5173	char *kaddr;
5174	struct page *page;
5175	struct address_space *mapping = inode->i_mapping;
5176
5177	if (!dentry) {
5178		page = find_get_page(mapping, 0);
5179		if (!page)
5180			return ERR_PTR(-ECHILD);
5181		if (!PageUptodate(page)) {
5182			put_page(page);
5183			return ERR_PTR(-ECHILD);
5184		}
5185	} else {
5186		page = read_mapping_page(mapping, 0, NULL);
5187		if (IS_ERR(page))
5188			return (char*)page;
5189	}
5190	set_delayed_call(callback, page_put_link, page);
5191	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
5192	kaddr = page_address(page);
5193	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
5194	return kaddr;
5195}
5196
5197EXPORT_SYMBOL(page_get_link);
5198
/* Delayed-call callback: drop the page reference taken by page_get_link(). */
void page_put_link(void *arg)
{
	struct page *page = arg;

	put_page(page);
}
EXPORT_SYMBOL(page_put_link);
5204
5205int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
5206{
5207	DEFINE_DELAYED_CALL(done);
5208	int res = readlink_copy(buffer, buflen,
5209				page_get_link(dentry, d_inode(dentry),
5210					      &done));
5211	do_delayed_call(&done);
5212	return res;
5213}
5214EXPORT_SYMBOL(page_readlink);
5215
/**
 * page_symlink - write a symlink body into the inode's pagecache
 * @inode: symlink inode
 * @symname: link body; only the first @len - 1 bytes are stored
 * @len: byte count including the terminating NUL — TODO confirm against callers
 *
 * Uses the mapping's write_begin/write_end hooks so the body goes
 * through the filesystem's normal buffered-write path, then marks the
 * inode dirty.  Returns 0 on success or a negative errno.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
	struct address_space *mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	/* Mapping forbids __GFP_FS reclaim: do the write under NOFS scope. */
	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
	struct page *page;
	void *fsdata = NULL;
	int err;
	unsigned int flags;

retry:
	if (nofs)
		flags = memalloc_nofs_save();
	err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
	if (nofs)
		memalloc_nofs_restore(flags);
	if (err)
		goto fail;

	memcpy(page_address(page), symname, len-1);

	err = aops->write_end(NULL, mapping, 0, len-1, len-1,
							page, fsdata);
	if (err < 0)
		goto fail;
	/* Short write: start over rather than leave a partial body. */
	if (err < len-1)
		goto retry;

	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
EXPORT_SYMBOL(page_symlink);
5250
/*
 * Default inode_operations for symlinks whose body is kept in the
 * pagecache; only ->get_link() needs to be provided.
 */
const struct inode_operations page_symlink_inode_operations = {
	.get_link	= page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
5255