ext2_lookup.c revision 141633
1/*-
2 *  modified for Lites 1.1
3 *
4 *  Aug 1995, Godmar Back (gback@cs.utah.edu)
5 *  University of Utah, Department of Computer Science
6 */
7/*-
8 * Copyright (c) 1989, 1993
9 *	The Regents of the University of California.  All rights reserved.
10 * (c) UNIX System Laboratories, Inc.
11 * All or some portions of this file are derived from material licensed
12 * to the University of California by American Telephone and Telegraph
13 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
14 * the permission of UNIX System Laboratories, Inc.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 *    notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in the
23 *    documentation and/or other materials provided with the distribution.
24 * 4. Neither the name of the University nor the names of its contributors
25 *    may be used to endorse or promote products derived from this software
26 *    without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 *	@(#)ufs_lookup.c	8.6 (Berkeley) 4/1/94
41 * $FreeBSD: head/sys/gnu/fs/ext2fs/ext2_lookup.c 141633 2005-02-10 12:23:29Z phk $
42 */
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/namei.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/mount.h>
50#include <sys/vnode.h>
51#include <sys/malloc.h>
52#include <sys/dirent.h>
53#include <sys/sysctl.h>
54
55#include <ufs/ufs/dir.h>
56
57#include <gnu/ext2fs/inode.h>
58#include <gnu/ext2fs/ext2_mount.h>
59#include <gnu/ext2fs/ext2_extern.h>
60#include <gnu/ext2fs/ext2_fs.h>
61#include <gnu/ext2fs/ext2_fs_sb.h>
62
63#ifdef DIAGNOSTIC
64static int dirchk = 1;
65#else
66static int dirchk = 0;
67#endif
68
69static SYSCTL_NODE(_vfs, OID_AUTO, e2fs, CTLFLAG_RD, 0, "EXT2FS filesystem");
70SYSCTL_INT(_vfs_e2fs, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
71
72/*
 * DIRBLKSIZ in ffs is DEV_BSIZE (in most cases 512), while in ext2fs it
 * is the native block size of the filesystem, which is only known at run
 * time.  A compile-time #define is therefore no longer appropriate: the
 * macro (presumably inherited from <ufs/ufs/dir.h> -- confirm) is dropped
 * here, and each function below that needs the value declares a local
 * variable named DIRBLKSIZ initialised from the superblock's s_blocksize.
 */
77#undef  DIRBLKSIZ
78
/*
 * Translation table from the on-disk ext2 file type (EXT2_FT_*), used as
 * the index, to the d_type value (DT_*) reported in struct dirent.
 */
static const u_char ext2_ft_to_dt[] = {
	DT_UNKNOWN,		/* EXT2_FT_UNKNOWN */
	DT_REG,			/* EXT2_FT_REG_FILE */
	DT_DIR,			/* EXT2_FT_DIR */
	DT_CHR,			/* EXT2_FT_CHRDEV */
	DT_BLK,			/* EXT2_FT_BLKDEV */
	DT_FIFO,		/* EXT2_FT_FIFO */
	DT_SOCK,		/* EXT2_FT_SOCK */
	DT_LNK,			/* EXT2_FT_SYMLINK */
};
/*
 * Convert an on-disk file type to a DT_* value.  Every value that is not
 * a valid index into the table -- including the first index past its end,
 * which the previous '>' comparison let through -- yields DT_UNKNOWN.
 * The argument is evaluated more than once.
 */
#define	FTTODT(ft)						\
    ((ft) >= sizeof(ext2_ft_to_dt) / sizeof(ext2_ft_to_dt[0]) ?	\
    DT_UNKNOWN : ext2_ft_to_dt[(ft)])
92
/*
 * Translation table from a dirent-style file type (DT_*), used as the
 * index, to the on-disk ext2 file type (EXT2_FT_*).  DT_* values are
 * sparse, so the unused slots map to EXT2_FT_UNKNOWN.
 */
static const u_char dt_to_ext2_ft[] = {
	EXT2_FT_UNKNOWN,	/* DT_UNKNOWN */
	EXT2_FT_FIFO,		/* DT_FIFO */
	EXT2_FT_CHRDEV,		/* DT_CHR */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_DIR,		/* DT_DIR */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_BLKDEV,		/* DT_BLK */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_REG_FILE,	/* DT_REG */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_SYMLINK,	/* DT_LNK */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_SOCK,		/* DT_SOCK */
	EXT2_FT_UNKNOWN,	/* unused */
	EXT2_FT_UNKNOWN,	/* DT_WHT */
};
/*
 * Convert a DT_* value to an on-disk file type.  Every value that is not
 * a valid index into the table -- including the first index past its end,
 * which the previous '>' comparison let through -- yields
 * EXT2_FT_UNKNOWN.  The argument is evaluated more than once.
 */
#define	DTTOFT(dt)						\
    ((dt) >= sizeof(dt_to_ext2_ft) / sizeof(dt_to_ext2_ft[0]) ?	\
    EXT2_FT_UNKNOWN : dt_to_ext2_ft[(dt)])
113
/*
 * Forward declaration: ext2_dirbadentry() is defined further down in this
 * file but is already called from ext2_lookup() when dirchk is non-zero.
 */
114static int	ext2_dirbadentry(struct vnode *dp, struct ext2_dir_entry_2 *de,
115		    int entryoffsetinblock);
116
117/*
118 * Vnode op for reading directories.
119 *
120 * The routine below assumes that the on-disk format of a directory
121 * is the same as that defined by <sys/dirent.h>. If the on-disk
122 * format changes, then it will be necessary to do a conversion
123 * from the on-disk format that read returns to the format defined
124 * by <sys/dirent.h>.
125 */
126/*
127 * this is exactly what we do here - the problem is that the conversion
128 * will blow up some entries by four bytes, so it can't be done in place.
129 * This is too bad. Right now the conversion is done entry by entry, the
130 * converted entry is sent via uiomove.
131 *
132 * XXX allocate a buffer, convert as many entries as possible, then send
133 * the whole buffer to uiomove
134 */
135int
136ext2_readdir(ap)
137	struct vop_readdir_args /* {
138		struct vnode *a_vp;
139		struct uio *a_uio;
140		struct ucred *a_cred;
141	} */ *ap;
142{
143	struct uio *uio = ap->a_uio;
144	int count, error;
145
146	struct ext2_dir_entry_2 *edp, *dp;
147	int ncookies;
148	struct dirent dstdp;
149	struct uio auio;
150	struct iovec aiov;
151	caddr_t dirbuf;
152	int DIRBLKSIZ = VTOI(ap->a_vp)->i_e2fs->s_blocksize;
153	int readcnt;
154	off_t startoffset = uio->uio_offset;
155
156	count = uio->uio_resid;
157	/*
158	 * Avoid complications for partial directory entries by adjusting
159	 * the i/o to end at a block boundary.  Don't give up (like ufs
160	 * does) if the initial adjustment gives a negative count, since
161	 * many callers don't supply a large enough buffer.  The correct
162	 * size is a little larger than DIRBLKSIZ to allow for expansion
163	 * of directory entries, but some callers just use 512.
164	 */
165	count -= (uio->uio_offset + count) & (DIRBLKSIZ -1);
166	if (count <= 0)
167		count += DIRBLKSIZ;
168
169#ifdef EXT2FS_DEBUG
170	printf("ext2_readdir: uio_offset = %lld, uio_resid = %d, count = %d\n",
171	    uio->uio_offset, uio->uio_resid, count);
172#endif
173
174	auio = *uio;
175	auio.uio_iov = &aiov;
176	auio.uio_iovcnt = 1;
177	auio.uio_resid = count;
178	auio.uio_segflg = UIO_SYSSPACE;
179	aiov.iov_len = count;
180	MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK);
181	aiov.iov_base = dirbuf;
182	error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
183	if (error == 0) {
184		readcnt = count - auio.uio_resid;
185		edp = (struct ext2_dir_entry_2 *)&dirbuf[readcnt];
186		ncookies = 0;
187		bzero(&dstdp, offsetof(struct dirent, d_name));
188		for (dp = (struct ext2_dir_entry_2 *)dirbuf;
189		    !error && uio->uio_resid > 0 && dp < edp; ) {
190			/*-
191			 * "New" ext2fs directory entries differ in 3 ways
192			 * from ufs on-disk ones:
193			 * - the name is not necessarily NUL-terminated.
194			 * - the file type field always exists and always
195			 *   follows the name length field.
196			 * - the file type is encoded in a different way.
197			 *
198			 * "Old" ext2fs directory entries need no special
199			 * conversions, since they are binary compatible
200			 * with "new" entries having a file type of 0 (i.e.,
201			 * EXT2_FT_UNKNOWN).  Splitting the old name length
202			 * field didn't make a mess like it did in ufs,
203			 * because ext2fs uses a machine-independent disk
204			 * layout.
205			 */
206			dstdp.d_fileno = dp->inode;
207			dstdp.d_type = FTTODT(dp->file_type);
208			dstdp.d_namlen = dp->name_len;
209			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
210			bcopy(dp->name, dstdp.d_name, dstdp.d_namlen);
211			bzero(dstdp.d_name + dstdp.d_namlen,
212			    dstdp.d_reclen - offsetof(struct dirent, d_name) -
213			    dstdp.d_namlen);
214
215			if (dp->rec_len > 0) {
216				if(dstdp.d_reclen <= uio->uio_resid) {
217					/* advance dp */
218					dp = (struct ext2_dir_entry_2 *)
219					    ((char *)dp + dp->rec_len);
220					error =
221					  uiomove(&dstdp, dstdp.d_reclen, uio);
222					if (!error)
223						ncookies++;
224				} else
225					break;
226			} else {
227				error = EIO;
228				break;
229			}
230		}
231		/* we need to correct uio_offset */
232		uio->uio_offset = startoffset + (caddr_t)dp - dirbuf;
233
234		if (!error && ap->a_ncookies != NULL) {
235			u_long *cookiep, *cookies, *ecookies;
236			off_t off;
237
238			if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
239				panic("ext2_readdir: unexpected uio from NFS server");
240			MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP,
241			       M_WAITOK);
242			off = startoffset;
243			for (dp = (struct ext2_dir_entry_2 *)dirbuf,
244			     cookiep = cookies, ecookies = cookies + ncookies;
245			     cookiep < ecookies;
246			     dp = (struct ext2_dir_entry_2 *)((caddr_t) dp + dp->rec_len)) {
247				off += dp->rec_len;
248				*cookiep++ = (u_long) off;
249			}
250			*ap->a_ncookies = ncookies;
251			*ap->a_cookies = cookies;
252		}
253	}
254	FREE(dirbuf, M_TEMP);
255	if (ap->a_eofflag)
256		*ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset;
257	return (error);
258}
259
260/*
261 * Convert a component of a pathname into a pointer to a locked inode.
262 * This is a very central and rather complicated routine.
263 * If the file system is not maintained in a strict tree hierarchy,
264 * this can result in a deadlock situation (see comments in code below).
265 *
266 * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
267 * on whether the name is to be looked up, created, renamed, or deleted.
268 * When CREATE, RENAME, or DELETE is specified, information usable in
269 * creating, renaming, or deleting a directory entry may be calculated.
270 * If flag has LOCKPARENT or'ed into it and the target of the pathname
271 * exists, lookup returns both the target and its parent directory locked.
272 * When creating or renaming and LOCKPARENT is specified, the target may
273 * not be ".".  When deleting and LOCKPARENT is specified, the target may
274 * be ".", but the caller must check to ensure it does a vrele and vput
275 * instead of two vputs.
276 *
277 * Overall outline of ext2_lookup:
278 *
279 *	search for name in directory, to found or notfound
280 * notfound:
281 *	if creating, return locked directory, leaving info on available slots
282 *	else return error
283 * found:
284 *	if at end of path and deleting, return information to allow delete
285 *	if at end of path and rewriting (RENAME and LOCKPARENT), lock target
286 *	  inode and return info to allow rewrite
287 *	if not at end, add name to cache; if at end and neither creating
288 *	  nor deleting, add name to cache
289 */
/*
 * ext2_lookup: search directory ap->a_dvp for the component named by
 * ap->a_cnp (cn_nameptr / cn_namelen) and return its vnode in *ap->a_vpp.
 *
 * Return values, as produced by the code below:
 *	0		entry found; *vpp holds the target vnode (the
 *			directory itself, with an extra reference, for ".")
 *	EJUSTRETURN	entry not found, the operation is CREATE or RENAME
 *			on the last component, the directory still has
 *			links and is writable; i_offset / i_count /
 *			i_endoff of the directory inode describe where a
 *			new entry can go
 *	ENOENT		entry not found in any other case
 *	EPERM		DELETE refused in a sticky directory
 *	EISDIR		RENAME whose entry refers to the directory itself
 *	other		error from ext2_blkatoff(), VOP_ACCESS(),
 *			VFS_VGET() or vn_lock()
 *
 * Besides *vpp, results are left in the directory inode: i_offset,
 * i_count, i_endoff, i_ino, i_reclen and (for LOOKUP on the last
 * component) the i_diroff search hint.
 */
290int
291ext2_lookup(ap)
292	struct vop_cachedlookup_args /* {
293		struct vnode *a_dvp;
294		struct vnode **a_vpp;
295		struct componentname *a_cnp;
296	} */ *ap;
297{
298	struct vnode *vdp;		/* vnode for directory being searched */
299	struct inode *dp;		/* inode for directory being searched */
300	struct buf *bp;			/* a buffer of directory entries */
301	struct ext2_dir_entry_2 *ep;	/* the current directory entry */
302	int entryoffsetinblock;		/* offset of ep in bp's buffer */
303	enum {NONE, COMPACT, FOUND} slotstatus;
304	doff_t slotoffset;		/* offset of area with free space */
305	int slotsize;			/* size of area at slotoffset */
306	int slotfreespace;		/* amount of space free in slot */
307	int slotneeded;			/* size of the entry we're seeking */
308	int numdirpasses;		/* strategy for directory search */
309	doff_t endsearch;		/* offset to end directory search */
310	doff_t prevoff;			/* prev entry dp->i_offset */
311	struct vnode *pdp;		/* saved dp during symlink work */
312	struct vnode *tdp;		/* returned by VFS_VGET */
313	doff_t enduseful;		/* pointer past last used dir slot */
314	u_long bmask;			/* block offset mask */
315	int lockparent;			/* 1 => lockparent flag is set */
316	int wantparent;			/* 1 => wantparent or lockparent flag */
317	int namlen, error;
318	struct vnode **vpp = ap->a_vpp;
319	struct componentname *cnp = ap->a_cnp;
320	struct ucred *cred = cnp->cn_cred;
321	int flags = cnp->cn_flags;
322	int nameiop = cnp->cn_nameiop;
323	struct thread *td = cnp->cn_thread;
324
325	int	DIRBLKSIZ = VTOI(ap->a_dvp)->i_e2fs->s_blocksize;
326
327	bp = NULL;
328	slotoffset = -1;
329	*vpp = NULL;
330	vdp = ap->a_dvp;
331	dp = VTOI(vdp);
332	lockparent = flags & LOCKPARENT;
333	wantparent = flags & (LOCKPARENT|WANTPARENT);
334
335	/*
336	 * We now have a segment name to search for, and a directory to search.
337	 */
338
339	/*
340	 * Suppress search for slots unless creating
341	 * file and at end of pathname, in which case
342	 * we watch for a place to put the new file in
343	 * case it doesn't already exist.
344	 */
345	slotstatus = FOUND;
346	slotfreespace = slotsize = slotneeded = 0;
347	if ((nameiop == CREATE || nameiop == RENAME) &&
348	    (flags & ISLASTCN)) {
349		slotstatus = NONE;
350		slotneeded = EXT2_DIR_REC_LEN(cnp->cn_namelen);
351		/* was
352		slotneeded = (sizeof(struct direct) - MAXNAMLEN +
353			cnp->cn_namelen + 3) &~ 3; */
354	}
355
356	/*
357	 * If there is cached information on a previous search of
358	 * this directory, pick up where we last left off.
359	 * We cache only lookups as these are the most common
360	 * and have the greatest payoff. Caching CREATE has little
361	 * benefit as it usually must search the entire directory
362	 * to determine that the entry does not exist. Caching the
363	 * location of the last DELETE or RENAME has not reduced
364	 * profiling time and hence has been removed in the interest
365	 * of simplicity.
366	 */
	/*
	 * bmask is derived from the mount's f_iosize; "(offset & bmask) == 0"
	 * is the test used in the search loop to decide that a new buffer
	 * has to be fetched with ext2_blkatoff().
	 */
367	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
368	if (nameiop != LOOKUP || dp->i_diroff == 0 ||
369	    dp->i_diroff > dp->i_size) {
370		entryoffsetinblock = 0;
371		dp->i_offset = 0;
372		numdirpasses = 1;
373	} else {
374		dp->i_offset = dp->i_diroff;
375		if ((entryoffsetinblock = dp->i_offset & bmask) &&
376		    (error = ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL,
377		    &bp)))
378			return (error);
379		numdirpasses = 2;
380		nchstats.ncs_2passes++;
381	}
382	prevoff = dp->i_offset;
383	endsearch = roundup(dp->i_size, DIRBLKSIZ);
384	enduseful = 0;
385
386searchloop:
387	while (dp->i_offset < endsearch) {
388		/*
389		 * If necessary, get the next directory block.
390		 */
391		if ((dp->i_offset & bmask) == 0) {
392			if (bp != NULL)
393				brelse(bp);
394			if ((error =
395			    ext2_blkatoff(vdp, (off_t)dp->i_offset, NULL,
396			    &bp)) != 0)
397				return (error);
398			entryoffsetinblock = 0;
399		}
400		/*
401		 * If still looking for a slot, and at a DIRBLKSIZE
402		 * boundary, have to start looking for free space again.
403		 */
404		if (slotstatus == NONE &&
405		    (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) {
406			slotoffset = -1;
407			slotfreespace = 0;
408		}
409		/*
410		 * Get pointer to next entry.
411		 * Full validation checks are slow, so we only check
412		 * enough to insure forward progress through the
413		 * directory. Complete checks can be run by setting
414		 * "vfs.e2fs.dircheck" to be true.
415		 */
416		ep = (struct ext2_dir_entry_2 *)
417			((char *)bp->b_data + entryoffsetinblock);
418		if (ep->rec_len == 0 ||
419		    (dirchk && ext2_dirbadentry(vdp, ep, entryoffsetinblock))) {
420			int i;
421			ext2_dirbad(dp, dp->i_offset, "mangled entry");
			/*
			 * ext2_dirbad() returned (it panics unless the
			 * mount is read-only): skip what is left of the
			 * current DIRBLKSIZ block and resume at the next.
			 */
422			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
423			dp->i_offset += i;
424			entryoffsetinblock += i;
425			continue;
426		}
427
428		/*
429		 * If an appropriate sized slot has not yet been found,
430		 * check to see if one is available. Also accumulate space
431		 * in the current block so that we can determine if
432		 * compaction is viable.
433		 */
434		if (slotstatus != FOUND) {
435			int size = ep->rec_len;
436
437			if (ep->inode != 0)
438				size -= EXT2_DIR_REC_LEN(ep->name_len);
439			if (size > 0) {
440				if (size >= slotneeded) {
441					slotstatus = FOUND;
442					slotoffset = dp->i_offset;
443					slotsize = ep->rec_len;
444				} else if (slotstatus == NONE) {
445					slotfreespace += size;
446					if (slotoffset == -1)
447						slotoffset = dp->i_offset;
448					if (slotfreespace >= slotneeded) {
449						slotstatus = COMPACT;
450						slotsize = dp->i_offset +
451						      ep->rec_len - slotoffset;
452					}
453				}
454			}
455		}
456
457		/*
458		 * Check for a name match.
459		 */
460		if (ep->inode) {
461			namlen = ep->name_len;
462			if (namlen == cnp->cn_namelen &&
463			    !bcmp(cnp->cn_nameptr, ep->name,
464				(unsigned)namlen)) {
465				/*
466				 * Save directory entry's inode number and
467				 * reclen in ndp->ni_ufs area, and release
468				 * directory buffer.
469				 */
470				dp->i_ino = ep->inode;
471				dp->i_reclen = ep->rec_len;
472				goto found;
473			}
474		}
475		prevoff = dp->i_offset;
476		dp->i_offset += ep->rec_len;
477		entryoffsetinblock += ep->rec_len;
478		if (ep->inode)
479			enduseful = dp->i_offset;
480	}
481/* notfound: */
482	/*
483	 * If we started in the middle of the directory and failed
484	 * to find our target, we must check the beginning as well.
485	 */
486	if (numdirpasses == 2) {
		/*
		 * Second pass: rescan from offset 0 up to the cached
		 * offset at which the first pass started.
		 */
487		numdirpasses--;
488		dp->i_offset = 0;
489		endsearch = dp->i_diroff;
490		goto searchloop;
491	}
492	if (bp != NULL)
493		brelse(bp);
494	/*
495	 * If creating, and at end of pathname and current
496	 * directory has not been removed, then can consider
497	 * allowing file to be created.
498	 */
499	if ((nameiop == CREATE || nameiop == RENAME) &&
500	    (flags & ISLASTCN) && dp->i_nlink != 0) {
501		/*
502		 * Access for write is interpreted as allowing
503		 * creation of files in the directory.
504		 */
505		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
506			return (error);
507		/*
508		 * Return an indication of where the new directory
509		 * entry should be put.  If we didn't find a slot,
510		 * then set dp->i_count to 0 indicating
511		 * that the new slot belongs at the end of the
512		 * directory. If we found a slot, then the new entry
513		 * can be put in the range from dp->i_offset to
514		 * dp->i_offset + dp->i_count.
515		 */
516		if (slotstatus == NONE) {
517			dp->i_offset = roundup(dp->i_size, DIRBLKSIZ);
518			dp->i_count = 0;
519			enduseful = dp->i_offset;
520		} else {
521			dp->i_offset = slotoffset;
522			dp->i_count = slotsize;
523			if (enduseful < slotoffset + slotsize)
524				enduseful = slotoffset + slotsize;
525		}
526		dp->i_endoff = roundup(enduseful, DIRBLKSIZ);
527		dp->i_flag |= IN_CHANGE | IN_UPDATE;
528		/*
529		 * We return with the directory locked, so that
530		 * the parameters we set up above will still be
531		 * valid if we actually decide to do a direnter().
532		 * We return ni_vp == NULL to indicate that the entry
533		 * does not currently exist; we leave a pointer to
534		 * the (locked) directory inode in ndp->ni_dvp.
535		 * The pathname buffer is saved so that the name
536		 * can be obtained later.
537		 *
538		 * NB - if the directory is unlocked, then this
539		 * information cannot be used.
540		 */
541		cnp->cn_flags |= SAVENAME;
542		if (!lockparent)
543			VOP_UNLOCK(vdp, 0, td);
544		return (EJUSTRETURN);
545	}
546	/*
547	 * Insert name into cache (as non-existent) if appropriate.
548	 */
549	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
550		cache_enter(vdp, *vpp, cnp);
551	return (ENOENT);
552
553found:
554	if (numdirpasses == 2)
555		nchstats.ncs_pass2++;
556	/*
557	 * Check that directory length properly reflects presence
558	 * of this entry.
559	 */
	/*
	 * NOTE(review): this compares entryoffsetinblock, an offset within
	 * the current buffer, against i_size, and would store that in-block
	 * value into i_size.  The absolute dp->i_offset looks like what was
	 * intended -- confirm before changing.
	 */
560	if (entryoffsetinblock + EXT2_DIR_REC_LEN(ep->name_len)
561		> dp->i_size) {
562		ext2_dirbad(dp, dp->i_offset, "i_size too small");
563		dp->i_size = entryoffsetinblock+EXT2_DIR_REC_LEN(ep->name_len);
564		dp->i_flag |= IN_CHANGE | IN_UPDATE;
565	}
566	brelse(bp);
567
568	/*
569	 * Found component in pathname.
570	 * If the final component of path name, save information
571	 * in the cache as to where the entry was found.
572	 */
573	if ((flags & ISLASTCN) && nameiop == LOOKUP)
574		dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1);
575
576	/*
577	 * If deleting, and at end of pathname, return
578	 * parameters which can be used to remove file.
579	 * If the wantparent flag isn't set, we return only
580	 * the directory (in ndp->ni_dvp), otherwise we go
581	 * on and lock the inode, being careful with ".".
582	 */
583	if (nameiop == DELETE && (flags & ISLASTCN)) {
584		/*
585		 * Write access to directory required to delete files.
586		 */
587		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
588			return (error);
589		/*
590		 * Return pointer to current entry in dp->i_offset,
591		 * and distance past previous entry (if there
592		 * is a previous entry in this block) in dp->i_count.
593		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
594		 */
595		if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0)
596			dp->i_count = 0;
597		else
598			dp->i_count = dp->i_offset - prevoff;
599		if (dp->i_number == dp->i_ino) {
600			VREF(vdp);
601			*vpp = vdp;
602			return (0);
603		}
604		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
605		    &tdp)) != 0)
606			return (error);
607		/*
608		 * If directory is "sticky", then user must own
609		 * the directory, or the file in it, else she
610		 * may not delete it (unless she's root). This
611		 * implements append-only directories.
612		 */
613		if ((dp->i_mode & ISVTX) &&
614		    cred->cr_uid != 0 &&
615		    cred->cr_uid != dp->i_uid &&
616		    VTOI(tdp)->i_uid != cred->cr_uid) {
617			vput(tdp);
618			return (EPERM);
619		}
620		*vpp = tdp;
621		if (!lockparent)
622			VOP_UNLOCK(vdp, 0, td);
623		return (0);
624	}
625
626	/*
627	 * If rewriting (RENAME), return the inode and the
628	 * information required to rewrite the present directory
629	 * Must get inode of directory entry to verify it's a
630	 * regular file, or empty directory.
631	 */
632	if (nameiop == RENAME && wantparent &&
633	    (flags & ISLASTCN)) {
634		if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread)) != 0)
635			return (error);
636		/*
637		 * Careful about locking second inode.
638		 * This can only occur if the target is ".".
639		 */
640		if (dp->i_number == dp->i_ino)
641			return (EISDIR);
642		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
643		    &tdp)) != 0)
644			return (error);
645		*vpp = tdp;
646		cnp->cn_flags |= SAVENAME;
647		if (!lockparent)
648			VOP_UNLOCK(vdp, 0, td);
649		return (0);
650	}
651
652	/*
653	 * Step through the translation in the name.  We do not `vput' the
654	 * directory because we may need it again if a symbolic link
655	 * is relative to the current directory.  Instead we save it
656	 * unlocked as "pdp".  We must get the target inode before unlocking
657	 * the directory to insure that the inode will not be removed
658	 * before we get it.  We prevent deadlock by always fetching
659	 * inodes from the root, moving down the directory tree. Thus
660	 * when following backward pointers ".." we must unlock the
661	 * parent directory before getting the requested directory.
662	 * There is a potential race condition here if both the current
663	 * and parent directories are removed before the VFS_VGET for the
664	 * inode associated with ".." returns.  We hope that this occurs
665	 * infrequently since we cannot avoid this race condition without
666	 * implementing a sophisticated deadlock detection algorithm.
667	 * Note also that this simple deadlock detection scheme will not
668	 * work if the file system has any hard links other than ".."
669	 * that point backwards in the directory structure.
670	 */
671	pdp = vdp;
672	if (flags & ISDOTDOT) {
673		VOP_UNLOCK(pdp, 0, td);	/* race to get the inode */
674		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
675		    &tdp)) != 0) {
676			vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
677			return (error);
678		}
679		if (lockparent && (flags & ISLASTCN) &&
680		    (error = vn_lock(pdp, LK_EXCLUSIVE, td))) {
681			vput(tdp);
682			return (error);
683		}
684		*vpp = tdp;
685	} else if (dp->i_number == dp->i_ino) {
686		VREF(vdp);	/* we want ourself, ie "." */
687		*vpp = vdp;
688	} else {
689		if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, LK_EXCLUSIVE,
690		    &tdp)) != 0)
691			return (error);
692		if (!lockparent || !(flags & ISLASTCN))
693			VOP_UNLOCK(pdp, 0, td);
694		*vpp = tdp;
695	}
696
697	/*
698	 * Insert name into cache if appropriate.
699	 */
700	if (cnp->cn_flags & MAKEENTRY)
701		cache_enter(vdp, *vpp, cnp);
702	return (0);
703}
704
705void
706ext2_dirbad(ip, offset, how)
707	struct inode *ip;
708	doff_t offset;
709	char *how;
710{
711	struct mount *mp;
712
713	mp = ITOV(ip)->v_mount;
714	(void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
715	    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
716	if ((mp->mnt_flag & MNT_RDONLY) == 0)
717		panic("ext2_dirbad: bad dir");
718}
719
720/*
721 * Do consistency checking on a directory entry:
722 *	record length must be multiple of 4
723 *	entry must fit in rest of its DIRBLKSIZ block
724 *	record must be large enough to contain entry
725 *	name is not longer than MAXNAMLEN
726 *	name must be as long as advertised, and null terminated
727 */
728/*
729 *	changed so that it confirms to ext2_check_dir_entry
730 */
731static int
732ext2_dirbadentry(dp, de, entryoffsetinblock)
733	struct vnode *dp;
734	struct ext2_dir_entry_2 *de;
735	int entryoffsetinblock;
736{
737	int	DIRBLKSIZ = VTOI(dp)->i_e2fs->s_blocksize;
738
739	char * error_msg = NULL;
740
741	if (de->rec_len < EXT2_DIR_REC_LEN(1))
742		error_msg = "rec_len is smaller than minimal";
743	else if (de->rec_len % 4 != 0)
744		error_msg = "rec_len % 4 != 0";
745	else if (de->rec_len < EXT2_DIR_REC_LEN(de->name_len))
746		error_msg = "reclen is too small for name_len";
747	else if (entryoffsetinblock + de->rec_len > DIRBLKSIZ)
748		error_msg = "directory entry across blocks";
749	/* else LATER
750	     if (de->inode > dir->i_sb->u.ext2_sb.s_es->s_inodes_count)
751		error_msg = "inode out of bounds";
752	*/
753
754	if (error_msg != NULL) {
755		printf("bad directory entry: %s\n", error_msg);
756		printf("offset=%d, inode=%lu, rec_len=%u, name_len=%u\n",
757			entryoffsetinblock, (unsigned long)de->inode,
758			de->rec_len, de->name_len);
759	}
760	return error_msg == NULL ? 0 : 1;
761}
762
763/*
764 * Write a directory entry after a call to namei, using the parameters
765 * that it left in nameidata.  The argument ip is the inode which the new
766 * directory entry will refer to.  Dvp is a pointer to the directory to
767 * be written, which was left locked by namei. Remaining parameters
768 * (dp->i_offset, dp->i_count) indicate how the space for the new
769 * entry is to be obtained.
770 */
/*
 * ext2_direnter:
 *	ip	inode the new directory entry will refer to
 *	dvp	vnode of the directory that receives the entry
 *	cnp	component name; cn_nameptr / cn_namelen give the name,
 *		cn_cred / cn_thread are passed on to the I/O routines
 *
 * The placement of the entry is taken from the directory inode
 * (i_offset, i_count, i_endoff), which ext2_lookup() filled in.
 *
 * Returns 0 on success or the error from VOP_WRITE(), ext2_blkatoff(),
 * bwrite() or ext2_truncate().  Inconsistent placement information
 * leads to a panic.
 */
771int
772ext2_direnter(ip, dvp, cnp)
773	struct inode *ip;
774	struct vnode *dvp;
775	struct componentname *cnp;
776{
777	struct ext2_dir_entry_2 *ep, *nep;
778	struct inode *dp;
779	struct buf *bp;
780	struct ext2_dir_entry_2 newdir;
781	struct iovec aiov;
782	struct uio auio;
783	u_int dsize;
784	int error, loc, newentrysize, spacefree;
785	char *dirbuf;
786	int     DIRBLKSIZ = ip->i_e2fs->s_blocksize;
787
788
/* NOTE(review): "#if" here, whereas the top of the file tests "#ifdef DIAGNOSTIC". */
789#if DIAGNOSTIC
790	if ((cnp->cn_flags & SAVENAME) == 0)
791		panic("direnter: missing name");
792#endif
793	dp = VTOI(dvp);
	/* Build the new entry in newdir; its rec_len is filled in below. */
794	newdir.inode = ip->i_number;
795	newdir.name_len = cnp->cn_namelen;
796	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es,
797	    EXT2_FEATURE_INCOMPAT_FILETYPE))
798		newdir.file_type = DTTOFT(IFTODT(ip->i_mode));
799	else
800		newdir.file_type = EXT2_FT_UNKNOWN;
	/*
	 * cn_namelen + 1 bytes are copied, i.e. one byte beyond the name
	 * itself (presumably its terminating NUL -- confirm).
	 * NOTE(review): no other byte of newdir.name is initialised, yet
	 * newentrysize bytes of newdir are written out below; verify that
	 * newentrysize never exceeds the initialised part.
	 */
801	bcopy(cnp->cn_nameptr, newdir.name, (unsigned)cnp->cn_namelen + 1);
802	newentrysize = EXT2_DIR_REC_LEN(newdir.name_len);
803	if (dp->i_count == 0) {
804		/*
805		 * If dp->i_count is 0, then namei could find no
806		 * space in the directory. Here, dp->i_offset will
807		 * be on a directory block boundary and we will write the
808		 * new entry into a fresh block.
809		 */
810		if (dp->i_offset & (DIRBLKSIZ - 1))
811			panic("ext2_direnter: newblk");
812		auio.uio_offset = dp->i_offset;
		/*
		 * The entry claims the whole new block through rec_len,
		 * although only newentrysize bytes are written.
		 */
813		newdir.rec_len = DIRBLKSIZ;
814		auio.uio_resid = newentrysize;
815		aiov.iov_len = newentrysize;
816		aiov.iov_base = (caddr_t)&newdir;
817		auio.uio_iov = &aiov;
818		auio.uio_iovcnt = 1;
819		auio.uio_rw = UIO_WRITE;
820		auio.uio_segflg = UIO_SYSSPACE;
821		auio.uio_td = (struct thread *)0;
822		error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred);
		/*
		 * NOTE(review): this size check runs only after the write
		 * above has already been issued.
		 */
823		if (DIRBLKSIZ >
824		    VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
825			/* XXX should grow with balloc() */
826			panic("ext2_direnter: frag size");
827		else if (!error) {
828			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
829			dp->i_flag |= IN_CHANGE;
830		}
831		return (error);
832	}
833
834	/*
835	 * If dp->i_count is non-zero, then namei found space
836	 * for the new entry in the range dp->i_offset to
837	 * dp->i_offset + dp->i_count in the directory.
838	 * To use this space, we may have to compact the entries located
839	 * there, by copying them together towards the beginning of the
840	 * block, leaving the free space in one usable chunk at the end.
841	 */
842
843	/*
844	 * Increase size of directory if entry eats into new space.
845	 * This should never push the size past a new multiple of
846	 * DIRBLKSIZE.
847	 *
848	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
849	 */
850	if (dp->i_offset + dp->i_count > dp->i_size)
851		dp->i_size = dp->i_offset + dp->i_count;
852	/*
853	 * Get the block containing the space for the new directory entry.
854	 */
855	if ((error = ext2_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf,
856	    &bp)) != 0)
857		return (error);
858	/*
859	 * Find space for the new entry. In the simple case, the entry at
860	 * offset base will have the space. If it does not, then namei
861	 * arranged that compacting the region dp->i_offset to
862	 * dp->i_offset + dp->i_count would yield the
863	 * space.
864	 */
865	ep = (struct ext2_dir_entry_2 *)dirbuf;
866	dsize = EXT2_DIR_REC_LEN(ep->name_len);
867	spacefree = ep->rec_len - dsize;
	/*
	 * Compaction: loc is the offset of the next source entry (nep)
	 * relative to dirbuf, ep is the slot the next entry is packed
	 * into, dsize is the minimal size of the entry currently at ep
	 * and spacefree accumulates the slack collected so far.
	 */
868	for (loc = ep->rec_len; loc < dp->i_count; ) {
869		nep = (struct ext2_dir_entry_2 *)(dirbuf + loc);
870		if (ep->inode) {
871			/* trim the existing slot */
872			ep->rec_len = dsize;
873			ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize);
874		} else {
875			/* overwrite; nothing there; header is ours */
876			spacefree += dsize;
877		}
878		dsize = EXT2_DIR_REC_LEN(nep->name_len);
879		spacefree += nep->rec_len - dsize;
880		loc += nep->rec_len;
881		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
882	}
883	/*
884	 * Update the pointer fields in the previous entry (if any),
885	 * copy in the new entry, and write out the block.
886	 */
887	if (ep->inode == 0) {
888		if (spacefree + dsize < newentrysize)
889			panic("ext2_direnter: compact1");
890		newdir.rec_len = spacefree + dsize;
891	} else {
892		if (spacefree < newentrysize)
893			panic("ext2_direnter: compact2");
894		newdir.rec_len = spacefree;
895		ep->rec_len = dsize;
896		ep = (struct ext2_dir_entry_2 *)((char *)ep + dsize);
897	}
898	bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize);
899	error = bwrite(bp);
900	dp->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * i_endoff was set by ext2_lookup() to the block-rounded end of
	 * the used part of the directory; anything beyond it is cut off.
	 */
901	if (!error && dp->i_endoff && dp->i_endoff < dp->i_size)
902		error = ext2_truncate(dvp, (off_t)dp->i_endoff, IO_SYNC,
903		    cnp->cn_cred, cnp->cn_thread);
904	return (error);
905}
906
907/*
908 * Remove a directory entry after a call to namei, using
909 * the parameters which it left in nameidata. The entry
910 * dp->i_offset contains the offset into the directory of the
911 * entry to be eliminated.  The dp->i_count field contains the
912 * size of the previous record in the directory.  If this
913 * is 0, the first entry is being deleted, so we need only
914 * zero the inode number to mark the entry as free.  If the
915 * entry is not the first in the directory, we must reclaim
916 * the space of the now empty record by adding the record size
917 * to the size of the previous entry.
918 */
919int
920ext2_dirremove(dvp, cnp)
921	struct vnode *dvp;
922	struct componentname *cnp;
923{
924	struct inode *dp;
925	struct ext2_dir_entry_2 *ep;
926	struct buf *bp;
927	int error;
928
929	dp = VTOI(dvp);
930	if (dp->i_count == 0) {
931		/*
932		 * First entry in block: set d_ino to zero.
933		 */
934		if ((error =
935		    ext2_blkatoff(dvp, (off_t)dp->i_offset, (char **)&ep,
936		    &bp)) != 0)
937			return (error);
938		ep->inode = 0;
939		error = bwrite(bp);
940		dp->i_flag |= IN_CHANGE | IN_UPDATE;
941		return (error);
942	}
943	/*
944	 * Collapse new free space into previous entry.
945	 */
946	if ((error = ext2_blkatoff(dvp, (off_t)(dp->i_offset - dp->i_count),
947	    (char **)&ep, &bp)) != 0)
948		return (error);
949	ep->rec_len += dp->i_reclen;
950	error = bwrite(bp);
951	dp->i_flag |= IN_CHANGE | IN_UPDATE;
952	return (error);
953}
954
955/*
956 * Rewrite an existing directory entry to point at the inode
957 * supplied.  The parameters describing the directory entry are
958 * set up by a call to namei.
959 */
960int
961ext2_dirrewrite(dp, ip, cnp)
962	struct inode *dp, *ip;
963	struct componentname *cnp;
964{
965	struct buf *bp;
966	struct ext2_dir_entry_2 *ep;
967	struct vnode *vdp = ITOV(dp);
968	int error;
969
970	if ((error = ext2_blkatoff(vdp, (off_t)dp->i_offset, (char **)&ep,
971	    &bp)) != 0)
972		return (error);
973	ep->inode = ip->i_number;
974	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs->s_es,
975	    EXT2_FEATURE_INCOMPAT_FILETYPE))
976		ep->file_type = DTTOFT(IFTODT(ip->i_mode));
977	else
978		ep->file_type = EXT2_FT_UNKNOWN;
979	error = bwrite(bp);
980	dp->i_flag |= IN_CHANGE | IN_UPDATE;
981	return (error);
982}
983
984/*
985 * Check if a directory is empty or not.
986 * Inode supplied must be locked.
987 *
988 * Using a struct dirtemplate here is not precisely
989 * what we want, but better than using a struct direct.
990 *
991 * NB: does not handle corrupted directories.
992 */
993int
994ext2_dirempty(ip, parentino, cred)
995	struct inode *ip;
996	ino_t parentino;
997	struct ucred *cred;
998{
999	off_t off;
1000	struct dirtemplate dbuf;
1001	struct ext2_dir_entry_2 *dp = (struct ext2_dir_entry_2 *)&dbuf;
1002	int error, count, namlen;
1003#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1004
1005	for (off = 0; off < ip->i_size; off += dp->rec_len) {
1006		error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ,
1007		    off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred,
1008		    NOCRED, &count, (struct thread *)0);
1009		/*
1010		 * Since we read MINDIRSIZ, residual must
1011		 * be 0 unless we're at end of file.
1012		 */
1013		if (error || count != 0)
1014			return (0);
1015		/* avoid infinite loops */
1016		if (dp->rec_len == 0)
1017			return (0);
1018		/* skip empty entries */
1019		if (dp->inode == 0)
1020			continue;
1021		/* accept only "." and ".." */
1022		namlen = dp->name_len;
1023		if (namlen > 2)
1024			return (0);
1025		if (dp->name[0] != '.')
1026			return (0);
1027		/*
1028		 * At this point namlen must be 1 or 2.
1029		 * 1 implies ".", 2 implies ".." if second
1030		 * char is also "."
1031		 */
1032		if (namlen == 1)
1033			continue;
1034		if (dp->name[1] == '.' && dp->inode == parentino)
1035			continue;
1036		return (0);
1037	}
1038	return (1);
1039}
1040
1041/*
1042 * Check if source directory is in the path of the target directory.
1043 * Target is supplied locked, source is unlocked.
1044 * The target is always vput before returning.
1045 */
1046int
1047ext2_checkpath(source, target, cred)
1048	struct inode *source, *target;
1049	struct ucred *cred;
1050{
1051	struct vnode *vp;
1052	int error, rootino, namlen;
1053	struct dirtemplate dirbuf;
1054
1055	vp = ITOV(target);
1056	if (target->i_number == source->i_number) {
1057		error = EEXIST;
1058		goto out;
1059	}
1060	rootino = ROOTINO;
1061	error = 0;
1062	if (target->i_number == rootino)
1063		goto out;
1064
1065	for (;;) {
1066		if (vp->v_type != VDIR) {
1067			error = ENOTDIR;
1068			break;
1069		}
1070		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf,
1071			sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
1072			IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0,
1073			(struct thread *)0);
1074		if (error != 0)
1075			break;
1076		namlen = dirbuf.dotdot_type;	/* like ufs little-endian */
1077		if (namlen != 2 ||
1078		    dirbuf.dotdot_name[0] != '.' ||
1079		    dirbuf.dotdot_name[1] != '.') {
1080			error = ENOTDIR;
1081			break;
1082		}
1083		if (dirbuf.dotdot_ino == source->i_number) {
1084			error = EINVAL;
1085			break;
1086		}
1087		if (dirbuf.dotdot_ino == rootino)
1088			break;
1089		vput(vp);
1090		if ((error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino,
1091		    LK_EXCLUSIVE, &vp)) != 0) {
1092			vp = NULL;
1093			break;
1094		}
1095	}
1096
1097out:
1098	if (error == ENOTDIR)
1099		printf("checkpath: .. not a directory\n");
1100	if (vp != NULL)
1101		vput(vp);
1102	return (error);
1103}
1104