1/*
2 * ntfs_dir.c - NTFS kernel directory operations.
3 *
4 * Copyright (c) 2006-2010 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2010 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/buf.h>
39#include <sys/param.h>
40#include <sys/dirent.h>
41#include <sys/errno.h>
42#include <sys/mount.h>
43#include <sys/stat.h>
44#include <sys/ucred.h>
45#include <sys/uio.h>
46#include <sys/vnode.h>
47
48#include <string.h>
49
50#include <libkern/OSAtomic.h>
51#include <libkern/OSMalloc.h>
52
53#include <kern/debug.h>
54#include <kern/locks.h>
55
56#include "ntfs.h"
57#include "ntfs_attr.h"
58#include "ntfs_debug.h"
59#include "ntfs_dir.h"
60#include "ntfs_endian.h"
61#include "ntfs_index.h"
62#include "ntfs_inode.h"
63#include "ntfs_layout.h"
64#include "ntfs_mft.h"
65#include "ntfs_page.h"
66#include "ntfs_time.h"
67#include "ntfs_types.h"
68#include "ntfs_unistr.h"
69#include "ntfs_volume.h"
70
71/**
72 * The little endian Unicode string $I30 as a global constant.
73 */
74ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
75		const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
76
77/**
78 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
79 * @dir_ni:	ntfs inode of the directory in which to search for the name
80 * @uname:	Unicode name for which to search in the directory
81 * @uname_len:	length of the name @uname in Unicode characters
82 * @res_mref:	return the mft reference of the inode of the found name
83 * @res_name:	return the found filename if necessary (see below)
84 *
85 * Look for an inode with name @uname of length @uname_len Unicode characters
86 * in the directory with inode @dir_ni.  This is done by walking the contents
87 * of the directory B+tree looking for the Unicode name.
88 *
89 * If the name is found in the directory, 0 is returned and the corresponding
90 * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it is
91 * a 64-bit number containing the sequence number, in *@res_mref.
92 *
93 * On error, the error code is returned.  In particular if the inode is not
94 * found ENOENT is returned which is not an error as such.
95 *
96 * Note, @uname_len does not include the (optional) terminating NUL character.
97 *
98 * Note, we look for a case sensitive match first but we also look for a case
99 * insensitive match at the same time.  If we find a case insensitive match, we
100 * save that for the case that we do not find an exact match, where we return
101 * the case insensitive match and setup *@res_name (which we allocate) with the
102 * mft reference, the filename type, length and with a copy of the little
103 * endian Unicode filename itself.  If we match a filename which is in the DOS
104 * namespace, we only return the mft reference and filename type in *@res_name.
105 * ntfs_vnop_lookup() then uses this to find the long filename in the inode
106 * itself.  This is so it can use the name cache effectively.
107 *
108 * Locking: Caller must hold @dir_ni->lock.
109 *
110 * TODO: From Mark's review comments: pull the iteration code into a separate
111 * function and call it both for the index root and index allocation iteration.
112 * See the ntfs_index_lookup() function in ntfs_index.c...
113 */
114errno_t ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
115		const signed uname_len, MFT_REF *res_mref,
116		ntfs_dir_lookup_name **res_name)
117{
118	VCN vcn, old_vcn;
119	ntfs_volume *vol = dir_ni->vol;
120	mount_t mp = vol->mp;
121	ntfs_inode *ia_ni;
122	vnode_t ia_vn = NULL;
123	MFT_RECORD *m;
124	INDEX_ROOT *ir;
125	INDEX_ENTRY *ie;
126	ntfs_dir_lookup_name *name = NULL;
127	upl_t upl;
128	upl_page_info_array_t pl;
129	u8 *kaddr;
130	INDEX_ALLOCATION *ia;
131	u8 *index_end;
132	ntfs_attr_search_ctx *ctx;
133	int rc;
134	errno_t err;
135
136	if (!S_ISDIR(dir_ni->mode))
137		panic("%s(): !S_ISDIR(dir_ni->mode\n", __FUNCTION__);
138	if (NInoAttr(dir_ni))
139		panic("%s(): NInoAttr(dir_ni)\n", __FUNCTION__);
140	/* Get the index allocation inode. */
141	err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE, &ia_ni);
142	if (err) {
143		ntfs_error(mp, "Failed to get index vnode (error %d).", err);
144		return err;
145	}
146	ia_vn = ia_ni->vn;
147	lck_rw_lock_shared(&ia_ni->lock);
148	/* Get hold of the mft record for the directory. */
149	err = ntfs_mft_record_map(dir_ni, &m);
150	if (err) {
151		ntfs_error(mp, "Failed to map mft record for directory (error "
152				"%d).", err);
153		goto err;
154	}
155	ctx = ntfs_attr_search_ctx_get(dir_ni, m);
156	if (!ctx) {
157		ntfs_error(mp, "Failed to get attribute search context.");
158		err = ENOMEM;
159		goto unm_err;
160	}
161	/* Find the index root attribute in the mft record. */
162	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, 0, NULL, 0, ctx);
163	if (err) {
164		if (err == ENOENT) {
165			ntfs_error(mp, "Index root attribute missing in "
166					"directory inode 0x%llx.",
167					(unsigned long long)dir_ni->mft_no);
168			err = EIO;
169		}
170		goto put_err;
171	}
172	/*
173	 * Get to the index root value (it has been verified in
174	 * ntfs_index_inode_read()).
175	 */
176	ir = (INDEX_ROOT*)((u8*)ctx->a + le16_to_cpu(ctx->a->value_offset));
177	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
178	/* The first index entry. */
179	ie = (INDEX_ENTRY*)((u8*)&ir->index +
180			le32_to_cpu(ir->index.entries_offset));
181	/*
182	 * Loop until we exceed valid memory (corruption case) or until we
183	 * reach the last entry.
184	 */
185	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
186		ntfs_debug("In index root, offset 0x%x.",
187				(unsigned)((u8*)ie - (u8*)ir));
188		/* Bounds checks. */
189		if ((u8*)ie < (u8*)&ir->index || (u8*)ie +
190				sizeof(INDEX_ENTRY_HEADER) > index_end ||
191				(u8*)ie + le16_to_cpu(ie->key_length) >
192				index_end)
193			goto dir_err;
194		/*
195		 * The last entry cannot contain a name.  It can however
196		 * contain a pointer to a child node in the B+tree so we just
197		 * break out.
198		 */
199		if (ie->flags & INDEX_ENTRY_END)
200			break;
201		/*
202		 * We perform a case sensitive comparison and if that matches
203		 * we are done and return the mft reference of the inode (i.e.
204		 * the inode number together with the sequence number for
205		 * consistency checking).  We convert it to cpu format before
206		 * returning.
207		 */
208		if (ntfs_are_names_equal(uname, uname_len,
209				(ntfschar*)&ie->key.filename.filename,
210				ie->key.filename.filename_length, TRUE,
211				vol->upcase, vol->upcase_len)) {
212found_it:
213			/*
214			 * We have a perfect match, so we do not need to care
215			 * about having matched imperfectly before, so we can
216			 * free name and set *res_name to NULL.
217			 *
218			 * However, if the perfect match is a short filename,
219			 * we need to signal this through *res_name, so that
220			 * the caller can deal with the name cache effectively.
221			 *
222			 * As an optimization we just reuse an existing
223			 * allocation of *res_name.
224			 */
225			if (ie->key.filename.filename_type == FILENAME_DOS) {
226				u8 len;
227
228				if (!name) {
229					*res_name = name = OSMalloc(
230							sizeof(*name),
231							ntfs_malloc_tag);
232					if (!name) {
233						err = ENOMEM;
234						goto put_err;
235					}
236				}
237				name->mref = le64_to_cpu(ie->indexed_file);
238				name->type = FILENAME_DOS;
239				name->len = len = ie->key.filename.
240						filename_length;
241				memcpy(name->name, ie->key.filename.filename,
242						len * sizeof(ntfschar));
243			} else {
244				if (name)
245					OSFree(name, sizeof(*name),
246							ntfs_malloc_tag);
247				*res_name = NULL;
248			}
249			*res_mref = le64_to_cpu(ie->indexed_file);
250			ntfs_attr_search_ctx_put(ctx);
251			ntfs_mft_record_unmap(dir_ni);
252			lck_rw_unlock_shared(&ia_ni->lock);
253			(void)vnode_put(ia_vn);
254			return 0;
255		}
256		/*
257		 * For a case insensitive mount, we also perform a case
258		 * insensitive comparison.  If the comparison matches, we cache
259		 * the filename in *res_name so that the caller can work on it.
260		 */
261		if (!NVolCaseSensitive(vol) &&
262				ntfs_are_names_equal(uname, uname_len,
263				(ntfschar*)&ie->key.filename.filename,
264				ie->key.filename.filename_length, FALSE,
265				vol->upcase, vol->upcase_len)) {
266			u8 type;
267
268			/*
269			 * If no name is cached yet, cache it or if the current
270			 * name is the WIN32 name, replace the already cached
271			 * name with the WIN32 name.  Otherwise continue
272			 * caching the first match.
273			 */
274			type = ie->key.filename.filename_type;
275			if (!name || type == FILENAME_WIN32 || type ==
276					FILENAME_WIN32_AND_DOS) {
277				u8 len;
278
279				if (!name) {
280					*res_name = name = OSMalloc(
281							sizeof(*name),
282							ntfs_malloc_tag);
283					if (!name) {
284						err = ENOMEM;
285						goto put_err;
286					}
287				}
288				name->mref = le64_to_cpu(ie->indexed_file);
289				name->type = type;
290				name->len = len = ie->key.filename.
291						filename_length;
292				memcpy(name->name, ie->key.filename.filename,
293						len * sizeof(ntfschar));
294			}
295		}
296		/*
297		 * Not a perfect match, need to do full blown collation so we
298		 * know which way in the B+tree we have to go.
299		 */
300		rc = ntfs_collate_names(uname, uname_len,
301				(ntfschar*)&ie->key.filename.filename,
302				ie->key.filename.filename_length, 1, FALSE,
303				vol->upcase, vol->upcase_len);
304		/*
305		 * If uname collates before the name of the current entry,
306		 * there is definitely no such name in this index but we might
307		 * need to descend into the B+tree so we just break out of the
308		 * loop.
309		 */
310		if (rc == -1)
311			break;
312		/* The names are not equal, continue the search. */
313		if (rc)
314			continue;
315		/*
316		 * Names match with case insensitive comparison, now try the
317		 * case sensitive comparison, which is required for proper
318		 * collation.
319		 */
320		rc = ntfs_collate_names(uname, uname_len,
321				(ntfschar*)&ie->key.filename.filename,
322				ie->key.filename.filename_length, 1, TRUE,
323				vol->upcase, vol->upcase_len);
324		if (rc == -1)
325			break;
326		if (rc)
327			continue;
328		/*
329		 * Perfect match, this will never happen as the
330		 * ntfs_are_names_equal() call will have gotten a match but we
331		 * still treat it correctly.
332		 */
333		goto found_it;
334	}
335	/*
336	 * We have finished with this index without success.  Check for the
337	 * presence of a child node and if not present return ENOENT, unless we
338	 * have got a matching name cached in @name in which case return the
339	 * mft reference associated with it.
340	 */
341	if (!(ie->flags & INDEX_ENTRY_NODE)) {
342		ntfs_attr_search_ctx_put(ctx);
343		ntfs_mft_record_unmap(dir_ni);
344		goto not_found;
345	} /* Child node present, descend into it. */
346	/* Consistency check: Verify that an index allocation exists. */
347	if (!NInoIndexAllocPresent(ia_ni)) {
348		ntfs_error(mp, "No index allocation attribute but index entry "
349				"requires one.  Directory inode 0x%llx is "
350				"corrupt or driver bug.",
351				(unsigned long long)dir_ni->mft_no);
352		NVolSetErrors(vol);
353		goto put_err;
354	}
355	/* Get the starting vcn of the index block holding the child node. */
356	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
357	/*
358	 * We are done with the index root and the mft record.  Release them,
359	 * otherwise we deadlock with ntfs_page_map().
360	 */
361	ntfs_attr_search_ctx_put(ctx);
362	ntfs_mft_record_unmap(dir_ni);
363	m = NULL;
364	ctx = NULL;
365descend_into_child_node:
366	/*
367	 * Convert vcn to byte offset in the index allocation attribute and map
368	 * the corresponding page.
369	 */
370	err = ntfs_page_map(ia_ni, (vcn << ia_ni->vcn_size_shift) &
371			~PAGE_MASK_64, &upl, &pl, &kaddr, FALSE);
372	if (err) {
373		ntfs_error(mp, "Failed to map directory index page (error "
374				"%d).", err);
375		goto err;
376	}
377fast_descend_into_child_node:
378	/* Get to the index allocation block. */
379	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << ia_ni->vcn_size_shift) &
380			PAGE_MASK));
381	/* Bounds checks. */
382	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
383		ntfs_error(mp, "Out of bounds check failed.  Corrupt "
384				"directory inode 0x%llx or driver bug.",
385				(unsigned long long)dir_ni->mft_no);
386		goto page_err;
387	}
388	/* Catch multi sector transfer fixup errors. */
389	if (!ntfs_is_indx_record(ia->magic)) {
390		ntfs_error(mp, "Directory index record with VCN 0x%llx is "
391				"corrupt.  Corrupt inode 0x%llx.  Run chkdsk.",
392				(unsigned long long)vcn,
393				(unsigned long long)dir_ni->mft_no);
394		goto page_err;
395	}
396	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
397		ntfs_error(mp, "Actual VCN (0x%llx) of index buffer is "
398				"different from expected VCN (0x%llx).  "
399				"Directory inode 0x%llx is corrupt or driver "
400				"bug.", (unsigned long long)
401				sle64_to_cpu(ia->index_block_vcn),
402				(unsigned long long)vcn,
403				(unsigned long long)dir_ni->mft_no);
404		goto page_err;
405	}
406	if (offsetof(INDEX_BLOCK, index) +
407			le32_to_cpu(ia->index.allocated_size) !=
408			ia_ni->block_size) {
409		ntfs_error(mp, "Index buffer (VCN 0x%llx) of directory inode "
410				"0x%llx has a size (%u) differing from the "
411				"directory specified size (%u).  Directory "
412				"inode is corrupt or driver bug.",
413				(unsigned long long)vcn,
414				(unsigned long long)dir_ni->mft_no, (unsigned)
415				(offsetof(INDEX_BLOCK, index) +
416				le32_to_cpu(ia->index.allocated_size)),
417				(unsigned)ia_ni->block_size);
418		goto page_err;
419	}
420	index_end = (u8*)ia + ia_ni->block_size;
421	if (index_end > kaddr + PAGE_SIZE) {
422		ntfs_error(mp, "Index buffer (VCN 0x%llx) of directory inode "
423				"0x%llx crosses page boundary.  Impossible! "
424				"Cannot access!  This is probably a bug in "
425				"the driver.", (unsigned long long)vcn,
426				(unsigned long long)dir_ni->mft_no);
427		goto page_err;
428	}
429	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
430	if (index_end > (u8*)ia + ia_ni->block_size) {
431		ntfs_error(mp, "Size of index buffer (VCN 0x%llx) of directory "
432				"inode 0x%llx exceeds maximum size.",
433				(unsigned long long)vcn,
434				(unsigned long long)dir_ni->mft_no);
435		goto page_err;
436	}
437	/* The first index entry. */
438	ie = (INDEX_ENTRY*)((u8*)&ia->index +
439			le32_to_cpu(ia->index.entries_offset));
440	/*
441	 * Iterate similar to above big loop but applied to index buffer, thus
442	 * loop until we exceed valid memory (corruption case) or until we
443	 * reach the last entry.
444	 */
445	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
446		/* Bounds check. */
447		if ((u8*)ie < (u8*)&ia->index || (u8*)ie +
448				sizeof(INDEX_ENTRY_HEADER) > index_end ||
449				(u8*)ie + le16_to_cpu(ie->key_length) >
450				index_end) {
451			ntfs_error(mp, "Index entry out of bounds in "
452					"directory inode 0x%llx.",
453					(unsigned long long)dir_ni->mft_no);
454			goto page_err;
455		}
456		/*
457		 * The last entry cannot contain a name.  It can however
458		 * contain a pointer to a child node in the B+tree so we just
459		 * break out.
460		 */
461		if (ie->flags & INDEX_ENTRY_END)
462			break;
463		/*
464		 * We perform a case sensitive comparison and if that matches
465		 * we are done and return the mft reference of the inode (i.e.
466		 * the inode number together with the sequence number for
467		 * consistency checking).  We convert it to cpu format before
468		 * returning.
469		 */
470		if (ntfs_are_names_equal(uname, uname_len,
471				(ntfschar*)&ie->key.filename.filename,
472				ie->key.filename.filename_length, TRUE,
473				vol->upcase, vol->upcase_len)) {
474found_it2:
475			/*
476			 * We have a perfect match, so we do not need to care
477			 * about having matched imperfectly before, so we can
478			 * free name and set *res_name to NULL.
479			 *
480			 * However, if the perfect match is a short filename,
481			 * we need to signal this through *res_name, so that
482			 * the caller can deal with the name cache effectively.
483			 *
484			 * As an optimization we just reuse an existing
485			 * allocation of *res_name.
486			 */
487			if (ie->key.filename.filename_type == FILENAME_DOS) {
488				u8 len;
489
490				if (!name) {
491					*res_name = name = OSMalloc(
492							sizeof(*name),
493							ntfs_malloc_tag);
494					if (!name) {
495						err = ENOMEM;
496						goto page_err;
497					}
498				}
499				name->mref = le64_to_cpu(ie->indexed_file);
500				name->type = FILENAME_DOS;
501				name->len = len = ie->key.filename.
502						filename_length;
503				memcpy(name->name, ie->key.filename.filename,
504						len * sizeof(ntfschar));
505			} else {
506				if (name)
507					OSFree(name, sizeof(*name),
508							ntfs_malloc_tag);
509				*res_name = NULL;
510			}
511			*res_mref = le64_to_cpu(ie->indexed_file);
512			ntfs_page_unmap(ia_ni, upl, pl, FALSE);
513			lck_rw_unlock_shared(&ia_ni->lock);
514			(void)vnode_put(ia_vn);
515			return 0;
516		}
517		/*
518		 * For a case insensitive mount, we also perform a case
519		 * insensitive comparison.  If the comparison matches, we cache
520		 * the filename in *res_name so that the caller can work on it.
521		 * If the comparison matches, and the name is in the DOS
522		 * namespace, we only cache the mft reference and the filename
523		 * type (we set the name length to zero for simplicity).
524		 */
525		if (!NVolCaseSensitive(vol) &&
526				ntfs_are_names_equal(uname, uname_len,
527				(ntfschar*)&ie->key.filename.filename,
528				ie->key.filename.filename_length, FALSE,
529				vol->upcase, vol->upcase_len)) {
530			u8 type;
531
532			/*
533			 * If no name is cached yet, cache it or if the current
534			 * name is the WIN32 name, replace the already cached
535			 * name with the WIN32 name.  Otherwise continue
536			 * caching the first match.
537			 */
538			type = ie->key.filename.filename_type;
539			if (!name || type == FILENAME_WIN32 || type ==
540					FILENAME_WIN32_AND_DOS) {
541				u8 len;
542
543				if (!name) {
544					*res_name = name = OSMalloc(
545							sizeof(*name),
546							ntfs_malloc_tag);
547					if (!name) {
548						err = ENOMEM;
549						goto page_err;
550					}
551				}
552				name->mref = le64_to_cpu(ie->indexed_file);
553				name->type = type;
554				name->len = len = ie->key.filename.
555						filename_length;
556				memcpy(name->name, ie->key.filename.filename,
557						len * sizeof(ntfschar));
558			}
559		}
560		/*
561		 * Not a perfect match, need to do full blown collation so we
562		 * know which way in the B+tree we have to go.
563		 */
564		rc = ntfs_collate_names(uname, uname_len,
565				(ntfschar*)&ie->key.filename.filename,
566				ie->key.filename.filename_length, 1, FALSE,
567				vol->upcase, vol->upcase_len);
568		/*
569		 * If uname collates before the name of the current entry,
570		 * there is definitely no such name in this index but we might
571		 * need to descend into the B+tree so we just break out of the
572		 * loop.
573		 */
574		if (rc == -1)
575			break;
576		/* The names are not equal, continue the search. */
577		if (rc)
578			continue;
579		/*
580		 * Names match with case insensitive comparison, now try the
581		 * case sensitive comparison, which is required for proper
582		 * collation.
583		 */
584		rc = ntfs_collate_names(uname, uname_len,
585				(ntfschar*)&ie->key.filename.filename,
586				ie->key.filename.filename_length, 1, TRUE,
587				vol->upcase, vol->upcase_len);
588		if (rc == -1)
589			break;
590		if (rc)
591			continue;
592		/*
593		 * Perfect match, this will never happen as the
594		 * ntfs_are_names_equal() call will have gotten a match but we
595		 * still treat it correctly.
596		 */
597		goto found_it2;
598	}
599	/*
600	 * We have finished with this index buffer without success.  Check for
601	 * the presence of a child node.
602	 */
603	if (ie->flags & INDEX_ENTRY_NODE) {
604		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
605			ntfs_error(mp, "Index entry with child node found in "
606					"a leaf node in directory inode "
607					"0x%llx.",
608					(unsigned long long)dir_ni->mft_no);
609			goto page_err;
610		}
611		/* Child node present, descend into it. */
612		old_vcn = vcn;
613		vcn = sle64_to_cpup((sle64*)((u8*)ie +
614				le16_to_cpu(ie->length) - 8));
615		if (vcn >= 0) {
616			/*
617			 * If @vcn is in the same page cache page as @old_vcn
618			 * we recycle the mapped page.
619			 */
620			if (old_vcn << ia_ni->vcn_size_shift >> PAGE_SHIFT ==
621					vcn << ia_ni->vcn_size_shift >>
622					PAGE_SHIFT)
623				goto fast_descend_into_child_node;
624			ntfs_page_unmap(ia_ni, upl, pl, FALSE);
625			goto descend_into_child_node;
626		}
627		ntfs_error(mp, "Negative child node vcn in directory inode "
628				"0x%llx.", (unsigned long long)dir_ni->mft_no);
629		goto page_err;
630	}
631	/*
632	 * No child node present, return ENOENT, unless we have got a matching
633	 * name cached in @name in which case return the mft reference
634	 * associated with it.
635	 */
636	ntfs_page_unmap(ia_ni, upl, pl, FALSE);
637not_found:
638	lck_rw_unlock_shared(&ia_ni->lock);
639	(void)vnode_put(ia_vn);
640	if (name) {
641		*res_mref = name->mref;
642		return 0;
643	}
644	ntfs_debug("Entry not found.");
645	return ENOENT;
646page_err:
647	ntfs_page_unmap(ia_ni, upl, pl, FALSE);
648	goto err;
649dir_err:
650	ntfs_error(mp, "Corrupt directory inode 0x%llx.  Run chkdsk.",
651			(unsigned long long)dir_ni->mft_no);
652put_err:
653	ntfs_attr_search_ctx_put(ctx);
654unm_err:
655	ntfs_mft_record_unmap(dir_ni);
656err:
657	if (name)
658		OSFree(name, sizeof(*name), ntfs_malloc_tag);
659	lck_rw_unlock_shared(&ia_ni->lock);
660	(void)vnode_put(ia_vn);
661	if (!err)
662		err = EIO;
663	ntfs_debug("Failed (error %d).", err);
664	return err;
665}
666
667/**
668 * ntfs_do_dirent - generate a dirent structure and copy it to the destination
669 * @vol:	ntfs volume the index entry belongs to
670 * @ie:		index entry to return after conversion to a dirent structure
671 * @de:		buffer to generate the dirent structure in
672 * @uio:	destination in which to return the generated dirent structure
673 * @entries:	[IN/OUT] pointer to number of entries that have been returned
674 *
675 * This is a helper function for ntfs_readdir().
676 *
677 * First, check if we want to return this index entry @ie and if not return 0.
678 *
679 * Assuming we want to return this index entry, convert the NTFS specific
680 * directory index entry @ie into the file system independent dirent structure
681 * and store it in the supplied buffer @de.
682 *
683 * If there is not enough space in the destination @uio to return the converted
684 * dirent structure @de, return -1.
685 *
686 * Return the converted dirent structure @de in the destination @uio and return
687 * 0 on success or errno on error.
688 *
689 * If we successfully returned an entry *@entries is incremented.
690 */
691static inline int ntfs_do_dirent(ntfs_volume *vol, INDEX_ENTRY *ie,
692		struct dirent *de, uio_t uio, int *entries)
693{
694	ino64_t mref;
695	u8 *utf8_name;
696	size_t utf8_size;
697	signed res_size, padding;
698	int err;
699	FILENAME_TYPE_FLAGS name_type;
700#ifdef DEBUG
701	static const char *dts[15] = { "UNKNOWN", "FIFO", "CHR", "UNKNOWN",
702			"DIR", "UNKNOWN", "BLK", "UNKNOWN", "REG", "UNKNOWN",
703			"LNK", "UNKNOWN", "SOCK", "UNKNOWN", "WHT" };
704#endif
705
706	name_type = ie->key.filename.filename_type;
707	if (name_type == FILENAME_DOS) {
708		ntfs_debug("Skipping DOS namespace entry.");
709		return 0;
710	}
711	mref = MREF_LE(ie->indexed_file);
712	/*
713	 * Remove all NTFS core system files from the name space so we do not
714	 * need to worry about users damaging a volume by writing to them or
715	 * deleting/renaming them and so that we can return fsRtParID (1) as
716	 * the inode number of the parent of the volume root directory and
717	 * fsRtDirID (2) as the inode number of the volume root directory which
718	 * are both expected by Carbon and various applications.
719	 */
720	if (mref < FILE_first_user) {
721		ntfs_debug("Removing core NTFS system file (mft_no 0x%x) from "
722				"name space.", (unsigned)mref);
723		return 0;
724	}
725	if (sizeof(de->d_ino) < 8 && mref & 0xffffffff00000000ULL) {
726		ntfs_warning(vol->mp, "Skipping dirent because its inode "
727				"number 0x%llx does not fit in 32-bits.",
728				(unsigned long long)mref);
729		return 0;
730	}
731	utf8_name = (u8*)de->d_name;
732	utf8_size = sizeof(de->d_name);
733	res_size = ntfs_to_utf8(vol, (ntfschar*)&ie->key.filename.filename,
734			ie->key.filename.filename_length << NTFSCHAR_SIZE_SHIFT,
735			&utf8_name, &utf8_size);
736	if (res_size <= 0) {
737		ntfs_warning(vol->mp, "Skipping unrepresentable inode 0x%llx "
738				"(error %d).", (unsigned long long)mref,
739				-res_size);
740		return 0;
741	}
742	/*
743	 * The name is now in @de->d_name.  Set up the remainder of the dirent
744	 * structure.
745	 */
746	de->d_ino = mref;
747	/*
748	 * If a filename index is present it must be a directory.  Otherwise it
749	 * could be a file or a symbolic link (or something else but we do not
750	 * support anything else yet).
751	 */
752	if (ie->key.filename.file_attributes &
753			FILE_ATTR_DUP_FILENAME_INDEX_PRESENT)
754		de->d_type = DT_DIR;
755	else {
756		/*
757		 * If the file size is less than or equal to MAXPATHLEN it
758		 * could be a symbolic link so return DT_UNKNOWN as it would be
759		 * too expensive to get the inode to check what it is exactly.
760		 *
761		 * Also, system files need to be returned as DT_UNKNOWN as they
762		 * could be fifos, sockets, or block or character device
763		 * special files.  Note that the size check will actually catch
764		 * all relevant system files so we do not need to check for
765		 * them specifically here.
766		 */
767		if (ie->key.filename.data_size > MAXPATHLEN)
768			de->d_type = DT_REG;
769		else
770			de->d_type = DT_UNKNOWN;
771	}
772	/*
773	 * Note @de->d_namlen is only 8-bit thus @res_size may not be above
774	 * 255.  This is not a problem since sizeof(de->d_name) is 256 which
775	 * includes the terminating NUL byte thus ntfs_to_utf8() would have
776	 * aborted if the name translated to something longer than 255 bytes.
777	 *
778	 * As a little BUG check test it anyway...
779	 */
780	if (res_size > 0xff)
781		panic("%s(): res_size (0x%x) does not fit in 8 bits.  This is "
782				"a bug!", __FUNCTION__, res_size);
783	de->d_namlen = res_size;
784	/* Add the NUL terminator byte to the name length. */
785	res_size += offsetof(struct dirent, d_name) + 1;
786	de->d_reclen = (u16)(res_size + 3) & (u16)~3;
787	padding = de->d_reclen - res_size;
788	if (padding)
789		bzero((u8*)de + res_size, padding);
790	/*
791	 * If the remaining buffer space is not big enough to store the dirent
792	 * structure, return -1 to indicate that fact.
793	 */
794	if (uio_resid(uio) < de->d_reclen)
795		return -1;
796	ntfs_debug("Returning dirent with d_ino 0x%llx, d_reclen 0x%x, d_type "
797			"DT_%s, d_namlen %d, d_name \"%s\".",
798			(unsigned long long)mref, (unsigned)de->d_reclen,
799			de->d_type < 15 ? dts[de->d_type] : dts[0],
800			(unsigned)de->d_namlen, de->d_name);
801	/*
802	 * Copy the dirent structure to the result buffer.  uiomove() returns
803	 * zero to indicate success and the (positive) error code on error so
804	 * it can be clearly distinguished from us returning -1 to indicate
805	 * that the buffer does not have enough space remaining.
806	 */
807	err = uiomove((caddr_t)de, de->d_reclen, uio);
808	if (!err) {
809		/* We have successfully returned another dirent structure. */
810		(*entries)++;
811	}
812	return err;
813}
814
815/**
816 * ntfs_dirhint_get - get a directory hint
817 * @ni:		ntfs index inode of directory index for which to get a hint
818 * @ofs:	offset (containing tag and B+tree position) for hint to get
819 *
820 * Search through the list of hints attached to the ntfs directory index inode
821 * @ni for a directory hint with an offset @ofs.  If found return that hint and
822 * if not found either allocate a new directory hint or recycle an the oldest
823 * directory hint in the list and set it up ready to use.
824 *
825 * Return the directory hint or NULL if allocating a new hint failed and no
826 * hints are present in the list so a hint could not be recycled either.
827 *
828 * The caller can tell if the hint matched by the fact that if it matched it
829 * will have a filename attached to it, i.e. ->fn_size and thus also ->fn will
830 * be non-zero and non-NULL, respectively.
831 *
832 * Locking: Caller must hold @ni->lock for writing.
833 */
834static ntfs_dirhint *ntfs_dirhint_get(ntfs_inode *ni, unsigned ofs)
835{
836	ntfs_dirhint *dh;
837	BOOL need_init, need_remove;
838	struct timeval tv;
839
840	microuptime(&tv);
841	/*
842	 * Look for an existing hint first.  If not found, create a new one
843	 * (when the list is not full) or recycle the oldest hint.  Since new
844	 * hints are always added to the head of the list, the last hint is
845	 * always the oldest.
846	 */
847	dh = NULL;
848	if (ofs & ~NTFS_DIR_POS_MASK) {
849		TAILQ_FOREACH(dh, &ni->dirhint_list, link) {
850			if (dh->ofs == ofs)
851				break;
852		}
853	}
854	/* Assume we found a directory hint in which case it is initialized. */
855	need_init = FALSE;
856	need_remove = TRUE;
857	if (!dh) {
858		/* No directory hint matched. */
859		need_init = TRUE;
860		if (ni->nr_dirhints < NTFS_MAX_DIRHINTS) {
861			/*
862			 * Allocate a new directory hint.  If the allocation
863			 * fails try to recycle an existing directory hint.
864			 */
865			dh = OSMalloc(sizeof(*dh), ntfs_malloc_tag);
866			if (dh) {
867				ni->nr_dirhints++;
868				need_remove = FALSE;
869			}
870		}
871		if (!dh) {
872			/* Recycle the last, i.e. oldest, directory hint. */
873			dh = TAILQ_LAST(&ni->dirhint_list, ntfs_dirhint_head);
874			if (dh && dh->fn_size)
875				OSFree(dh->fn, dh->fn_size, ntfs_malloc_tag);
876		}
877	}
878	/*
879	 * If we managed to get a hint, move it to (or place it at if we
880	 * allocated it above) the head of the list of dircetory hints of the
881	 * index inode.
882	 */
883	if (dh) {
884		if (need_remove)
885			TAILQ_REMOVE(&ni->dirhint_list, dh, link);
886		TAILQ_INSERT_HEAD(&ni->dirhint_list, dh, link);
887		/*
888		 * Set up the hint if it is a new hint or we recycled an old
889		 * hint.
890		 */
891		if (need_init) {
892			dh->ofs = ofs;
893			dh->fn_size = 0;
894		}
895		dh->time = tv.tv_sec;
896	}
897	return dh;
898}
899
900/**
901 * ntfs_dirhint_put - put a directory hint
902 * @ni:		ntfs index inode to which the directory hint belongs
903 * @dh:		directory hint to free
904 *
905 * Detach the directory hint @dh from the ntfs directory index inode @ni and
906 * free it and all its resources.
907 *
908 * Locking: Caller must hold @ni->lock for writing.
909 */
910static void ntfs_dirhint_put(ntfs_inode *ni, ntfs_dirhint *dh)
911{
912	TAILQ_REMOVE(&ni->dirhint_list, dh, link);
913	ni->nr_dirhints--;
914	if (dh->fn_size)
915		OSFree(dh->fn, dh->fn_size, ntfs_malloc_tag);
916	OSFree(dh, sizeof(*dh), ntfs_malloc_tag);
917}
918
919/**
920 * ntfs_dirhints_put - put all directory hints
921 * @ni:		ntfs index inode whose directory hints to release
922 * @stale_only:	if true only release expired directory hints
923 *
924 * If @stale_only is false release all directory hints from the ntfs directory
925 * index inode @ni freeing them and all their resources.
926 *
927 * If @stale_only is true do the same as above but only release expired hints.
928 *
929 * Note we iterate from the oldest to the newest so we can stop when we reach
930 * the first valid hint if @stale_only is true.
931 *
932 * Locking: Caller must hold @ni->lock for writing.
933 */
934void ntfs_dirhints_put(ntfs_inode *ni, BOOL stale_only)
935{
936	ntfs_dirhint *dh, *tdh;
937	struct timeval tv;
938
939	if (stale_only)
940		microuptime(&tv);
941	TAILQ_FOREACH_REVERSE_SAFE(dh, &ni->dirhint_list, ntfs_dirhint_head,
942			link, tdh) {
943		if (stale_only) {
944			/* Stop here if this entry is too new. */
945			if (tv.tv_sec - dh->time < NTFS_DIRHINT_TTL)
946				break;
947		}
948		ntfs_dirhint_put(ni, dh);
949	}
950}
951
952/**
953 * ntfs_readdir - read directory entries into a supplied buffer
954 * @dir_ni:	directory inode to read directory entries from
955 * @uio:	destination in which to return the read entries
956 * @eofflag:	return end of file status (can be NULL)
957 * @numdirent:	return number of entries returned (can be NULL)
958 *
959 * ntfs_readdir() reads directory entries starting at the position described by
960 * uio_offset() into the buffer pointed to by @uio in a file system independent
961 * format.  Up to uio_resid() bytes of data can be returned.  The data in the
962 * buffer is a series of packed dirent structures where each contains the
963 * following elements:
964 *
965 *	ino_t	d_ino;			inode number of this entry
966 *	u16	d_reclen;		length of this entry record
967 *	u8	d_type;			inode type (see below)
968 *	u8	d_namlen;		length of string in d_name
969 *	char	d_name[MAXNAMELEN + 1];	null terminated filename
970 *
971 * The length of the record (d_reclen) must be a multiple of four.
972 *
973 * The following file types are defined:
974 *	DT_UNKNOWN, DT_FIFO, DT_CHR, DT_DIR, DT_BLK, DT_REG, DT_LNK, DT_SOCK,
975 *	DT_WHT
976 *
977 * The name (d_name) must be at most MAXNAMELEN + 1 bytes long including the
978 * compulsory NUL terminator.
979 *
980 * If the name length (d_namlen) is not a multiple of four, the unused space
981 * between the NUL terminator of the name and the end of the record (as
982 * specified by d_reclen which is aligned to four bytes) is filled with NUL
983 * bytes.
984 *
985 * Note how the inode number (d_ino) is only 32 bits.  Thus we do not return
986 * directory entries for inodes with an inode number that does not fit in 32
987 * bits.  In practice (at the present time) this is not a problem as 2^32
988 * inodes are a lot of inodes so are unlikely to be reached with existing data
989 * storage hardware that is NTFS formatted and accessed by OS X.  Further, up
990 * to and including Windows XP, Windows itself limits the maximum number of
991 * inodes to 2^32.
992 *
993 * When the current position (uio_offset()) is zero, we start at the first
994 * entry in the B+tree and then follow the entries in the B+tree in sequence.
995 * We cannot ignore the B+tree and just return all the index root entries
996 * followed by all the entries from each of the in-use index allocation blocks
997 * because when an entry is added to or deleted from the directory this can
 * reshape the B+tree thus making it impossible to continue where we left off
999 * between two VNOP_READDIR() calls and thus makes it impossible to implement
1000 * POSIX seekdir()/telldir()/readdir() semantics.
1001 *
1002 * The current position (uio_offset()) refers to the next block of entries to
1003 * be returned.  The offset can only be set to a value previously returned by
1004 * ntfs_vnop_readdir() or zero.  This offset does not have to match the number
1005 * of bytes returned (in uio_resid()).
1006 *
1007 * Note that whilst uio_resid() is 32-bit, uio_offset() is of type off_t which
1008 * is 64-bit in OS X but it gets cast down to a 32-bit long on ppc and i386 by
1009 * the getdirentries() system call before it is returned to user space so we
1010 * cannot use more than the lower 32-bits of the uio_offset().
1011 *
1012 * In fact, the offset used by NTFS is essentially a numerical position as
1013 * described above (26 bits) with a tag (6 bits).  The tag is for associating
1014 * the next request with the current request.  This enables us to have multiple
1015 * threads reading the directory while the directory is also being modified.
1016 *
1017 * Each tag/position pair is tied to a unique directory hint.  The hint
1018 * contains information (filename) needed to build the B+tree index context
1019 * path for finding the next set of entries.
1020 *
1021 * The reason not to just use a unique tag each time that identifies a
1022 * directory hint is that we have no way to expire tags/directory hints when a
1023 * directory file descriptor is closed and instead only find out when all users
1024 * of the directory have closed it via our VNOP_INACTIVE() being called.  Thus,
1025 * we only can afford to keep a bounded number of tags/directory hints per
1026 * vnode thus we have to expire old tags/directory hints as new ones are added.
1027 * And when ntfs_readdir() is called with an expired tag we would have no way
1028 * of knowing where in the directory to proceed without the associated
1029 * numerical offset into the B+tree which tells us the position at which to
1030 * continue if there had not been any modifications since the tag and position
1031 * were returned by ntfs_readdir().  In practice in most cases this will still
1032 * be approximately the same location as where we left off unless a lot of
1033 * files have been created in/deleted from the directory.  This is not perfect
1034 * as it means we are only POSIX compliant when a tag/directory hint has not
1035 * expired but it is a lot better than nothing so is worth doing.  Also, using
 * only 26 bits for the numerical position in the B+tree still allows for
1037 * directories with up to 2^26-1 entries, i.e. over 67 million entries which is
1038 * likely to be quite sufficient for most intents and purposes.
1039 *
1040 * If @eofflag is not NULL, set *eofflag to 0 if we have not reached the end of
1041 * the directory yet and set it to 1 if we have reached the end of the
1042 * directory, i.e. @uio either contains nothing or it contains the last entry
1043 * in the directory.
1044 *
1045 * If @numdirent is not NULL, set *@numdirent to the number of directory
1046 * entries returned in the buffer described by @uio.
1047 *
1048 * If the directory has been deleted, i.e. @dir_ni->link_count is zero, do not
1049 * synthesize entries for "." and "..".
1050 *
1051 * Locking: Caller must hold @dir_ni->lock.
1052 */
1053errno_t ntfs_readdir(ntfs_inode *dir_ni, uio_t uio, int *eofflag,
1054		int *numdirent)
1055{
1056	off_t ofs;
1057	ntfs_volume *vol;
1058	struct dirent *de;
1059	ntfs_inode *ia_ni;
1060	ntfs_index_context *ictx;
1061	ntfs_dirhint *dh;
1062	int eof, entries, err;
1063	unsigned tag;
1064	/*
1065	 * This is quite big to go on the stack but only half the size of the
1066	 * buffers placed on the stack in ntfs_vnop_lookup() so if they are ok
1067	 * so should this be.
1068	 */
1069	u8 de_buf[sizeof(struct dirent) + 4];
1070
1071	ofs = uio_offset(uio);
1072	vol = dir_ni->vol;
1073	de = (struct dirent*)&de_buf;
1074	ia_ni = NULL;
1075	ictx = NULL;
1076	dh = NULL;
1077	err = entries = eof = tag = 0;
1078	ntfs_debug("Entering for directory inode 0x%llx, offset 0x%llx, count "
1079			"0x%llx.", (unsigned long long)dir_ni->mft_no,
1080			(unsigned long long)ofs,
1081			(unsigned long long)uio_resid(uio));
1082	/*
1083	 * If we already reached the end of the directory, there is nothing to
1084	 * do.
1085	 */
1086	if ((unsigned)ofs == (unsigned)-1)
1087		goto eof;
1088	tag = (unsigned)ofs & NTFS_DIR_TAG_MASK;
1089	ofs &= NTFS_DIR_POS_MASK;
1090	/*
1091	 * Sanity check the uio data.  The absolute minimum buffer size
1092	 * required is the number of bytes taken by the entries in the dirent
1093	 * structure up to the beginning of the name plus the minimum length
1094	 * for a filename of one byte plus we need to align each dirent record
1095	 * to a multiple of four bytes thus effectovely the minimum name length
1096	 * is four and not one.
1097	 */
1098	if (uio_resid(uio) < (unsigned)offsetof(struct dirent, d_name) + 4) {
1099		err = EINVAL;
1100		goto err;
1101	}
1102	/*
1103	 * Emulate "." and ".." for all directories unless the directory has
1104	 * been deleted but not closed yet.
1105	 */
1106	while (ofs < 2) {
1107		if (!dir_ni->link_count) {
1108			ofs = 2;
1109			break;
1110		}
1111		*(u32*)de->d_name = 0;
1112		de->d_name[0] = '.';
1113		if (!ofs) {
1114			/*
1115			 * We have to remap the root directory inode to inode
1116			 * number 2, i.e. fsRtDirID, for compatibility with
1117			 * Carbon.
1118			 */
1119			if (dir_ni->mft_no == FILE_root)
1120				de->d_ino = 2;
1121			else {
1122				if (sizeof(de->d_ino) < 8 && dir_ni->mft_no &
1123						0xffffffff00000000ULL) {
1124					ntfs_warning(vol->mp, "Skipping "
1125							"emulated dirent for "
1126							"\".\" because its "
1127							"inode number 0x%llx "
1128							"does not fit in "
1129							"32-bits.",
1130							(unsigned long long)
1131							dir_ni->mft_no);
1132					goto do_next;
1133				}
1134				de->d_ino = dir_ni->mft_no;
1135			}
1136			de->d_namlen = 1;
1137		} else {
1138			vnode_t parent_vn;
1139
1140			/*
1141			 * We have to return 1, i.e. fsRtParID, for the parent
1142			 * inode number of the root directory inode for
1143			 * compatibility with Carbon.
1144			 */
1145			if (dir_ni->mft_no == FILE_root)
1146				de->d_ino = 1;
1147			else if ((parent_vn = vnode_getparent(dir_ni->vn))) {
1148				if (sizeof(de->d_ino) < 8 &&
1149						NTFS_I(parent_vn)->mft_no &
1150						0xffffffff00000000ULL) {
1151					ntfs_warning(vol->mp, "Skipping "
1152							"emulated dirent for "
1153							"\"..\" because its "
1154							"inode number 0x%llx "
1155							"does not fit in "
1156							"32-bits.",
1157							(unsigned long long)
1158							NTFS_I(parent_vn)->
1159							mft_no);
1160					goto do_next;
1161				}
1162				de->d_ino = NTFS_I(parent_vn)->mft_no;
1163				/*
1164				 * Remap the root directory inode to inode
1165				 * number 2 (see above).
1166				 */
1167				if (de->d_ino == FILE_root)
1168					de->d_ino = 2;
1169				(void)vnode_put(parent_vn);
1170			} else {
1171				MFT_REF mref;
1172
1173				/*
1174				 * Look up a filename attribute in the mft
1175				 * record of the directory @dir_ni and use its
1176				 * parent mft reference for "..".
1177				 */
1178				err = ntfs_inode_get_name_and_parent_mref(
1179						dir_ni, FALSE, &mref, NULL);
1180				if (err) {
1181					ntfs_warning(vol->mp, "Skipping "
1182							"emulated dirent for "
1183							"\"..\" because its "
1184							"inode number could "
1185							"not be determined "
1186							"(error %d).", err);
1187					goto do_next;
1188				}
1189				if (sizeof(de->d_ino) < 8 && MREF(mref) &
1190						0xffffffff00000000ULL) {
1191					ntfs_warning(vol->mp, "Skipping "
1192							"emulated dirent for "
1193							"\"..\" because its "
1194							"inode number 0x%llx "
1195							"does not fit in "
1196							"32-bits.",
1197							(unsigned long long)
1198							MREF(mref));
1199					goto do_next;
1200				}
1201				de->d_ino = MREF(mref);
1202				/*
1203				 * Remap the root directory inode to inode
1204				 * number 2 (see above).
1205				 */
1206				if (de->d_ino == FILE_root)
1207					de->d_ino = 2;
1208			}
1209			de->d_namlen = 2;
1210			de->d_name[1] = '.';
1211		}
1212		/*
1213		 * The name is one or two bytes long but we need to align the
1214		 * entry record to a multiple of four bytes, thus add four
1215		 * instead of one or two to the name offset.
1216		 */
1217		de->d_reclen = offsetof(struct dirent, d_name) + 4;
1218		de->d_type = DT_DIR;
1219		ntfs_debug("Returning emulated \"%s\" dirent with d_ino "
1220				"0x%llx, d_reclen 0x%x, d_type DT_DIR, "
1221				"d_namlen %d.", de->d_name,
1222				(unsigned long long)de->d_ino,
1223				(unsigned)de->d_reclen,
1224				(unsigned)de->d_namlen);
1225		err = uiomove((caddr_t)de, de->d_reclen, uio);
1226		if (err) {
1227			ntfs_error(vol->mp, "uiomove() failed for emulated "
1228					"entry (error %d).", err);
1229			goto err;
1230		}
1231		entries++;
1232do_next:
1233		/* We are done with this entry. */
1234		ofs++;
1235		if (uio_resid(uio) < (unsigned)offsetof(struct dirent, d_name)
1236				+ 4) {
1237			err = -1;
1238			goto done;
1239		}
1240	}
1241	/* Get the index allocation inode. */
1242	err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE, &ia_ni);
1243	if (err) {
1244		ntfs_error(vol->mp, "Failed to get index vnode (error %d).",
1245				err);
1246		ia_ni = NULL;
1247		goto err;
1248	}
1249	/* We need the lock exclusive because of the directory hints code. */
1250	lck_rw_lock_exclusive(&ia_ni->lock);
1251	ictx = ntfs_index_ctx_get(ia_ni);
1252	if (!ictx) {
1253		ntfs_error(vol->mp, "Not enough memory to allocate index "
1254				"context.");
1255		err = ENOMEM;
1256		goto err;
1257	}
1258	/*
1259	 * Get the directory hint matching the current tag and offset if it
1260	 * exists and if not get a new directory hint.
1261	 */
1262	dh = ntfs_dirhint_get(ia_ni, ofs | tag);
1263	if (!dh) {
1264		/*
1265		 * We have run out of memory and failed to allocate a new hint.
1266		 * This also implies that the hint was not found thus we might
1267		 * as well reset the tag to zero so we do not bother searching
1268		 * for it next time.  We will just use the numerical position
1269		 * in the directory in order to determine where to continue the
1270		 * directory lookup.
1271		 */
1272		tag = 0;
1273		goto lookup_by_position;
1274	}
1275	/*
1276	 * If there is no filename attached to the directory hint, use lookup
1277	 * by position in stead of by filename.
1278	 */
1279	if (!dh->fn_size)
1280		goto lookup_by_position;
1281	/*
1282	 * The directory hint contains a filename, look it up and return it
1283	 * to the caller.  Then, continue iterating over the directory B+tree
1284	 * returning each entry.  If the directory entry has been deleted, the
1285	 * lookup up will return the next entry in the B+tree.  This needs
1286	 * special handling because the found entry could be an end entry in
1287	 * which case we need to switch to the next real entry.
1288	 */
1289	if (!dh->fn)
1290		panic("%s(): !dh->fn\n", __FUNCTION__);
1291	/* If the lookup fails fall back to looking up by position. */
1292	err = ntfs_index_lookup(dh->fn, dh->fn_size, &ictx);
1293	if (!err)
1294		goto do_dirent;
1295	if (err != ENOENT) {
1296		ntfs_warning(vol->mp, "Failed to look up filename from "
1297				"directory hint (error %d), using position in "
1298				"the B+tree to continue the lookup.", err);
1299		ntfs_index_ctx_reinit(ictx, ia_ni);
1300		goto lookup_by_position;
1301	}
1302	err = 0;
1303	/*
1304	 * Entry was not found, but the next one was returned.  If this is a
1305	 * real entry pretend that this is the entry we were looking for.
1306	 */
1307	if (!(ictx->entry->flags & INDEX_ENTRY_END)) {
1308		ictx->is_match = 1;
1309		goto do_dirent;
1310	}
1311	/*
1312	 * This is an end entry which does not contain a filename.  Switch to
1313	 * the next real entry in the B+tree.
1314	 *
1315	 * Note by definition we must be in a leaf node.
1316	 */
1317	if (ictx->entry->flags & INDEX_ENTRY_NODE)
1318		panic("%s(): ictx->entry->flags & INDEX_ENTRY_NODE\n",
1319				__FUNCTION__);
1320	/*
1321	 * The next entry is the first real entry above the current node thus
1322	 * keep moving up the B+tree until we find a real entry.
1323	 */
1324	do {
1325		ntfs_index_context *itmp;
1326
1327		/* If we are in the index root, we are done. */
1328		if (ictx->is_root)
1329			goto eof;
1330		/* Save the current index context so we can free it. */
1331		itmp = ictx;
1332		/* Move up to the parent node. */
1333		ictx = ictx->up;
1334		/*
1335		 * Disconnect the old index context from its path and free it
1336		 * and all its resources.
1337		 */
1338		ntfs_index_ctx_put_single(itmp);
1339	} while (ictx->entry_nr == ictx->nr_entries - 1);
1340	/*
1341	 * We have reached a node with a real index entry.  Lock it so we can
1342	 * work on it.
1343	 */
1344	err = ntfs_index_ctx_relock(ictx);
1345	if (err)
1346		goto err;
1347	ictx->is_match = 1;
1348	goto do_dirent;
1349lookup_by_position:
1350	/*
1351	 * Start a search at the beginning of the B+tree and look for the entry
1352	 * number @ofs - 2.
1353	 *
1354	 * We need the -2 to account for the synthesized ".." and "." entries.
1355	 */
1356	err = ntfs_index_lookup_by_position(ofs - 2, 0, &ictx);
1357	/*
1358	 * Starting with the current entry, iterate over all remaining entries,
1359	 * returning each via a call to ntfs_do_dirent().
1360	 */
1361	while (!err) {
1362do_dirent:
1363		/* Submit the current directory entry to our helper function. */
1364		err = ntfs_do_dirent(vol, ictx->entry, de, uio, &entries);
1365		if (err) {
1366			/*
1367			 * A negative error code means the destination @uio
1368			 * did not have enough space for the directory entry.
1369			 */
1370			if (err < 0)
1371				goto done;
1372			/* Positive error code; uiomove() returned error. */
1373			ntfs_error(vol->mp, "uiomove() failed for index %s "
1374					"entry (error %d).",
1375					ictx->is_root ? "root" : "allocation",
1376					err);
1377			goto err;
1378		}
1379		/* We are done with this entry. */
1380		ofs++;
1381		/* Go to the next directory entry. */
1382		err = ntfs_index_lookup_next(&ictx);
1383	}
1384	if (err != ENOENT) {
1385		ntfs_error(vol->mp, "Failed to look up index entry with "
1386				"position 0x%llx.",
1387				(unsigned long long)(ofs - 2));
1388		goto err;
1389	}
1390eof:
1391	eof = 1;
1392	ofs = (unsigned)-1;
1393done:
1394	/*
1395	 * If @err is less than zero, we got here because the @uio does not
1396	 * have enough space for the next directory entry.  If we have not
1397	 * returned any directory entries yet, this means the buffer is too
1398	 * small for even one single entry so return the appropriate error code
1399	 * instead of zero.
1400	 */
1401	if (err < 0 && !entries)
1402		err = EINVAL;
1403	else
1404		err = 0;
1405err:
1406	/*
1407	 * If the offset has overflown NTFS_DIR_POS_MASK we cannot record it so
1408	 * just set it to the maximum we can return.  This is not a problem
1409	 * when we record a directory hint as is the common case and then later
1410	 * use it to continue as the offset is then not actually used and
1411	 * instead the name is used which is independent of its location.  In
1412	 * this case however do update the tag so that we return a different
1413	 * apparent offset to the caller between invocations.
1414	 *
1415	 * Note we have to avoid @ofs becomming (unsigned)-1 because we use
1416	 * that to denote end of directory.
1417	 */
1418	if (!eof && ofs & ~(off_t)NTFS_DIR_POS_MASK) {
1419		ofs = NTFS_DIR_POS_MASK;
1420		tag = (unsigned)(++ia_ni->dirhint_tag) << NTFS_DIR_TAG_SHIFT;
1421		if (!tag || (tag | NTFS_DIR_POS_MASK) == (unsigned)-1) {
1422			ia_ni->dirhint_tag = 1;
1423			tag = (unsigned)1 << NTFS_DIR_TAG_SHIFT;
1424		}
1425	}
1426	/*
1427	 * If we have a directory hint, update it with the current search state
1428	 * so the next call can continue where we stopped.
1429	 */
1430	if (dh) {
1431		unsigned size;
1432
1433		if (eof || err) {
1434			/*
1435			 * The end of the directory was reached or an error
1436			 * occurred.  Discard the directory hint.
1437			 */
1438			ntfs_dirhint_put(ia_ni, dh);
1439			goto dh_done;
1440		}
1441		/*
1442		 * Add the current name to the directory hint.  This is the
1443		 * next name we need to return to the caller.  If there is an
1444		 * old name then reuse its buffer if the two are the same size
1445		 * and otherwise free the old name first.
1446		 */
1447		size = le16_to_cpu(ictx->entry->key_length);
1448		if (dh->fn_size != size) {
1449			if (dh->fn_size)
1450				OSFree(dh->fn, dh->fn_size, ntfs_malloc_tag);
1451			dh->fn = OSMalloc(size, ntfs_malloc_tag);
1452			if (!dh->fn) {
1453				/*
1454				 * Not enough memory to set up the directory
1455				 * hint.  Just throw it away and set the tag to
1456				 * zero so we continue by position next time.
1457				 */
1458				dh->fn_size = 0;
1459				ntfs_dirhint_put(ia_ni, dh);
1460				tag = 0;
1461				goto dh_done;
1462			}
1463			dh->fn_size = size;
1464		}
1465		memcpy(dh->fn, &ictx->entry->key.filename, size);
1466		/*
1467		 * If the current tag is zero, we need to assign a new tag.
1468		 *
1469		 * Note we have to avoid @ofs becomming (unsigned)-1 because we
1470		 * use that to denote end of directory.
1471		 */
1472		if (!tag) {
1473			tag = (unsigned)(++ia_ni->dirhint_tag) <<
1474					NTFS_DIR_TAG_SHIFT;
1475			if (!tag || (tag | NTFS_DIR_POS_MASK) == (unsigned)-1) {
1476				ia_ni->dirhint_tag = 1;
1477				tag = (unsigned)1 << NTFS_DIR_TAG_SHIFT;
1478			}
1479		}
1480		/* Finally set the directory hint to the current offset. */
1481		dh->ofs = ofs | tag;
1482	}
1483dh_done:
1484	if (ictx)
1485		ntfs_index_ctx_put(ictx);
1486	if (ia_ni) {
1487		lck_rw_unlock_exclusive(&ia_ni->lock);
1488		(void)vnode_put(ia_ni->vn);
1489	}
1490	ntfs_debug("%s (returned 0x%x entries, %s, now at offset 0x%llx).",
1491			err ? "Failed" : "Done", entries, eof ?
1492			"reached end of directory" : "more entries to follow",
1493			(unsigned long long)ofs);
1494	if (eofflag)
1495		*eofflag = eof;
1496	if (numdirent)
1497		*numdirent = entries;
1498	uio_setoffset(uio, ofs | tag);
1499	return err;
1500}
1501
1502/**
1503 * ntfs_dir_is_empty - check if a directory is empty
1504 * @dir_ni:	ntfs inode of directory to check
1505 *
 * Check if the directory inode @dir_ni is empty.
1507 *
1508 * Return 0 if empty, ENOTEMPTY if not empty, and errno (not ENOTEMPTY) on
1509 * error.
1510 *
1511 * Locking: Caller must hold @dir_ni->lock for writing.
1512 */
1513errno_t ntfs_dir_is_empty(ntfs_inode *dir_ni)
1514{
1515	s64 bmp_size, prev_ia_pos, bmp_pos, ia_pos;
1516	ntfs_inode *ia_ni, *bmp_ni = NULL;
1517	ntfs_volume *vol = dir_ni->vol;
1518	MFT_RECORD *m;
1519	ntfs_attr_search_ctx *ctx;
1520	INDEX_ROOT *ir;
1521	u8 *index_end, *bmp, *kaddr;
1522	INDEX_ENTRY *ie;
1523	upl_t bmp_upl, ia_upl = NULL;
1524	upl_page_info_array_t bmp_pl, ia_pl;
1525	INDEX_ALLOCATION *ia;
1526	errno_t err;
1527	int bmp_ofs;
1528	static const char es[] = "%s.  Directory mft_no 0x%llx is corrupt.  "
1529			"Run chkdsk.";
1530	static const char es1[] = ".  Directory mft_no 0x";
1531	static const char es2[] = " is corrupt.  Run chkdsk.";
1532
1533	ntfs_debug("Entering for directory mft_no 0x%llx.",
1534			(unsigned long long)dir_ni->mft_no);
1535	if (!S_ISDIR(dir_ni->mode))
1536		return ENOTDIR;
1537	/* Get the index allocation inode. */
1538	err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE, &ia_ni);
1539	if (err) {
1540		ntfs_error(vol->mp, "Failed to get index inode (error %d).",
1541				err);
1542		return err;
1543	}
1544	lck_rw_lock_shared(&ia_ni->lock);
1545	/* Get the index bitmap inode if there is one. */
1546	if (NInoIndexAllocPresent(ia_ni)) {
1547		err = ntfs_attr_inode_get(dir_ni, AT_BITMAP, I30, 4, FALSE,
1548				LCK_RW_TYPE_SHARED, &bmp_ni);
1549		if (err) {
1550			ntfs_error(vol->mp, "Failed to get index bitmap inode "
1551					"(error %d).", err);
1552			bmp_ni = NULL;
1553			goto err;
1554		}
1555	}
1556	/* Get hold of the mft record for the directory. */
1557	err = ntfs_mft_record_map(dir_ni, &m);
1558	if (err) {
1559		ntfs_error(vol->mp, "Failed to map mft record for directory "
1560				"(error %d).", err);
1561		goto err;
1562	}
1563	ctx = ntfs_attr_search_ctx_get(dir_ni, m);
1564	if (!ctx) {
1565		ntfs_error(vol->mp, "Failed to get attribute search context.");
1566		err = ENOMEM;
1567		goto unm_err;
1568	}
1569	/* Find the index root attribute in the mft record. */
1570	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, 0, NULL, 0, ctx);
1571	if (err) {
1572		if (err == ENOENT) {
1573			ntfs_error(vol->mp, "Index root attribute missing in "
1574					"directory inode 0x%llx.",
1575					(unsigned long long)dir_ni->mft_no);
1576			NVolSetErrors(vol);
1577			err = EIO;
1578		} else
1579			ntfs_error(vol->mp, "Failed to lookup index root "
1580					"attribute in directory inode 0x%llx "
1581					"(error %d).",
1582					(unsigned long long)dir_ni->mft_no,
1583					err);
1584		goto put_err;
1585	}
1586	/*
1587	 * Get to the index root value (it has been verified in
1588	 * ntfs_inode_read()).
1589	 */
1590	ir = (INDEX_ROOT*)((u8*)ctx->a + le16_to_cpu(ctx->a->value_offset));
1591	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
1592	/* The first index entry. */
1593	ie = (INDEX_ENTRY*)((u8*)&ir->index +
1594			le32_to_cpu(ir->index.entries_offset));
1595	/* Bounds checks. */
1596	if ((u8*)ie < (u8*)&ir->index ||
1597			(u8*)ie + sizeof(INDEX_ENTRY_HEADER) > index_end ||
1598			(u8*)ie + le16_to_cpu(ie->key_length) > index_end)
1599		goto dir_err;
1600	/*
1601	 * If this is not the end node, it is a filename and thus the directory
1602	 * is not empty.
1603	 *
1604	 * If it is the end node, and there is no sub-node hanging off it, the
1605	 * directory is empty.
1606	 */
1607	if (!(ie->flags & INDEX_ENTRY_END))
1608		err = ENOTEMPTY;
1609	else if (!(ie->flags & INDEX_ENTRY_NODE)) {
1610		/* Set @err to 1 so we can detect that we are done below. */
1611		err = 1;
1612	}
1613	ntfs_attr_search_ctx_put(ctx);
1614	ntfs_mft_record_unmap(dir_ni);
1615	if (err) {
1616		/* Undo the setting of @err to 1 we did above. */
1617		if (err == 1)
1618			err = 0;
1619		goto done;
1620	}
1621	/*
1622	 * We only get here if the index root indicated that there is a
1623	 * sub-node thus there must be an index allocation attribute.
1624	 */
1625	if (!NInoIndexAllocPresent(ia_ni)) {
1626		ntfs_error(vol->mp, "No index allocation attribute but index "
1627				"entry requires one.  Directory inode 0x%llx "
1628				"is corrupt or driver bug.",
1629				(unsigned long long)dir_ni->mft_no);
1630		goto dir_err;
1631	}
1632	lck_spin_lock(&bmp_ni->size_lock);
1633	bmp_size = bmp_ni->data_size;
1634	lck_spin_unlock(&bmp_ni->size_lock);
1635	ia_pos = bmp_pos = bmp_ofs = 0;
1636	prev_ia_pos = -1;
1637get_next_bmp_page:
1638	ntfs_debug("Reading index bitmap offset 0x%llx, bit offset 0x%x.",
1639			(unsigned long long)bmp_pos >> 3, bmp_ofs);
1640	/*
1641	 * Convert bit position to byte offset in the index bitmap attribute
1642	 * and map the corresponding page.
1643	 */
1644	err = ntfs_page_map(bmp_ni, (bmp_pos >> 3) & ~PAGE_MASK_64, &bmp_upl,
1645			&bmp_pl, &bmp, FALSE);
1646	if (err) {
1647		ntfs_error(vol->mp, "Failed to read directory index bitmap "
1648				"buffer (error %d).", err);
1649		bmp_upl = NULL;
1650		goto page_err;
1651	}
1652	/* Find the next index block which is marked in use. */
1653	while (!(bmp[bmp_ofs >> 3] & (1 << (bmp_ofs & 7)))) {
1654find_next_index_buffer:
1655		bmp_ofs++;
1656		/*
1657		 * If we have reached the end of the bitmap, the directory is
1658		 * empty.
1659		 */
1660		if (((bmp_pos + bmp_ofs) >> 3) >= bmp_size)
1661			goto unm_done;
1662		ia_pos = (bmp_pos + bmp_ofs) << ia_ni->block_size_shift;
1663		/*
1664		 * If we have reached the end of the bitmap block get the next
1665		 * page and unmap away the old one.
1666		 */
1667		if ((bmp_ofs >> 3) >= PAGE_SIZE) {
1668			ntfs_page_unmap(bmp_ni, bmp_upl, bmp_pl, FALSE);
1669			bmp_pos += PAGE_SIZE * 8;
1670			bmp_ofs = 0;
1671			goto get_next_bmp_page;
1672		}
1673	}
1674	ntfs_debug("Handling index allocation block 0x%llx.",
1675			(unsigned long long)bmp_pos + bmp_ofs);
1676	/* If the current index block is in the same buffer we reuse it. */
1677	if ((prev_ia_pos & ~PAGE_MASK_64) != (ia_pos & ~PAGE_MASK_64)) {
1678		prev_ia_pos = ia_pos;
1679		if (ia_upl)
1680			ntfs_page_unmap(ia_ni, ia_upl, ia_pl, FALSE);
1681		/* Map the page containing the index allocation block. */
1682		err = ntfs_page_map(ia_ni, ia_pos & ~PAGE_MASK_64, &ia_upl,
1683				&ia_pl, &kaddr, FALSE);
1684		if (err) {
1685			ntfs_error(vol->mp, "Failed to read directory index "
1686					"allocation page (error %d).", err);
1687			ia_upl = NULL;
1688			goto page_err;
1689		}
1690	}
1691	/* Get the current index allocation block inside the mapped page. */
1692	ia = (INDEX_ALLOCATION*)(kaddr + ((u32)ia_pos & PAGE_MASK &
1693			~(ia_ni->block_size - 1)));
1694	/* Bounds checks. */
1695	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
1696		ntfs_error(vol->mp, es, "Out of bounds check failed",
1697				(unsigned long long)dir_ni->mft_no);
1698		goto vol_err;
1699	}
1700	/* Catch multi sector transfer fixup errors. */
1701	if (!ntfs_is_indx_record(ia->magic)) {
1702		ntfs_error(vol->mp, "Multi sector transfer error detected in "
1703				"index record vcn 0x%llx%s%llx%s",
1704				(unsigned long long)ia_pos >>
1705				ia_ni->vcn_size_shift, es1,
1706				(unsigned long long)dir_ni->mft_no, es2);
1707		goto vol_err;
1708	}
1709	if (sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
1710			~(s64)(ia_ni->block_size - 1)) >>
1711			ia_ni->vcn_size_shift) {
1712		ntfs_error(vol->mp, "Actual VCN (0x%llx) of index record is "
1713				"different from expected VCN (0x%llx)%s%llx%s",
1714				(unsigned long long)
1715				sle64_to_cpu(ia->index_block_vcn),
1716				(unsigned long long)ia_pos >>
1717				ia_ni->vcn_size_shift, es1,
1718				(unsigned long long)dir_ni->mft_no, es2);
1719		goto vol_err;
1720	}
1721	if (offsetof(INDEX_BLOCK, index) +
1722			le32_to_cpu(ia->index.allocated_size) !=
1723			ia_ni->block_size) {
1724		ntfs_error(vol->mp, "Index buffer (VCN 0x%llx) has a size "
1725				"(%u) differing from the directory specified "
1726				"size (%u)%s%llx%s", (unsigned long long)
1727				(unsigned long long)
1728				sle64_to_cpu(ia->index_block_vcn),
1729				(unsigned)(offsetof(INDEX_BLOCK, index) +
1730				le32_to_cpu(ia->index.allocated_size)),
1731				(unsigned)ia_ni->block_size, es1,
1732				(unsigned long long)dir_ni->mft_no, es2);
1733		goto vol_err;
1734	}
1735	index_end = (u8*)ia + ia_ni->block_size;
1736	if (index_end > kaddr + PAGE_SIZE) {
1737		ntfs_error(vol->mp, "Index buffer (VCN 0x%llx) of directory "
1738				"inode 0x%llx crosses page boundary.  This "
1739				"cannot happen and points either to memory "
1740				"corruption or to a driver bug.",
1741				(unsigned long long)
1742				sle64_to_cpu(ia->index_block_vcn),
1743				(unsigned long long)dir_ni->mft_no);
1744		goto vol_err;
1745	}
1746	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
1747	if (index_end > (u8*)ia + ia_ni->block_size) {
1748		ntfs_error(vol->mp, "Size of index block (VCN 0x%llx) "
1749				"exceeds maximum size%s%llx%s",
1750				(unsigned long long)
1751				sle64_to_cpu(ia->index_block_vcn), es1,
1752				(unsigned long long)dir_ni->mft_no, es2);
1753		goto vol_err;
1754	}
1755	/* The first index entry. */
1756	ie = (INDEX_ENTRY*)((u8*)&ia->index +
1757			le32_to_cpu(ia->index.entries_offset));
1758	/* Bounds checks. */
1759	if ((u8*)ie < (u8*)&ia->index ||
1760			(u8*)ie + sizeof(INDEX_ENTRY_HEADER) > index_end ||
1761			(u8*)ie + le16_to_cpu(ie->key_length) > index_end)
1762		goto dir_err;
1763	/*
1764	 * If this is the end node, it is not a filename so we continue to the
1765	 * next index block.
1766	 */
1767	if (ie->flags & INDEX_ENTRY_END)
1768		goto find_next_index_buffer;
1769	/*
1770	 * This is not the end node, i.e. it is a filename and thus the
1771	 * directory is not empty.
1772	 */
1773	err = ENOTEMPTY;
1774unm_done:
1775	if (ia_upl)
1776		ntfs_page_unmap(ia_ni, ia_upl, ia_pl, FALSE);
1777	ntfs_page_unmap(bmp_ni, bmp_upl, bmp_pl, FALSE);
1778done:
1779	if (bmp_ni) {
1780		lck_rw_unlock_shared(&bmp_ni->lock);
1781		(void)vnode_put(bmp_ni->vn);
1782	}
1783	lck_rw_unlock_shared(&ia_ni->lock);
1784	(void)vnode_put(ia_ni->vn);
1785	ntfs_debug("Done (directory is%s empty).", !err ? "" : " not");
1786	return err;
1787dir_err:
1788	ntfs_error(vol->mp, "Corrupt directory inode 0x%llx.  Run chkdsk.",
1789			(unsigned long long)dir_ni->mft_no);
1790	NVolSetErrors(vol);
1791	/*
1792	 * If @ia_upl is not NULL we got here from the index allocation related
1793	 * code paths and if it is NULL we got here from the index root related
1794	 * code paths.
1795	 */
1796	if (ia_upl)
1797		goto page_err;
1798	err = EIO;
1799put_err:
1800	ntfs_attr_search_ctx_put(ctx);
1801unm_err:
1802	ntfs_mft_record_unmap(dir_ni);
1803err:
1804	if (bmp_ni) {
1805		lck_rw_unlock_shared(&bmp_ni->lock);
1806		(void)vnode_put(bmp_ni->vn);
1807	}
1808	lck_rw_unlock_shared(&ia_ni->lock);
1809	(void)vnode_put(ia_ni->vn);
1810	return err;
1811vol_err:
1812	NVolSetErrors(vol);
1813page_err:
1814	if (!err)
1815		err = EIO;
1816	if (ia_upl)
1817		ntfs_page_unmap(ia_ni, ia_upl, ia_pl, FALSE);
1818	if (bmp_upl)
1819		ntfs_page_unmap(bmp_ni, bmp_upl, bmp_pl, FALSE);
1820	goto err;
1821}
1822
1823/**
1824 * ntfs_dir_entry_delete - delete a directory index entry
1825 * @dir_ni:	directory ntfs inode from which to delete the index entry
1826 * @ni:		base ntfs inode which the filename @fn links to
1827 * @fn:		filename attribute describing index entry to delete
1828 * @fn_len:	size of filename attribute in bytes
1829 *
1830 * Find the directory index entry corresponding to the filename attribute @fn
1831 * of size @fn_len bytes in the directory index of the directory ntfs inode
1832 * @dir_ni.
1833 *
1834 * Assuming the filename is present in the directory index, delete it from the
1835 * index.  @ni is the inode which the filename @fn links to.
1836 *
1837 * Return 0 on success and errno on error.
1838 *
1839 * Locking: Caller must hold both @dir_ni->lock and @ni->lock for writing.
1840 */
1841errno_t ntfs_dir_entry_delete(ntfs_inode *dir_ni, ntfs_inode *ni,
1842		const FILENAME_ATTR *fn, const u32 fn_len)
1843{
1844	ntfs_volume *vol = ni->vol;
1845	ntfs_inode *ia_ni;
1846	ntfs_index_context *ictx;
1847	INDEX_ENTRY *ie;
1848	int err;
1849	FILENAME_TYPE_FLAGS fn_type;
1850
1851	ntfs_debug("Entering for mft_no 0x%llx, parent directory mft_no "
1852			"0x%llx.", (unsigned long long)ni->mft_no,
1853			(unsigned long long)dir_ni->mft_no);
1854	if (!S_ISDIR(dir_ni->mode))
1855		panic("%s(): !S_ISDIR(dir_ni->mode\n", __FUNCTION__);
1856	/*
1857	 * Verify that the mft reference of the parent directory specified in
1858	 * the filename to be removed matches the mft reference of the parent
1859	 * directory inode.
1860	 */
1861	if (fn->parent_directory != MK_LE_MREF(dir_ni->mft_no,
1862			dir_ni->seq_no)) {
1863		ntfs_error(vol->mp, "The reference of the parent directory "
1864				"specified in the filename to be removed does "
1865				"not match the reference of the parent "
1866				"directory inode.  Volume is corrupt.  Run "
1867				"chkdsk.");
1868		NVolSetErrors(vol);
1869		return EIO;
1870	}
1871	/*
1872	 * We are now ok to go ahead and delete the directory index entry.
1873	 *
1874	 * Get the index allocation inode.
1875	 */
1876	err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE, &ia_ni);
1877	if (err) {
1878		ntfs_error(vol->mp, "Failed to get index vnode (error %d).",
1879				err);
1880		return EIO;
1881	}
1882	/* Need exclusive access to the index throughout. */
1883	lck_rw_lock_exclusive(&ia_ni->lock);
1884	ictx = ntfs_index_ctx_get(ia_ni);
1885	if (!ictx) {
1886		ntfs_error(vol->mp, "Not enough memory to allocate index "
1887				"context.");
1888		err = ENOMEM;
1889		goto err;
1890	}
1891restart:
1892	/* Get the index entry matching the filename @fn. */
1893	err = ntfs_index_lookup(fn, fn_len, &ictx);
1894	if (err) {
1895		if (err == ENOENT) {
1896			ntfs_error(vol->mp, "Failed to delete directory index "
1897					"entry of mft_no 0x%llx because the "
1898					"filename was not found in its parent "
1899					"directory index.  Directory 0x%llx "
1900					"is corrupt.  Run chkdsk.",
1901					(unsigned long long)ni->mft_no,
1902					(unsigned long long)dir_ni->mft_no);
1903			NVolSetErrors(vol);
1904		} else
1905			ntfs_error(vol->mp, "Failed to delete directory index "
1906					"entry of mft_no 0x%llx because "
1907					"looking up the filename in its "
1908					"parent directory 0x%llx failed "
1909					"(error %d).",
1910					(unsigned long long)ni->mft_no,
1911					(unsigned long long)dir_ni->mft_no,
1912					err);
1913		goto put_err;
1914	}
1915	ie = ictx->entry;
1916	/*
1917	 * Verify that the mft reference of the parent directory specified in
1918	 * the filename to be removed matches the mft reference of the parent
1919	 * directory specified in the found index entry.
1920	 */
1921	if (fn->parent_directory != ie->key.filename.parent_directory) {
1922		ntfs_error(vol->mp, "The reference of the parent directory "
1923				"(0x%llx) specified in the filename to be "
1924				"removed does not match the reference of the "
1925				"parent directory (0x%llx) specified in the "
1926				"matching directory index entry.  Volume is "
1927				"corrupt.  Run chkdsk.", (unsigned long long)
1928				le64_to_cpu(fn->parent_directory),
1929				(unsigned long long)le64_to_cpu(
1930				ie->key.filename.parent_directory));
1931		NVolSetErrors(vol);
1932		err = EIO;
1933		goto put_err;
1934	}
1935	/*
1936	 * Verify that the mft reference of the inode to which the filename to
1937	 * be removed belongs matches the mft reference of the inode pointed to
1938	 * by the found index entry.
1939	 */
1940	if (MK_LE_MREF(ni->mft_no, ni->seq_no) != ie->indexed_file) {
1941		ntfs_error(vol->mp, "The reference of the inode (0x%llx) to "
1942				"which the filename to be removed belongs "
1943				"does not match the reference of the inode "
1944				"(0x%llx) specified in the matching directory "
1945				"index entry.  Volume is corrupt.  Run "
1946				"chkdsk.", (unsigned long long)
1947				MK_MREF(ni->mft_no, ni->seq_no),
1948				(unsigned long long)
1949				le64_to_cpu(ie->indexed_file));
1950		NVolSetErrors(vol);
1951		err = EIO;
1952		goto put_err;
1953	}
1954	fn_type = ie->key.filename.filename_type;
1955	/* We now have the directory index entry, delete it. */
1956	err = ntfs_index_entry_delete(ictx);
1957	if (!err) {
1958		ntfs_index_ctx_put(ictx);
1959		/* Update the mtime and ctime in the parent directory inode. */
1960		dir_ni->last_mft_change_time = dir_ni->last_data_change_time =
1961				ntfs_utc_current_time();
1962		NInoSetDirtyTimes(dir_ni);
1963		lck_rw_unlock_exclusive(&ia_ni->lock);
1964		(void)vnode_put(ia_ni->vn);
1965		ntfs_debug("Done.");
1966		return 0;
1967	}
1968	/*
1969	 * If the tree got rearranged in some unpredictable way and we
1970	 * chickened out of working through it, we now reinitialize the index
1971	 * context (as it is now invalid) and then redo the lookup and delete.
1972	 *
1973	 * Note we use a negative -EAGAIN to distinguish from a potential real
1974	 * EAGAIN error.
1975	 */
1976	if (err == -EAGAIN) {
1977		ntfs_debug("Restarting delete as tree was rearranged.");
1978		ntfs_index_ctx_reinit(ictx, ia_ni);
1979		goto restart;
1980	}
1981	/*
1982	 * Failed to delete the directory index entry.
1983	 *
1984	 * If the filename @fn is in the POSIX namespace but the directory
1985	 * index entry is in the WIN32 namespace, convert the directory index
1986	 * entry to the POSIX namespace.  See comments above @restart_name
1987	 * label in ntfs_vnops.c::ntfs_vnop_remove() for an explanation of when
1988	 * this happens and why we need to do this.
1989	 */
1990	if (fn_type == FILENAME_WIN32 && fn->filename_type == FILENAME_POSIX) {
1991		errno_t err2;
1992
1993		ntfs_debug("Switching namespace of directory index entry from "
1994				"WIN32 to POSIX to match the namespace of the "
1995				"corresponding filename attribute.");
1996		/*
1997		 * The old index context is now invalid, so need to redo the
1998		 * index lookup.
1999		 */
2000		ntfs_index_ctx_reinit(ictx, ia_ni);
2001		err2 = ntfs_index_lookup(fn, fn_len, &ictx);
2002		if (err2) {
2003			ntfs_error(vol->mp, "Failed to switch namespace of "
2004					"directory index entry of inode "
2005					"0x%llx from WIN32 to POSIX because "
2006					"re-looking up the filename in its "
2007					"parent directory inode 0x%llx failed "
2008					"(error %d).  Leaving inconsistent "
2009					"metadata.  Run chkdsk.",
2010					(unsigned long long)ni->mft_no,
2011					(unsigned long long)dir_ni->mft_no,
2012					err2);
2013			NVolSetErrors(vol);
2014			goto put_err;
2015		}
2016		ictx->entry->key.filename.filename_type = FILENAME_POSIX;
2017		ntfs_index_entry_mark_dirty(ictx);
2018		dir_ni->last_mft_change_time = dir_ni->last_data_change_time =
2019				ntfs_utc_current_time();
2020		NInoSetDirtyTimes(dir_ni);
2021	}
2022put_err:
2023	ntfs_index_ctx_put(ictx);
2024err:
2025	lck_rw_unlock_exclusive(&ia_ni->lock);
2026	(void)vnode_put(ia_ni->vn);
2027	ntfs_debug("Failed (error %d).", err);
2028	return err;
2029}
2030
2031/**
2032 * ntfs_dir_entry_add - add a directory index entry
2033 * @dir_ni:	directory ntfs inode to which to add the index entry
2034 * @fn:		filename attribute describing index entry to add
2035 * @fn_len:	size of filename attribute in bytes
2036 * @mref:	mft reference of the inode the filename @fn belongs to
2037 *
2038 * Find the directory index entry corresponding to the filename attribute @fn
2039 * of size @fn_len bytes in the directory index of the directory ntfs inode
2040 * @dir_ni.
2041 *
2042 * Assuming the filename is not already present in the directory index, add it
2043 * to the index and point the inserted index entry at the mft reference @mref
2044 * which is the little endian mft reference of the inode to which the filename
2045 * attribute @fn belongs.
2046 *
2047 * If the filename is already present in the directory index, abort and return
2048 * the error code EEXIST.
2049 *
2050 * Return 0 on success and errno on error.
2051 *
2052 * Locking: Caller must hold @dir_ni->lock for writing.
2053 */
2054errno_t ntfs_dir_entry_add(ntfs_inode *dir_ni, const FILENAME_ATTR *fn,
2055		const u32 fn_len, const leMFT_REF mref)
2056{
2057	const leMFT_REF tmp_mref = mref;
2058	ntfs_inode *ia_ni;
2059	ntfs_index_context *ictx;
2060	errno_t err;
2061
2062	ntfs_debug("Entering for mft_no 0x%llx, parent directory mft_no "
2063			"0x%llx.", (unsigned long long)MREF_LE(tmp_mref),
2064			(unsigned long long)dir_ni->mft_no);
2065	if (!S_ISDIR(dir_ni->mode))
2066		panic("%s(): !S_ISDIR(dir_ni->mode\n", __FUNCTION__);
2067	/* Get the index allocation inode. */
2068	err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE, &ia_ni);
2069	if (err) {
2070		ntfs_error(dir_ni->vol->mp, "Failed to get index vnode (error "
2071				"%d).", err);
2072		return err;
2073	}
2074	/* Need exclusive access to the index throughout. */
2075	lck_rw_lock_exclusive(&ia_ni->lock);
2076	ictx = ntfs_index_ctx_get(ia_ni);
2077	if (!ictx) {
2078		ntfs_error(dir_ni->vol->mp, "Not enough memory to allocate "
2079				"index context.");
2080		err = ENOMEM;
2081		goto err;
2082	}
2083	/*
2084	 * Get the index entry matching the filename @fn and if not present get
2085	 * the position at which the new index entry needs to be inserted.
2086	 */
2087	err = ntfs_index_lookup(fn, fn_len, &ictx);
2088	if (err != ENOENT) {
2089		if (!err) {
2090			ntfs_debug("Failed (filename already present in "
2091					"directory index).");
2092			err = EEXIST;
2093		} else
2094			ntfs_error(dir_ni->vol->mp, "Failed to add directory "
2095					"index entry of mft_no 0x%llx to "
2096					"directory mft_no 0x%llx because "
2097					"looking up the filename in the "
2098					"directory index failed (error %d).",
2099					(unsigned long long)MREF_LE(tmp_mref),
2100					(unsigned long long)dir_ni->mft_no,
2101					err);
2102		ntfs_index_ctx_put(ictx);
2103		goto err;
2104	}
2105	/*
2106	 * Create a new directory index entry inserting it in front of the
2107	 * entry described by the index context.
2108	 */
2109	err = ntfs_index_entry_add(ictx, fn, fn_len, &tmp_mref, 0);
2110	ntfs_index_ctx_put(ictx);
2111	if (!err) {
2112		lck_rw_unlock_exclusive(&ia_ni->lock);
2113		(void)vnode_put(ia_ni->vn);
2114		/* Update the mtime and ctime of the parent directory inode. */
2115		dir_ni->last_mft_change_time = dir_ni->last_data_change_time =
2116				ntfs_utc_current_time();
2117		NInoSetDirtyTimes(dir_ni);
2118		ntfs_debug("Done.");
2119		return 0;
2120	}
2121err:
2122	lck_rw_unlock_exclusive(&ia_ni->lock);
2123	(void)vnode_put(ia_ni->vn);
2124	return err;
2125}
2126