1/*
2 * ntfs_inode.c - NTFS kernel inode operations.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/cdefs.h>
39
40#include <sys/errno.h>
41#include <sys/kernel_types.h>
42#include <sys/queue.h>
43#include <sys/time.h>
44#include <sys/stat.h>
45#include <sys/ucred.h>
46#include <sys/ubc.h>
47#include <sys/vnode.h>
48
49#include <string.h>
50
51#include <libkern/libkern.h>
52#include <libkern/OSAtomic.h>
53#include <libkern/OSMalloc.h>
54
55#include <kern/debug.h>
56#include <kern/locks.h>
57#include <kern/sched_prim.h>
58
59#include <mach/machine/vm_param.h>
60
61#include "ntfs.h"
62#include "ntfs_attr.h"
63#include "ntfs_debug.h"
64#include "ntfs_dir.h"
65#include "ntfs_hash.h"
66#include "ntfs_inode.h"
67#include "ntfs_mft.h"
68#include "ntfs_page.h"
69#include "ntfs_runlist.h"
70#include "ntfs_sfm.h"
71#include "ntfs_time.h"
72#include "ntfs_types.h"
73#include "ntfs_unistr.h"
74#include "ntfs_volume.h"
75#include "ntfs_vnops.h"
76
77/**
78 * ntfs_inode_test - compare two (possibly fake) ntfs inodes for equality
79 * @ni:		ntfs inode which to test
80 * @na:		ntfs attribute which is being tested with
81 *
82 * Compare the ntfs attribute embedded in the ntfs inode @ni for equality with
83 * the ntfs attribute @na.
84 *
85 * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
86 * @na->name and @na->name_len are then ignored.
87 *
88 * Return true if the attributes match and false if not.
89 *
90 * Locking: Caller must hold the @ntfs_inode_hash_lock.
91 */
92BOOL ntfs_inode_test(ntfs_inode *ni, const ntfs_attr *na)
93{
94	if (ni->mft_no != na->mft_no)
95		return FALSE;
96	/* If !NInoAttr(ni), @ni is a normal file or directory inode. */
97	if (!NInoAttr(ni)) {
98		/* If not looking for a normal inode this is a mismatch. */
99		if (na->type != AT_UNUSED)
100			return FALSE;
101	} else {
102		ntfs_volume *vol;
103
104		/* A fake inode describing an attribute. */
105		if (ni->type != na->type)
106			return FALSE;
107		vol = ni->vol;
108		if (!ntfs_are_names_equal(ni->name, ni->name_len,
109				na->name, na->name_len, NVolCaseSensitive(vol),
110				vol->upcase, vol->upcase_len))
111			return FALSE;
112	}
113	/*
114	 * If looking for raw inode but found non-raw one or looking for
115	 * non-raw inode and found raw one this is a mismatch.
116	 */
117	if ((BOOL)NInoRaw(ni) != na->raw)
118		return FALSE;
119	/* Match! */
120	return TRUE;
121}
122
123/**
124 * __ntfs_inode_init - initialize an ntfs inode
125 * @vol:	ntfs volume to which @ni belongs.
126 * @ni:		ntfs inode to initialize
127 *
128 * Initialize the ntfs inode @ni to defaults.
129 *
130 * NOTE: ni->mft_no, ni->flags, ni->type, ni->name, and ni->name_len are left
131 * untouched.  Make sure to initialize them elsewhere.
132 */
133static inline void __ntfs_inode_init(ntfs_volume *vol, ntfs_inode *ni)
134{
135	ni->vol = vol;
136	ni->vn = NULL;
137	ni->nr_refs = 0;
138	ni->nr_opens = 0;
139	lck_rw_init(&ni->lock, ntfs_lock_grp, ntfs_lock_attr);
140	/*
141	 * By default do i/o in sectors.  This for example gets overridden for
142	 * mst protected attributes for which the size is set to the ntfs
143	 * record size being protected by the mst fixups.
144	 */
145	ni->block_size = vol->sector_size;
146	ni->block_size_shift = vol->sector_size_shift;
147	lck_spin_init(&ni->size_lock, ntfs_lock_grp, ntfs_lock_attr);
148	ni->allocated_size = ni->data_size = ni->initialized_size = 0;
149	ni->seq_no = 0;
150	ni->link_count = 0;
151	ni->uid = vol->uid;
152	ni->gid = vol->gid;
153	ni->mode = 0;
154	ni->rdev = (dev_t)0;
155	ni->file_attributes = 0;
156	ni->last_access_time = ni->last_mft_change_time =
157			ni->last_data_change_time = ni->creation_time =
158			(struct timespec) {
159		.tv_sec = 0,
160		.tv_nsec = 0,
161	};
162	ntfs_rl_init(&ni->rl);
163	ni->mft_ni = NULL;
164	ni->m_buf = NULL;
165	ni->m = NULL;
166	ni->attr_list_size = 0;
167	ni->attr_list_alloc = 0;
168	ni->attr_list = NULL;
169	ntfs_rl_init(&ni->attr_list_rl);
170	ni->last_set_bit = -1;
171	ni->vcn_size = 0;
172	ni->collation_rule = 0;
173	ni->vcn_size_shift = 0;
174	ni->nr_dirhints = 0;
175	ni->dirhint_tag = 0;
176	TAILQ_INIT(&ni->dirhint_list);
177	lck_mtx_init(&ni->extent_lock, ntfs_lock_grp, ntfs_lock_attr);
178	ni->nr_extents = 0;
179	ni->extent_alloc = 0;
180	lck_mtx_init(&ni->attr_nis_lock, ntfs_lock_grp, ntfs_lock_attr);
181	ni->nr_attr_nis = 0;
182	ni->attr_nis_alloc = 0;
183	ni->base_ni = NULL;
184	ni->base_attr_nis_lock = NULL;
185}
186
187/**
188 * ntfs_inode_init - initialize an ntfs inode
189 * @vol:	ntfs volume to which @ni belongs.
190 * @ni:		ntfs inode to initialize
191 * @na:		ntfs attribute which to initialize @ni to
192 *
193 * Initialize the ntfs inode @ni with the values from the ntfs attribute @na in
194 * order to enable ntfs_inode_test() to do its work.
195 *
196 * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
197 * In that case, @na->name and @na->name_len should be set to NULL and 0,
198 * respectively.  Although that is not strictly necessary as ntfs_inode_read()
199 * will fill them in later.
200 *
201 * Return 0 on success and errno on error.
202 *
203 * The only defined error code is ENOMEM.
204 */
205errno_t ntfs_inode_init(ntfs_volume *vol, ntfs_inode *ni, const ntfs_attr *na)
206{
207	ni->flags = (1 << NI_Locked) | (1 << NI_Alloc);
208	ni->mft_no = na->mft_no;
209	ni->type = na->type;
210	if (na->type == AT_INDEX_ALLOCATION)
211		NInoSetMstProtected(ni);
212	ni->name = na->name;
213	ni->name_len = na->name_len;
214	if (na->raw)
215		NInoSetRaw(ni);
216	__ntfs_inode_init(vol, ni);
217	/* If initializing a normal inode, we are done. */
218	if (na->type == AT_UNUSED)
219		return 0;
220	/* It is a fake inode. */
221	NInoSetAttr(ni);
222	/*
223	 * We have I30 global constant as an optimization as it is the name
224	 * in >99.9% of named attributes!  The other <0.1% incur an allocation
225	 * but that is ok.  And most attributes are unnamed anyway, thus the
226	 * fraction of named attributes with name != I30 is actually absolutely
227	 * tiny.
228	 *
229	 * We now also have a second common name and that is the name of the
230	 * resource fork so special case this, too.  This also allows us to
231	 * identify resource fork attribute inodes easily by simply comparing
232	 * their name for equality with the global constant
233	 * NTFS_SFM_RESOURCEFORK_NAME.
234	 *
235	 * Simillarly we also add NTFS_SFM_AFPINFO_NAME as this is also quite
236	 * common as it holds the backup time and the Finder info.
237	 */
238	if (na->name_len && na->name != I30 &&
239			na->name != NTFS_SFM_RESOURCEFORK_NAME &&
240			na->name != NTFS_SFM_AFPINFO_NAME) {
241		unsigned i = na->name_len * sizeof(ntfschar);
242		ni->name = OSMalloc(i + sizeof(ntfschar), ntfs_malloc_tag);
243		if (!ni->name)
244			return ENOMEM;
245		memcpy(ni->name, na->name, i);
246		ni->name[na->name_len] = 0;
247	}
248	return 0;
249}
250
251static errno_t ntfs_inode_read(ntfs_inode *ni);
252static errno_t ntfs_attr_inode_read_or_create(ntfs_inode *base_ni,
253		ntfs_inode *ni, const int options);
254static errno_t ntfs_index_inode_read(ntfs_inode *base_ni, ntfs_inode *ni);
255
256/**
257 * ntfs_inode_get_vtype - return the vtype of an ntfs inode
258 * @ni:		ntfs inode whose vtype to return
259 *
260 * Figure out the vtype of the ntfs inode @ni and return it.
261 *
262 * Valid vtypes are:
263 *	VNON = No type.
264 *	VREG = Regular file.
265 *	VDIR = Directory.
266 *	VBLK = Block device.
267 *	VCHR = Character device.
268 *	VLNK = Symbolic link.
269 *	VSOCK = Socket.
270 *	VFIFO = Named pipe / fifo.
271 *	VBAD = Dead vnode.
272 *	VSTR = Not used in current OS X kernel.
273 *	VCPLX = Not used in current OS X kernel.
274 */
275static inline enum vtype ntfs_inode_get_vtype(ntfs_inode *ni)
276{
277	/*
278	 * Attribute inodes do not really have a type.
279	 *
280	 * However, the current OS X kernel does not allow use of ubc with
281	 * anything other than regular files (i.e. VREG vtype), thus we need to
282	 * return VREG for named $DATA attributes, i.e. named streams, so that
283	 * they can be accessed via mmap like regular files.  And the same goes
284	 * for index inodes which we need to be able to read via the ubc.
285	 *
286	 * And a further however is that ntfs_unmount() uses vnode_iterate() to
287	 * flush all inodes of the mounted volume and vnode_iterate() skips
288	 * over all VNON vnodes, thus we cannot have any vnodes marked VNON or
289	 * unmounting would fail.  (Note we cannote use vflush() instead of
290	 * vnode_iterate() because vflush() calls vnode_umount_preflight()
291	 * which in turn aborts the vflush() if any vnodes are busy and in our
292	 * case we want to evict the non-system vnodes only thus the system
293	 * vnodes are busy thus vflush() is aborted from the preflight call.)
294	 */
295	if (NInoAttr(ni))
296		return VREG;
297	/*
298	 * Not an attribute inode, thus the mode will be a proper POSIX mode,
299	 * which we just need to convert to V*** type.
300	 */
301	return IFTOVT(ni->mode);
302}
303
304/**
305 * ntfs_inode_add_vnode - create and attach a vnode to an ntfs inode
306 * @ni:		ntfs inode to which to attach a new vnode
307 * @is_system:	true if @ni is a system inode and false otherwise
308 * @parent_vn:	vnode of directory containing @ni or NULL
309 * @cn:		componentname containing the name of @ni or NULL
310 *
311 * Create a new vnode for the ntfs inode @ni and attach it to the ntfs inode.
312 * If @is_system is true the created vnode is marked as a system vnode (via the
313 * VSYSTEM flag).
314 *
315 * If @parent_vn is not NULL, set it up as the parent directory vnode of the
316 * newly created vnode.
317 *
318 * If @cn is not NULL, set it up as the name of the newly created vnode and
319 * optionally enter the name in the name cache.
320 *
321 * If the the inode is an attribute inode, set it up as a named stream vnode so
322 * it does not block non-forced unmounts in the VFS.
323 *
324 * Return 0 on success and errno on error.
325 */
326errno_t ntfs_inode_add_vnode_attr(ntfs_inode *ni, const BOOL is_system,
327		vnode_t parent_vn, struct componentname *cn, BOOL isstream)
328{
329	s64 data_size;
330	errno_t err;
331	enum vtype vtype;
332	struct vnode_fsparam vn_fsp;
333	BOOL cache_name = FALSE;
334
335	ntfs_debug("Entering.");
336	/* Get the vnode type corresponding to the inode mode type. */
337	vtype = ntfs_inode_get_vtype(ni);
338	/*
339	 * Get the data size for regular files, attributes, directories, and
340	 * symbolic links.
341	 */
342	data_size = 0;
343	if (vtype == VREG || vtype == VDIR || vtype == VLNK)
344		data_size = ni->data_size;
345	vn_fsp = (struct vnode_fsparam) {
346		.vnfs_mp = ni->vol->mp,	/* Mount of volume. */
347		.vnfs_vtype = vtype,	/* Vnode type. */
348		.vnfs_str = "ntfs",	/* Debug aid. */
349		.vnfs_dvp = parent_vn,	/* Parent directory vnode. */
350		.vnfs_fsnode = ni,	/* Ntfs inode to attach to the vnode. */
351		.vnfs_vops = ntfs_vnodeop_p, /* Operations for this vnode. */
352		.vnfs_markroot = ((ni->mft_no != FILE_root) || NInoAttr(ni)) ?
353				0 : 1,	/* Is this the ntfs volume root? */
354		.vnfs_marksystem = is_system ? 1 : 0, /* Mark the vnode as
355					   VSYSTEM if this is a system inode. */
356		.vnfs_rdev = ni->rdev,	/* Device if vtype is VBLK or VCHR.  We
357					   can just return @ni->rdev as that is
358					   zero for all other vtypes. */
359		.vnfs_filesize = data_size, /* Data size of attribute.  No
360					   need for size lock as we are only
361					   user of inode at present. */
362		.vnfs_cnp = cn,		/* Component name to assign as the name
363					   of the vnode and optionally to add
364					   it to the namecache. */
365		.vnfs_flags = VNFS_ADDFSREF, /* VNFS_* flags.  We want to have
366						an fs reference on the vnode. */
367	};
368	/*
369	 * If the name is not meant to be cached cause vnode_create() not to
370	 * add it to the name cache.
371	 */
372	if (cn && cn->cn_flags & MAKEENTRY) {
373		/*
374		 * Do not want the caller to try to add the name to the cache
375		 * as well.
376		 */
377		cn->cn_flags &= ~MAKEENTRY;
378		cache_name = TRUE;
379	}
380	if (!parent_vn || !cache_name)
381		vn_fsp.vnfs_flags |= VNFS_NOCACHE;
382	/*
383	 * If this is a named stream inode, then set it's parent to
384	 * NULL.  This way the VFS will set up the parent vnode and then
385	 * at the end of the VNOP_GETNAMEDSTREAM call, the VFS will call
386	 *  vnode_update_identity, which sets the parent and increments the
387	 * kusecount on the vnode.  If the parent is already set our kusecount
388	 * can go negative!
389	 */
390	if (isstream) {
391		vn_fsp.vnfs_dvp = NULL;
392	}
393
394	err = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vn_fsp, &ni->vn);
395	if (!err) {
396		vnode_t vn = ni->vn;
397		/*
398		 * Vnode tag types are deprecated thus we use VT_OTHER given
399		 * there is no VT_NTFS.
400		 */
401		vnode_settag(vn, VT_OTHER);
402		ntfs_debug("Done.");
403		return err;
404	}
405	ntfs_debug("Failed (error %d).", err);
406	return err;
407}
408
409/**
410 * ntfs_inode_get - obtain a normal ntfs inode
411 * @vol:	mounted ntfs volume
412 * @mft_no:	mft record number / inode number to obtain
413 * @is_system:	true if the inode is a system inode and false otherwise
414 * @lock:	locking options (see below)
415 * @nni:	destination pointer for the obtained ntfs inode
416 * @parent_vn:	vnode of directory containing the inode to return or NULL
417 * @cn:		componentname containing the name of the inode to return
418 *
419 * Obtain the ntfs inode corresponding to a specific normal inode (i.e. a
420 * file or directory).  If @is_system is true the created vnode is marked as a
421 * system vnode (via the VSYSTEM flag).
422 *
423 * If @lock is LCK_RW_TYPE_SHARED the inode will be returned locked for reading
424 * (@nni->lock) and if it is LCK_RW_TYPE_EXCLUSIVE the inode will be returned
425 * locked for writing (@nni->lock).  As a special case if @lock is 0 it means
426 * the inode to be returned is already locked so do not lock it.  This requires
427 * that the inode is already present in the inode cache.  If it is not it
428 * cannot already be locked and thus you will get a panic().
429 *
430 * If the inode is in the cache, it is returned.  If the inode has an attached
431 * vnode, an iocount reference is obtained on the vnode before returning the
432 * inode.  If @parent_vn is not NULL, the inode has an attached vnode, and the
433 * parent of the vnode is not @parent_vn, the identity of the vnode is updated
434 * changing the current parent of the vnode to @parent_vn.  If @cn is not NULL
435 * and the current name of the vnode does not match the name described by @cn,
436 * the identity of the vnode is updated changing the current name of the vnode
437 * to the name described by @cn.
438 *
439 * If the inode is not in the cache, a new ntfs inode is allocated and
440 * initialized, ntfs_inode_read() is called to read it in and fill in the
441 * remainder of the ntfs inode structure before finally a new vnode is created
442 * and attached to the new ntfs inode.  The inode is then returned with an
443 * iocount reference taken on its vnode.  If @parent_vn is not NULL, it is set
444 * up as the parent directory vnode of the newly created vnode.  If @cn is not
445 * NULL, it is set up as the name of the newly created vnode.
446 *
447 * We do not need a reference count for the ntfs inode as the ntfs inode either
448 * has a vnode, in which case the life-time and reference counting on the vnode
449 * ensure there are no life-time problems with the ntfs inode or it does not
450 * have a vnode in which case either there was an error and we are about to
451 * destroy the ntfs inode or it is an extent inode, in which case the inode is
452 * attached to the base inode and thus it is bound by the life-time and
453 * reference count of the vnode of the base inode given that the extent inodes
454 * are destroyed at the same time as their base inode is destroyed so we should
455 * never get into life-time problems as it is now.
456 *
457 * Return 0 on success and errno on error.
458 */
459errno_t ntfs_inode_get(ntfs_volume *vol, ino64_t mft_no, const BOOL is_system,
460		const lck_rw_type_t lock, ntfs_inode **nni, vnode_t parent_vn,
461		struct componentname *cn)
462{
463	ntfs_inode *ni;
464	vnode_t vn;
465	errno_t err;
466	ntfs_attr na;
467
468	ntfs_debug("Entering for mft_no 0x%llx, is_system is %s, lock 0x%x.",
469			(unsigned long long)mft_no,
470			is_system ? "true" : "false", (unsigned)lock);
471retry:
472	na = (ntfs_attr) {
473		.mft_no = mft_no,
474		.type = AT_UNUSED,
475		.raw = FALSE,
476	};
477	ni = ntfs_inode_hash_get(vol, &na);
478	if (!ni) {
479		ntfs_debug("Failed (ENOMEM).");
480		return ENOMEM;
481	}
482	/*
483	 * Lock the inode for reading/writing as requested by the caller.
484	 *
485	 * If the caller specified that the inode is already locked, verify
486	 * that the inode was already in the cache and panic() if not.
487	 */
488	switch (lock) {
489	case LCK_RW_TYPE_EXCLUSIVE:
490		lck_rw_lock_exclusive(&ni->lock);
491		break;
492	case LCK_RW_TYPE_SHARED:
493		lck_rw_lock_shared(&ni->lock);
494		break;
495	case 0:
496		if (NInoAlloc(ni))
497			panic("%s(): !lock but NInoAlloc(ni)\n", __FUNCTION__);
498		break;
499	default:
500		panic("%s(): lock is 0x%x which is invalid!\n", __FUNCTION__,
501				lock);
502	}
503	if (!NInoAlloc(ni)) {
504		/* The inode was already cached. */
505		vn = ni->vn;
506		/*
507		 * Do not allow open-unlinked files to be opened again and
508		 * retry for NInoDeleted() inodes.
509		 *
510		 * Otherwise this could for example happen via NFS or VolFS
511		 * style access for example.
512		 */
513		if (!ni->link_count) {
514			ntfs_debug("Mft_no 0x%llx has been unlinked, "
515					"returning ENOENT.",
516					(unsigned long long)ni->mft_no);
517			err = ENOENT;
518			goto err;
519		}
520		if (NInoDeleted(ni)) {
521			if (lock == LCK_RW_TYPE_EXCLUSIVE)
522				lck_rw_unlock_exclusive(&ni->lock);
523			else if (lock == LCK_RW_TYPE_SHARED)
524				lck_rw_unlock_shared(&ni->lock);
525			if (vn) {
526				/* Remove the inode from the name cache. */
527				cache_purge(vn);
528				(void)vnode_put(vn);
529			} else
530				ntfs_inode_reclaim(ni);
531			goto retry;
532		}
533		/*
534		 * If the vnode is present and either the inode has multiple
535		 * hard links or it has no parent and/or name, update the
536		 * vnode identity with the supplied information if any.
537		 */
538		if (vn) {
539			vnode_t old_parent_vn;
540			const char *old_name;
541
542			if (is_system && !vnode_issystem(vn))
543				panic("%s(): mft_no 0x%llx, is_system is TRUE "
544						"but vnode exists and is not "
545						"marked VSYSTEM\n",
546						__FUNCTION__,
547						(unsigned long long)mft_no);
548			old_parent_vn = vnode_getparent(vn);
549			old_name = vnode_getname(vn);
550			if (ni->link_count > 1 || !old_parent_vn || !old_name) {
551				char *name = NULL;
552				int len, hash, flags;
553
554				flags = hash = len = 0;
555				/*
556				 * If a parent vnode was supplied and it is
557				 * different from the current one, update it.
558				 */
559				if (parent_vn && old_parent_vn != parent_vn) {
560					ntfs_debug("Updating vnode identity "
561							"with new parent "
562							"vnode.");
563					flags |= VNODE_UPDATE_PARENT;
564				}
565				/*
566				 * If a name was supplied and the vnode has no
567				 * name at present or the names are not the
568				 * same, update it.
569				 */
570				if (cn && (!old_name ||
571						(long)strlen(old_name) !=
572						cn->cn_namelen ||
573						bcmp(old_name, cn->cn_nameptr,
574						cn->cn_namelen))) {
575					ntfs_debug("Updating vnode identity "
576							"with new name.");
577					name = cn->cn_nameptr;
578					len = cn->cn_namelen;
579					hash = cn->cn_hash;
580					flags |= VNODE_UPDATE_NAME |
581							VNODE_UPDATE_CACHE;
582				}
583				if (flags)
584					vnode_update_identity(vn, parent_vn,
585							name, len, hash, flags);
586			}
587			if (old_name)
588				(void)vnode_putname(old_name);
589			if (!parent_vn)
590				parent_vn = old_parent_vn;
591			if (cn && cn->cn_flags & MAKEENTRY) {
592				if (parent_vn)
593					cache_enter(parent_vn, vn, cn);
594				/*
595				 * Do not want the caller to try to add the
596				 * name to the cache as well.
597				 */
598				cn->cn_flags &= ~MAKEENTRY;
599			}
600			if (old_parent_vn)
601				(void)vnode_put(old_parent_vn);
602		}
603		*nni = ni;
604		ntfs_debug("Done (found in cache).");
605		return 0;
606	}
607	/*
608	 * This is a freshly allocated inode, need to read it in now.  Also,
609	 * need to allocate and attach a vnode to the new ntfs inode.
610	 */
611	err = ntfs_inode_read(ni);
612	if (!err)
613		err = ntfs_inode_add_vnode(ni, is_system, parent_vn, cn);
614	if (!err) {
615		/*
616		 * If the inode is a directory, get the index inode now.  We
617		 * postpone this to here because we did not have the directory
618		 * vnode until now.
619		 */
620		if (S_ISDIR(ni->mode)) {
621			ntfs_inode *ini;
622
623			err = ntfs_index_inode_get(ni, I30, 4, is_system, &ini);
624			if (err) {
625				ntfs_error(vol->mp, "Failed to get index "
626						"inode.");
627				/* Kill the bad inode. */
628				vn = ni->vn;
629				(void)vnode_recycle(vn);
630				goto err;
631			}
632			/*
633			 * Copy the sizes from the index inode to the directory
634			 * inode so we do not need to get the index inode in
635			 * ntfs_vnop_getattr().
636			 *
637			 * Note @ni is totally private to us thus no need to
638			 * lock the sizes for modification.  On the other hand
639			 * @ini is not private thus we need to lock its sizes.
640			 */
641			lck_spin_lock(&ini->size_lock);
642			ni->allocated_size = ini->allocated_size;
643			ni->data_size = ini->data_size;
644			ni->initialized_size = ini->initialized_size;
645			lck_spin_unlock(&ini->size_lock);
646			/* We are done with the index vnode. */
647			(void)vnode_put(ini->vn);
648		}
649		ntfs_inode_unlock_alloc(ni);
650		*nni = ni;
651		ntfs_debug("Done (added to cache).");
652		return err;
653	}
654	if (lock == LCK_RW_TYPE_EXCLUSIVE)
655		lck_rw_unlock_exclusive(&ni->lock);
656	else if (lock == LCK_RW_TYPE_SHARED)
657		lck_rw_unlock_shared(&ni->lock);
658	ntfs_inode_reclaim(ni);
659	ntfs_debug("Failed (inode read/vnode create).");
660	return err;
661err:
662	if (lock == LCK_RW_TYPE_EXCLUSIVE)
663		lck_rw_unlock_exclusive(&ni->lock);
664	else if (lock == LCK_RW_TYPE_SHARED)
665		lck_rw_unlock_shared(&ni->lock);
666	if (vn)
667		(void)vnode_put(vn);
668	else
669		ntfs_inode_reclaim(ni);
670	return err;
671}
672
673/**
674 * ntfs_attr_inode_lookup - obtain an ntfs attribute inode if it is cached
675 * @base_ni:	base inode if @ni is not raw and non-raw inode of @ni otherwise
676 * @type:	attribute type
677 * @name:	Unicode name of the attribute (NULL if unnamed)
678 * @name_len:	length of @name in Unicode characters (0 if unnamed)
679 * @raw:	whether to get the raw inode (TRUE) or not (FALSE)
680 * @nni:	destination pointer for the obtained attribute ntfs inode
681 *
682 * Check if the ntfs inode corresponding to the attribute specified by @type,
683 * @name, and @name_len, which is present in the base mft record specified by
684 * the ntfs inode @base_ni is cached in the inode cache and if so return it
685 * taking a reference on its vnode.
686 *
687 * If @raw is true @base_ni is the non-raw inode to which @ni belongs rather
688 * than the base inode.
689 *
690 * If the attribute inode is in the cache, it is returned with an iocount
691 * reference on the attached vnode.
692 *
693 * Return 0 on success and errno on error.
694 *
695 * Locking: The base ntfs inode @base_ni must be locked (@base_ni->lock).
696 */
697errno_t ntfs_attr_inode_lookup(ntfs_inode *base_ni, ATTR_TYPE type,
698		ntfschar *name, u32 name_len, const BOOL raw, ntfs_inode **nni)
699{
700	ntfs_inode *ni;
701	ntfs_attr na;
702
703	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, "
704			"raw is %s.", (unsigned long long)base_ni->mft_no,
705			le32_to_cpu(type), (unsigned)name_len,
706			raw ? "true" : "false");
707	/* Make sure no one calls ntfs_attr_inode_get() for indices. */
708	if (type == AT_INDEX_ALLOCATION)
709		panic("%s() called for an index.\n", __FUNCTION__);
710	if (!base_ni->vn)
711		panic("%s() called with a base inode that does not have a "
712				"vnode attached.\n", __FUNCTION__);
713	na = (ntfs_attr) {
714		.mft_no = base_ni->mft_no,
715		.type = type,
716		.name = name,
717		.name_len = name_len,
718		.raw = raw,
719	};
720	ni = ntfs_inode_hash_lookup(base_ni->vol, &na);
721	if (!ni) {
722		ntfs_debug("Not cached (ENOENT).");
723		return ENOENT;
724	}
725	*nni = ni;
726	ntfs_debug("Done (found in cache).");
727	return 0;
728}
729
730/**
731 * ntfs_attr_inode_get_or_create - obtain/create an ntfs attribute inode
732 * @base_ni:	base inode if @ni is not raw and non-raw inode of @ni otherwise
733 * @type:	attribute type
734 * @name:	Unicode name of the attribute (NULL if unnamed)
735 * @name_len:	length of @name in Unicode characters (0 if unnamed)
736 * @is_system:	true if the inode is a system inode and false otherwise
737 * @raw:	whether to get the raw inode (TRUE) or not (FALSE)
738 * @options:	options specifying the get and/or create behaviour
739 * @lock:	locking options (see below)
740 * @nni:	destination pointer for the obtained attribute ntfs inode
741 *
742 * Obtain the ntfs inode corresponding to the attribute specified by @type,
743 * @name, and @name_len, which is present in the base mft record specified by
744 * the ntfs inode @base_ni.  If @is_system is true the created vnode is marked
745 * as a system vnode (via the VSYSTEM flag).
746 *
747 * If @raw is true @base_ni is the non-raw inode to which @ni belongs rather
748 * than the base inode.
749 *
750 * If @options does not specify XATTR_CREATE nor XATTR_REPLACE the attribute
751 * will be created if it does not exist already and then will be opened.
752 *
753 * If @options specifies XATTR_CREATE the call will fail if the attribute
754 * already exists, i.e. the existing attribute will not be opened.
755 *
756 * If @options specifies XATTR_REPLACE the call will fail if the attribute does
757 * not exist, i.e. the new attribute will not be created, i.e. this is the
758 * equivalent of ntfs_attr_inode_get().
759 *
760 * A special case is the resource fork (@name == NTFS_SFM_RESOURCEFORK_NAME).
761 * If it exists but has zero size it is treated as if it does not exist when
762 * handling the XATTR_CREATE and XATTR_REPLACE flags in @options.  Thus if the
763 * resource fork exists but is zero size, a call with XATTR_CREATE set in
764 * @options will succeed as if it did not already exist and a call with
765 * XATTR_REPLACE set in @options will fail as if it did not already exist.
766 *
767 * If @lock is LCK_RW_TYPE_SHARED the attribute inode will be returned locked
768 * for reading (@nni->lock) and if it is LCK_RW_TYPE_EXCLUSIVE the attribute
769 * inode will be returned locked for writing (@nni->lock).  As a special case
770 * if @lock is 0 it means the inode to be returned is already locked so do not
771 * lock it.  This requires that the inode is already present in the inode
772 * cache.  If it is not it cannot already be locked and thus you will get a
773 * panic().
774 *
775 * If the attribute inode is in the cache, it is returned with an iocount
776 * reference on the attached vnode.
777 *
778 * If the inode is not in the cache, a new ntfs inode is allocated and
779 * initialized, ntfs_attr_inode_read_or_create() is called to read it in/create
780 * it and fill in the remainder of the ntfs inode structure before finally a
781 * new vnode is created and attached to the new ntfs inode.  The inode is then
782 * returned with an iocount reference taken on its vnode.
783 *
784 * Note we use the base vnode as the parent vnode of the attribute vnode to be
785 * in line with how OS X treats named stream vnodes.
786 *
787 * Note, for index allocation attributes, you need to use ntfs_index_inode_get()
788 * instead of ntfs_attr_inode_get() as working with indices is a lot more
789 * complex.
790 *
791 * Return 0 on success and errno on error.  In the error case the lock state of
792 * the inode is left in the same state as it was before this function was
793 * called.
794 *
795 * TODO: For now we do not store a name for attribute inodes.
796 */
797errno_t ntfs_attr_inode_get_or_create(ntfs_inode *base_ni, ATTR_TYPE type,
798		ntfschar *name, u32 name_len, const BOOL is_system,
799		const BOOL raw, const int options, const lck_rw_type_t lock,
800		ntfs_inode **nni)
801{
802	ntfs_inode *ni;
803	vnode_t vn;
804	int err;
805	BOOL promoted;
806	ntfs_attr na;
807	BOOL isstream = FALSE;
808
809	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, "
810			"is_system is %s, raw is %s, options 0x%x, lock 0x%x.",
811			(unsigned long long)base_ni->mft_no, le32_to_cpu(type),
812			(unsigned)name_len, is_system ? "true" : "false",
813			raw ? "true" : "false", (unsigned)options,
814			(unsigned)lock);
815	/* Make sure no one calls us for indices. */
816	if (type == AT_INDEX_ALLOCATION)
817		panic("%s() called for an index.\n", __FUNCTION__);
818	if (!base_ni->vn)
819		panic("%s() called with a base inode that does not have a "
820				"vnode attached.\n", __FUNCTION__);
821	promoted = FALSE;
822retry:
823	na = (ntfs_attr) {
824		.mft_no = base_ni->mft_no,
825		.type = type,
826		.name = name,
827		.name_len = name_len,
828		.raw = raw,
829	};
830	ni = ntfs_inode_hash_get(base_ni->vol, &na);
831	if (!ni) {
832		ntfs_debug("Failed (ENOMEM).");
833		return ENOMEM;
834	}
835	/*
836	 * Lock the inode for reading/writing as requested by the caller.
837	 *
838	 * If the caller specified that the inode is already locked, verify
839	 * that the inode was already in the cache and panic() if not.
840	 */
841	if (lock) {
842		if (promoted || lock == LCK_RW_TYPE_EXCLUSIVE)
843			lck_rw_lock_exclusive(&ni->lock);
844		else if (lock == LCK_RW_TYPE_SHARED)
845			lck_rw_lock_shared(&ni->lock);
846		else
847			panic("%s(): lock is 0x%x which is invalid!\n",
848					__FUNCTION__, lock);
849	} else if (NInoAlloc(ni))
850		panic("%s(): !lock but NInoAlloc(ni)\n", __FUNCTION__);
851	if (!NInoAlloc(ni)) {
852		/* The inode was already cached. */
853		vn = ni->vn;
854		/*
855		 * If @options specifies XATTR_REPLACE do not allow
856		 * open-unlinked or NInoDeleted() attribute inodes to be opened
857		 * again.
858		 *
859		 * Otherwise retry if the attribute inode is NInoDeleted() and
860		 * re-link it if it is open-unlinked.  In the latter case also
861		 * truncate it to zero size.
862		 */
863		if (NInoDeleted(ni) || !ni->link_count) {
864			if (NInoDeleted(ni)) {
865				/* Remove the inode from the name cache. */
866				if (vn)
867					cache_purge(vn);
868			}
869			if (options & XATTR_REPLACE) {
870				ntfs_debug("Attribute in mft_no 0x%llx is "
871						"deleted/unlinked, returning "
872						"ENOENT.",
873						(unsigned long long)ni->mft_no);
874				err = ENOENT;
875				goto err;
876			}
877			/*
878			 * XATTR_REPLACE is not specified thus retry if the
879			 * attribute inode is NInoDeleted().
880			 */
881relocked:
882			if (NInoDeleted(ni)) {
883				if (lock) {
884					if (promoted || lock ==
885							LCK_RW_TYPE_EXCLUSIVE)
886						lck_rw_unlock_exclusive(
887								&ni->lock);
888					else
889						lck_rw_unlock_shared(&ni->lock);
890				}
891				if (vn)
892					(void)vnode_put(vn);
893				else
894					ntfs_inode_reclaim(ni);
895				goto retry;
896			}
897			/*
898			 * The attribute inode is open-unlinked, we need it
899			 * locked exclusive before we can re-link it.
900			 */
901			if (lock == LCK_RW_TYPE_SHARED && !promoted) {
902				promoted = TRUE;
903				if (!lck_rw_lock_shared_to_exclusive(
904						&ni->lock)) {
905					/*
906					 * We dropped the lock so take it
907					 * again and then redo the checking for
908					 * the inode being deleted.
909					 */
910					lck_rw_lock_exclusive(&ni->lock);
911					goto relocked;
912				}
913			}
914			if (ni->link_count) {
915				/*
916				 * Someone else already re-linked it.  If
917				 * @options specifies XATTR_CREATE we need to
918				 * abort.
919				 */
920				goto exists;
921			}
922			/*
923			 * Re-link the attribute inode and truncate it
924			 * to zero size thus pretending we created it.
925			 */
926			ntfs_debug("Re-instantiating open-unlinked attribute "
927					"in mft_no 0x%llx.",
928					(unsigned long long)ni->mft_no);
929			ni->link_count = 1;
930			err = ntfs_attr_resize(ni, 0, 0, NULL);
931			if (err) {
932				ntfs_error(ni->vol->mp, "Failed to truncate "
933						"re-linked attribute in "
934						"mft_no 0x%llx (error %d).",
935						(unsigned long long)ni->mft_no,
936						err);
937				goto err;
938			}
939		} else {
940			/*
941			 * The attribute inode already exists.
942			 *
943			 * If it is the empty resource fork we need to fail if
944			 * @options specifies XATTR_REPLACE.
945			 *
946			 * If @options specifies XATTR_CREATE we need to abort
947			 * unless this is the resource fork and it is empty.
948			 */
949exists:
950			if (name == NTFS_SFM_RESOURCEFORK_NAME) {
951				s64 size;
952
953				if (vn)
954					size = ubc_getsize(vn);
955				else {
956					lck_spin_lock(&ni->size_lock);
957					size = ni->data_size;
958					lck_spin_unlock(&ni->size_lock);
959				}
960				if (!size) {
961					if (options & XATTR_REPLACE) {
962						ntfs_debug("Attribute mft_no "
963								"0x%llx does "
964								"not exist, "
965								"returning "
966								"ENOENT.",
967								(unsigned long
968								long)
969								ni->mft_no);
970						err = ENOENT;
971						goto err;
972					}
973					if (options & XATTR_CREATE)
974						goto allow_rsrc_fork;
975				}
976			}
977			if (options & XATTR_CREATE) {
978				ntfs_debug("Attribute mft_no 0x%llx already "
979						"exists, returning EEXIST.",
980						(unsigned long long)ni->mft_no);
981				err = EEXIST;
982				goto err;
983			}
984		}
985allow_rsrc_fork:
986		if (vn) {
987			vnode_t parent_vn;
988
989			parent_vn = vnode_getparent(vn);
990			if (parent_vn != base_ni->vn) {
991				ntfs_debug("Updating vnode identity with new "
992						"parent vnode.");
993				vnode_update_identity(vn, base_ni->vn, NULL,
994						0, 0, VNODE_UPDATE_PARENT);
995			}
996			if (parent_vn)
997				(void)vnode_put(parent_vn);
998		}
999		if (promoted)
1000			lck_rw_lock_exclusive_to_shared(&ni->lock);
1001		*nni = ni;
1002		ntfs_debug("Done (found in cache).");
1003		return 0;
1004	}
1005	/*
1006	 * We do not need to hold the inode lock exclusive as we already have
1007	 * guaranteed exclusive access to the attribute inode as NInoAlloc() is
1008	 * still set and we do not clear it until we are done thus demote it to
1009	 * a shared lock if we promoted it earlier.
1010	 */
1011	if (promoted)
1012		lck_rw_lock_exclusive_to_shared(&ni->lock);
1013	/*
1014	 * This is a freshly allocated inode, need to read it in/create it now.
1015	 * Also, need to allocate and attach a vnode to the new ntfs inode.
1016	 */
1017	err = ntfs_attr_inode_read_or_create(base_ni, ni, options);
1018	if (!err) {
1019		if (name == NTFS_SFM_RESOURCEFORK_NAME)
1020			isstream = TRUE;
1021		err = ntfs_inode_add_vnode_attr(ni, is_system, base_ni->vn, NULL, isstream);
1022	}
1023	if (!err) {
1024		ntfs_inode_unlock_alloc(ni);
1025		*nni = ni;
1026		ntfs_debug("Done (added to cache).");
1027		return err;
1028	}
1029	if (lock) {
1030		if (lock == LCK_RW_TYPE_SHARED)
1031			lck_rw_unlock_shared(&ni->lock);
1032		else
1033			lck_rw_unlock_exclusive(&ni->lock);
1034	}
1035	ntfs_inode_reclaim(ni);
1036	ntfs_debug("Failed (inode read/vnode create, error %d).", err);
1037	return err;
1038err:
1039	if (lock) {
1040		if (promoted || lock == LCK_RW_TYPE_EXCLUSIVE)
1041			lck_rw_unlock_exclusive(&ni->lock);
1042		else
1043			lck_rw_unlock_shared(&ni->lock);
1044	}
1045	if (vn)
1046		(void)vnode_put(vn);
1047	else
1048		ntfs_inode_reclaim(ni);
1049	return err;
1050}
1051
1052/**
1053 * ntfs_index_inode_get - obtain an ntfs inode corresponding to an index
1054 * @base_ni:	ntfs base inode containing the index related attributes
1055 * @name:	Unicode name of the index
1056 * @name_len:	length of @name in Unicode characters
1057 * @is_system:	true if the inode is a system inode and false otherwise
1058 * @nni:	destination pointer for the obtained index ntfs inode
1059 *
1060 * Obtain the ntfs inode corresponding to the index specified by @name and
1061 * @name_len, which is present in the base mft record specified by the ntfs
1062 * inode @base_ni.  If @is_system is true the created vnode is marked as a
1063 * system vnode (via the VSYSTEM flag).
1064 *
1065 * If the index inode is in the cache, it is returned with an iocount reference
1066 * on the attached vnode.
1067 *
1068 * If the inode is not in the cache, a new ntfs inode is allocated and
1069 * initialized, ntfs_index_inode_read() is called to read it in and fill in the
1070 * remainder of the ntfs inode structure before finally a new vnode is created
1071 * and attached to the new ntfs inode.  The inode is then returned with an
1072 * iocount reference taken on its vnode.
1073 *
1074 * Note we use the base vnode as the parent vnode of the index vnode to be in
1075 * line with how OS X treats named stream vnodes.
1076 *
1077 * Return 0 on success and errno on error.
1078 *
1079 * TODO: For now we do not store a name for attribute inodes.
1080 */
1081errno_t ntfs_index_inode_get(ntfs_inode *base_ni, ntfschar *name, u32 name_len,
1082		const BOOL is_system, ntfs_inode **nni)
1083{
1084	ntfs_inode *ni;
1085	ntfs_attr na;
1086	int err;
1087
1088	ntfs_debug("Entering for mft_no 0x%llx, name_len 0x%x, is_system is "
1089			"%s.", (unsigned long long)base_ni->mft_no,
1090			(unsigned)name_len, is_system ? "true" : "false");
1091	if (!base_ni->vn)
1092		panic("%s() called with a base inode that does not have a "
1093				"vnode attached.\n", __FUNCTION__);
1094	na = (ntfs_attr) {
1095		.mft_no = base_ni->mft_no,
1096		.type = AT_INDEX_ALLOCATION,
1097		.name = name,
1098		.name_len = name_len,
1099		.raw = FALSE,
1100	};
1101	ni = ntfs_inode_hash_get(base_ni->vol, &na);
1102	if (!ni) {
1103		ntfs_debug("Failed (ENOMEM).");
1104		return ENOMEM;
1105	}
1106	if (!NInoAlloc(ni)) {
1107		vnode_t vn;
1108
1109		vn = ni->vn;
1110		/*
1111		 * Do not allow open-unlinked attribute inodes to be opened
1112		 * again.
1113		 */
1114		if (!ni->link_count) {
1115			ntfs_debug("Mft_no 0x%llx has been unlinked, "
1116					"returning ENOENT.",
1117					(unsigned long long)ni->mft_no);
1118			if (vn)
1119				(void)vnode_put(vn);
1120			else
1121				ntfs_inode_reclaim(ni);
1122			return ENOENT;
1123		}
1124		if (vn) {
1125			vnode_t parent_vn;
1126
1127			parent_vn = vnode_getparent(vn);
1128			if (parent_vn != base_ni->vn) {
1129				ntfs_debug("Updating vnode identity with new "
1130						"parent vnode.");
1131				vnode_update_identity(vn, base_ni->vn, NULL,
1132						0, 0, VNODE_UPDATE_PARENT);
1133			}
1134			if (parent_vn)
1135				(void)vnode_put(parent_vn);
1136		}
1137		*nni = ni;
1138		ntfs_debug("Done (found in cache).");
1139		return 0;
1140	}
1141	/*
1142	 * This is a freshly allocated inode, need to read it in now.  Also,
1143	 * need to allocate and attach a vnode to the new ntfs inode.
1144	 */
1145	err = ntfs_index_inode_read(base_ni, ni);
1146	if (!err)
1147		err = ntfs_inode_add_vnode(ni, is_system, base_ni->vn, NULL);
1148	if (!err) {
1149		ntfs_inode_unlock_alloc(ni);
1150		*nni = ni;
1151		ntfs_debug("Done (added to cache).");
1152		return err;
1153	}
1154	ntfs_inode_reclaim(ni);
1155	ntfs_debug("Failed (inode read/vnode create).");
1156	return err;
1157}
1158
1159/**
1160 * ntfs_extent_inode_get - obtain an extent inode belonging to a base inode
1161 * @base_ni:	ntfs base inode whose extent inode to get
1162 * @mref:	mft reference of extent inode to obtain
1163 * @ext_ni:	destination pointer for the obtained extent ntfs inode
1164 *
1165 * Obtain the extent ntfs inode with mft reference @mref belonging to the base
1166 * inode @base_ni.
1167 *
1168 * If the inode is in the cache, it is returned.
1169 *
1170 * If the inode is not in the cache, a new ntfs inode is allocated, initialized
1171 * and then returned.
1172 *
1173 * Return 0 on success and errno on error.
1174 *
1175 * Note: No vnode is attached to the extent ntfs inode.
1176 */
1177errno_t ntfs_extent_inode_get(ntfs_inode *base_ni, MFT_REF mref,
1178		ntfs_inode **ext_ni)
1179{
1180	ntfs_inode *ni;
1181	ntfs_attr na;
1182	u16 seq_no = MSEQNO(mref);
1183
1184	ntfs_debug("Entering for mft_no 0x%llx.",
1185			(unsigned long long)MREF(mref));
1186	na = (ntfs_attr) {
1187		.mft_no = MREF(mref),
1188		.type = AT_UNUSED,
1189		.raw = FALSE,
1190	};
1191	ni = ntfs_inode_hash_get(base_ni->vol, &na);
1192	if (!ni) {
1193		ntfs_debug("Failed (ENOMEM).");
1194		return ENOMEM;
1195	}
1196	if (!NInoAlloc(ni)) {
1197		if (!seq_no || ni->seq_no == seq_no) {
1198			*ext_ni = ni;
1199			ntfs_debug("Done (found in cache).");
1200			return 0;
1201		}
1202		ntfs_inode_reclaim(ni);
1203		ntfs_error(base_ni->vol->mp, "Found stale extent mft "
1204				"reference!  Corrupt file system.  Run "
1205				"chkdsk.");
1206		return EIO;
1207	}
1208	/*
1209	 * This is a freshly allocated inode, need to finish setting it up as
1210	 * an extent inode now.  Note we do not take a reference on the vnode
1211	 * of the base inode because that would pin the base inode which would
1212	 * make it unfreeable.  This is not a problem as when the base vnode is
1213	 * reclaimed, we release all attached extent inodes, too.  Also, we
1214	 * simply set the sequence number rather than verify it against the one
1215	 * in the extent mft record and we leave it to the caller to verify the
1216	 * sequence number after mapping the extent mft record.
1217	 */
1218	ni->seq_no = seq_no;
1219	ni->nr_extents = -1;
1220	ni->base_ni = base_ni;
1221	ntfs_inode_unlock_alloc(ni);
1222	*ext_ni = ni;
1223	ntfs_debug("Done (added to cache).");
1224	return 0;
1225}
1226
1227/**
1228 * ntfs_inode_is_extended_system - check if an inode is in the $Extend directory
1229 * @ctx:	initialized attribute search context
1230 * @is_system:	pointer in which to return whether the inode is a system one
1231 *
1232 * Search all filename attributes in the inode described by the attribute
1233 * search context @ctx and check if any of the names are in the $Extend system
1234 * directory.
1235 *
1236 * If the inode is a system inode *@is_system is true and if it is not a system
1237 * inode it is false.
1238 *
1239 * Return 0 on success and errno on error.  On error, *@is_system is undefined.
1240 */
1241static errno_t ntfs_inode_is_extended_system(ntfs_attr_search_ctx *ctx,
1242		BOOL *is_system)
1243{
1244	ntfs_volume *vol;
1245	unsigned nr_links;
1246	errno_t err;
1247
1248	ntfs_debug("Entering for mft_no 0x%llx.",
1249			(unsigned long long)ctx->ni->mft_no);
1250	vol = ctx->ni->vol;
1251	/* Restart search. */
1252	ntfs_attr_search_ctx_reinit(ctx);
1253	/* Get number of hard links. */
1254	nr_links = le16_to_cpu(ctx->m->link_count);
1255	if (!nr_links) {
1256		ntfs_error(vol->mp, "Hard link count is zero.");
1257		return EIO;
1258	}
1259	/* Loop through all hard links. */
1260	while (!(err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0,
1261			ctx))) {
1262		FILENAME_ATTR *fn;
1263		ATTR_RECORD *a = ctx->a;
1264		u8 *a_end, *fn_end;
1265
1266		nr_links--;
1267		/*
1268		 * Maximum sanity checking as we are called on an inode that we
1269		 * suspect might be corrupt.
1270		 */
1271		if (a->non_resident) {
1272			ntfs_error(vol->mp, "Filename is non-resident.");
1273			return EIO;
1274		}
1275		if (a->flags) {
1276			ntfs_error(vol->mp, "Filename has invalid flags.");
1277			return EIO;
1278		}
1279		if (!(a->resident_flags & RESIDENT_ATTR_IS_INDEXED)) {
1280			ntfs_error(vol->mp, "Filename is not indexed.");
1281			return EIO;
1282		}
1283		a_end = (u8*)a + le32_to_cpu(a->length);
1284		fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
1285		fn_end = (u8*)fn + le32_to_cpu(a->value_length);
1286		if ((u8*)fn < (u8*)a || fn_end < (u8*)a || fn_end > a_end ||
1287				a_end > (u8*)ctx->m + vol->mft_record_size) {
1288			ntfs_error(vol->mp, "Filename attribute is corrupt.");
1289			return EIO;
1290		}
1291		/* This attribute is ok, but is it in the $Extend directory? */
1292		if (MREF_LE(fn->parent_directory) == FILE_Extend) {
1293			ntfs_debug("Done (system).");
1294			*is_system = TRUE;
1295			return 0;
1296		}
1297	}
1298	if (err != ENOENT) {
1299		ntfs_error(vol->mp, "Failed to lookup filename attribute.");
1300		return err;
1301	}
1302	if (nr_links) {
1303		ntfs_error(vol->mp, "Hard link count does not match number of "
1304				"filename attributes.");
1305		return EIO;
1306	}
1307	ntfs_debug("Done (not system).");
1308	*is_system = FALSE;
1309	return 0;
1310}
1311
1312/**
1313 * ntfs_inode_afpinfo_cache - cache the AfpInfo in the corresponding ntfs inode
1314 * @ni:		base ntfs inode in which to cache the AfpInfo
1315 * @afp:	AfpInfo to cache
1316 * @afp_size:	size in bytes of AfpInfo
1317 *
1318 * If @afp is not NULL copy the backup time and the Finder info from the
1319 * AfpInfo @afp of size @afp_size bytes to the base ntfs inode @ni.
1320 *
1321 * If @afp is NULL or the AfpInfo is invalid (wrong signature, version, or
1322 * size), we ignore the AfpInfo data and set up @ni with defaults for both
1323 * @ni->backup_time and @ni->finder_info.
1324 *
1325 * This function has no return value.
1326 */
1327void ntfs_inode_afpinfo_cache(ntfs_inode *ni, AFPINFO *afp,
1328		const unsigned afp_size)
1329{
1330	if (afp && (afp->signature != AfpInfo_Signature ||
1331			afp->version != AfpInfo_Version ||
1332			afp_size < sizeof(*afp))) {
1333		ntfs_warning(ni->vol->mp, "AFP_AfpInfo data attribute of "
1334				"mft_no 0x%llx contains invalid data (wrong "
1335				"signature, wrong version, or wrong size), "
1336				"ignoring and using defaults.",
1337				(unsigned long long)ni->mft_no);
1338		afp = NULL;
1339	}
1340	if (!NInoValidBackupTime(ni)) {
1341		if (afp)
1342			ni->backup_time = ntfs_ad2utc(afp->backup_time);
1343		else
1344			ni->backup_time = ntfs_ad2utc(const_cpu_to_sle32(
1345					INT32_MIN));
1346		NInoSetValidBackupTime(ni);
1347	}
1348	if (!NInoValidFinderInfo(ni)) {
1349		if (afp)
1350			memcpy(&ni->finder_info, &afp->finder_info,
1351					sizeof(ni->finder_info));
1352		else
1353			bzero(&ni->finder_info, sizeof(ni->finder_info));
1354		/*
1355		 * If the file is hidden we need to mirror this fact to the
1356		 * Finder hidden bit as SFM does not set the Finder hidden bit
1357		 * on disk but VNOP_GETATTR() does return it as set so it gets
1358		 * kept in sync in memory only.
1359		 *
1360		 * Just in case we will also set the FILE_ATTR_HIDDEN bit in
1361		 * the file_attributes if the Finder hidden bit is set but
1362		 * FILE_ATTR_HIDDEN is not set.  This should never happen but
1363		 * it does not harm to have the sync go both ways so we do it
1364		 * especially as that is effectively what HFS and AFP (client)
1365		 * do, too.
1366		 */
1367		if (ni->file_attributes & FILE_ATTR_HIDDEN)
1368			ni->finder_info.attrs |= FINDER_ATTR_IS_HIDDEN;
1369		else if (ni->finder_info.attrs & FINDER_ATTR_IS_HIDDEN) {
1370			ni->file_attributes |= FILE_ATTR_HIDDEN;
1371			NInoSetDirtyFileAttributes(ni);
1372		}
1373		NInoSetValidFinderInfo(ni);
1374	}
1375}
1376
1377/**
1378 * ntfs_inode_afpinfo_read - load the non-resident AfpInfo and cache it
1379 * @ni:		base ntfs inode whose AfpInfo to load and cache
1380 *
1381 * Load the AfpInfo attribute into memory and copy the backup time and Finder
1382 * info to the base ntfs inode @ni.
1383 *
1384 * Return 0 on success and errno on error.
1385 *
1386 * Note if the AfpInfo is invalid (wrong signature, wrong version, or wrong
1387 * size), we still return success but we do not copy anything thus the caller
1388 * has to check that NInoValidBackupTime(@ni) and NInoValidFinderInfo(@ni) are
1389 * true before using @ni->backup_time and @ni->finder_info, respectively.
1390 *
1391 * Locking: Caller must hold @ni->lock for writing.
1392 */
1393errno_t ntfs_inode_afpinfo_read(ntfs_inode *ni)
1394{
1395	ntfs_inode *afp_ni;
1396	upl_t upl;
1397	upl_page_info_array_t pl;
1398	AFPINFO *afp;
1399	unsigned afp_size;
1400	errno_t err;
1401
1402	ntfs_debug("Entering for mft_no 0x%llx.",
1403			(unsigned long long)ni->mft_no);
1404	if (NInoValidBackupTime(ni) && NInoValidFinderInfo(ni)) {
1405		ntfs_debug("Done (both backup time and Finder info are "
1406				"already valid).");
1407		return 0;
1408	}
1409	/* Get the attribute inode for the AFP_AfpInfo named stream. */
1410	err = ntfs_attr_inode_get(ni, AT_DATA, NTFS_SFM_AFPINFO_NAME, 11,
1411			FALSE, LCK_RW_TYPE_SHARED, &afp_ni);
1412	if (err) {
1413		ntfs_error(ni->vol->mp, "Failed to get $DATA/AFP_AfpInfo "
1414				"attribute inode mft_no 0x%llx (error %d).",
1415				(unsigned long long)ni->mft_no, err);
1416		return err;
1417	}
1418	err = ntfs_page_map(afp_ni, 0, &upl, &pl, (u8**)&afp, FALSE);
1419	if (err) {
1420		ntfs_error(ni->vol->mp, "Failed to read AfpInfo from "
1421				"$DATA/AFP_AfpInfo attribute inode mft_no "
1422				"0x%llx (error %d).",
1423				(unsigned long long)ni->mft_no, err);
1424		goto err;
1425	}
1426	lck_spin_lock(&afp_ni->size_lock);
1427	afp_size = afp_ni->data_size;
1428	lck_spin_unlock(&afp_ni->size_lock);
1429	if (afp_size > PAGE_SIZE)
1430		afp_size = PAGE_SIZE;
1431	ntfs_inode_afpinfo_cache(ni, afp, afp_size);
1432	ntfs_page_unmap(afp_ni, upl, pl, FALSE);
1433	ntfs_debug("Done.");
1434err:
1435	lck_rw_unlock_shared(&afp_ni->lock);
1436	(void)vnode_put(afp_ni->vn);
1437	return err;
1438}
1439
1440/**
1441 * ntfs_finder_info_is_unused - check if a Finder info is not in use
1442 * @ni:		ntfs info whose Finder info to check
1443 *
1444 * Return true if the Finder info of the ntfs inode @ni is unused and false
1445 * otherwise.
1446 *
1447 * This function takes into account that a set FINDER_ATTR_IS_HIDDEN bit is
1448 * masked out as the FINDER_ATTR_IS_HIDDEN bit is not stored on disk in the
1449 * Finder info.
1450 *
1451 * Note the Finder info must be valid or this function will cause a panic().
1452 *
1453 * Locking: Caller must hold the inode lock (@ni->lock).
1454 */
1455static BOOL ntfs_finder_info_is_unused(ntfs_inode *ni)
1456{
1457	FINDER_INFO fi;
1458
1459	if (!NInoValidFinderInfo(ni))
1460		panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__);
1461	memcpy(&fi, &ni->finder_info, sizeof(fi));
1462	fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
1463	return !bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi));
1464}
1465
1466/**
1467 * ntfs_inode_afpinfo_sync - sync the cached AfpInfo
1468 * @afp:	AfpInfo to sync to
1469 * @afp_size:	size in bytes of AfpInfo
1470 * @ni:		base ntfs inode which contains the cache of the AfpInfo
1471 *
1472 * Copy @ni->backup_time and @ni->finder_info from the base ntfs inode @ni to
1473 * the AfpInfo @afp of size @afp_size bytes.
1474 *
1475 * This function has no return value.
1476 */
1477static void ntfs_inode_afpinfo_sync(AFPINFO *afp, const unsigned afp_size,
1478		ntfs_inode *ni)
1479{
1480	if (NInoTestClearDirtyBackupTime(ni))
1481		afp->backup_time = ntfs_utc2ad(ni->backup_time);
1482	if (NInoTestClearDirtyFinderInfo(ni)) {
1483		if (afp_size < sizeof(ni->finder_info))
1484			panic("%s(): afp_size < sizeof(ni->finder_info)!\n",
1485					__FUNCTION__);
1486		memcpy(&afp->finder_info, &ni->finder_info,
1487				sizeof(ni->finder_info));
1488		/*
1489		 * If the file is hidden we need to clear the Finder hidden bit
1490		 * on disk as SFM does not set it on disk either as it just
1491		 * sets the FILE_ATTR_HIDDEN bit in the file_attributes of the
1492		 * $STANDARD_INFORMATION attribute.  We do this unconditionally
1493		 * for efficiency.
1494		 *
1495		 * Just in case we will also set the FILE_ATTR_HIDDEN bit in
1496		 * the file_attributes if the Finder hidden bit is set but
1497		 * FILE_ATTR_HIDDEN is not set.  This should never happen but
1498		 * it does not harm to have the sync go both ways so we do it
1499		 * especially as that is effectively what HFS and AFP (client)
1500		 * do, too.
1501		 */
1502		if (ni->finder_info.attrs & FINDER_ATTR_IS_HIDDEN &&
1503				!(ni->file_attributes & FILE_ATTR_HIDDEN)) {
1504			ni->file_attributes |= FILE_ATTR_HIDDEN;
1505			NInoSetDirtyFileAttributes(ni);
1506		}
1507		afp->finder_info.attrs &= ~FINDER_ATTR_IS_HIDDEN;
1508	}
1509}
1510
1511/**
1512 * ntfs_inode_afpinfo_write - update the non-resident AfpInfo on disk
1513 * @ni:		base ntfs inode whose AfpInfo to update on disk from cache
1514 *
1515 * Update the non-resident AfpInfo attribute from the cached backup time and
1516 * Finder info in the base ntfs inode @ni and write it to disk.
1517 *
1518 * If the new backup time and Finder info are the defaults then delete the
1519 * AfpInfo attribute instead of updating it.
1520 *
1521 * Return 0 on success and errno on error.
1522 *
1523 * Locking: Caller must hold @ni->lock for writing.
1524 */
1525errno_t ntfs_inode_afpinfo_write(ntfs_inode *ni)
1526{
1527	ntfs_inode *afp_ni;
1528	upl_t upl;
1529	upl_page_info_array_t pl;
1530	AFPINFO *afp;
1531	unsigned afp_size;
1532	sle32 backup_time;
1533	errno_t err;
1534	BOOL delete, update;
1535
1536	backup_time = ntfs_utc2ad(ni->backup_time);
1537	delete = FALSE;
1538	if (backup_time == const_cpu_to_sle32(INT32_MIN) &&
1539			ntfs_finder_info_is_unused(ni))
1540		delete = TRUE;
1541	ntfs_debug("Entering for mft_no 0x%llx, delete is %s.",
1542			(unsigned long long)ni->mft_no,
1543			delete ? "true" : "false");
1544	/*
1545	 * FIXME: If the inode is encrypted we cannot access the AFP_AfpInfo
1546	 * named stream so no point in trying to do it.  We just pretend to
1547	 * succeed even though we do not do anything.
1548	 *
1549	 * We warn the user about this so they do not get confused.
1550	 */
1551	if (NInoEncrypted(ni)) {
1552		ntfs_warning(ni->vol->mp, "Inode 0x%llx is encrypted thus "
1553				"cannot write AFP_AfpInfo attribute.  "
1554				"Pretending the update succeeded to keep the "
1555				"system happy.",
1556				(unsigned long long)ni->mft_no);
1557		err = 0;
1558		goto err;
1559	}
1560	if (!NInoValidBackupTime(ni) || !NInoValidFinderInfo(ni)) {
1561		/*
1562		 * Load the AFP_AfpInfo stream and initialize the backup time
1563		 * and Finder info (if they are not already valid).
1564		 */
1565		err = ntfs_inode_afpinfo_read(ni);
1566		if (err) {
1567			ntfs_error(ni->vol->mp, "Failed to read AFP_AfpInfo "
1568					"attribute from inode mft_no 0x%llx "
1569					"(error %d).",
1570					(unsigned long long)ni->mft_no, err);
1571			goto err;
1572		}
1573	}
1574	/*
1575	 * Get the attribute inode for the AFP_AfpInfo named stream.  If
1576	 * @delete is false create it if it does not exist and if @delete is
1577	 * true only get the inode if it exists.
1578	 */
1579	err = ntfs_attr_inode_get_or_create(ni, AT_DATA, NTFS_SFM_AFPINFO_NAME,
1580			11, FALSE, FALSE, delete ? XATTR_REPLACE : 0,
1581			LCK_RW_TYPE_EXCLUSIVE, &afp_ni);
1582	if (err) {
1583		if (err == ENOENT && delete) {
1584			ntfs_debug("AFP_AfpInfo attribute does not exist in "
1585					"mft_no 0x%llx, no need to delete it.",
1586					(unsigned long long)ni->mft_no);
1587			err = 0;
1588		} else
1589			ntfs_error(ni->vol->mp, "Failed to get or create "
1590					"$DATA/AFP_AfpInfo attribute inode "
1591					"mft_no 0x%llx (error %d).",
1592					(unsigned long long)ni->mft_no, err);
1593		goto err;
1594	}
1595	if (delete) {
1596		ntfs_debug("Unlinking AFP_AfpInfo attribute inode mft_no "
1597				"0x%llx.", (unsigned long long)ni->mft_no);
1598		/*
1599		 * Unlink the attribute inode.  The last close will cause the
1600		 * VFS to call ntfs_vnop_inactive() which will do the actual
1601		 * removal.
1602		 */
1603		afp_ni->link_count = 0;
1604		/*
1605		 * Update the last_mft_change_time (ctime) in the inode as
1606		 * named stream/extended attribute semantics expect on OS X.
1607		 */
1608		ni->last_mft_change_time = ntfs_utc_current_time();
1609		NInoSetDirtyTimes(ni);
1610		/*
1611		 * If this is not a directory or it is an encrypted directory,
1612		 * set the needs archiving bit except for the core system
1613		 * files.
1614		 */
1615		if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
1616			BOOL need_set_archive_bit = TRUE;
1617			if (ni->vol->major_ver >= 2) {
1618				if (ni->mft_no <= FILE_Extend)
1619					need_set_archive_bit = FALSE;
1620			} else {
1621				if (ni->mft_no <= FILE_UpCase)
1622					need_set_archive_bit = FALSE;
1623			}
1624			if (need_set_archive_bit) {
1625				ni->file_attributes |= FILE_ATTR_ARCHIVE;
1626				NInoSetDirtyFileAttributes(ni);
1627			}
1628		}
1629		goto done;
1630	}
1631	update = TRUE;
1632	lck_spin_lock(&afp_ni->size_lock);
1633	afp_size = afp_ni->data_size;
1634	lck_spin_unlock(&afp_ni->size_lock);
1635	if (afp_ni->data_size != sizeof(AFPINFO)) {
1636		err = ntfs_attr_resize(afp_ni, sizeof(AFPINFO), 0, NULL);
1637		if (err) {
1638			ntfs_warning(ni->vol->mp, "Failed to set size of "
1639					"$DATA/AFP_AfpInfo attribute inode "
1640					"mft_no 0x%llx (error %d).  Cannot "
1641					"update AfpInfo.",
1642					(unsigned long long)ni->mft_no, err);
1643			goto unl_err;
1644		}
1645		ntfs_debug("Set size of $DATA/AFP_AfpInfo attribute inode "
1646				"mft_no 0x%llx to sizeof(AFPINFO) (%ld) "
1647				"bytes.", (unsigned long long)ni->mft_no,
1648				sizeof(AFPINFO));
1649		lck_spin_lock(&afp_ni->size_lock);
1650		afp_size = afp_ni->data_size;
1651		lck_spin_unlock(&afp_ni->size_lock);
1652		if (afp_size != sizeof(AFPINFO))
1653			panic("%s(): afp_size != sizeof(AFPINFO)\n",
1654					__FUNCTION__);
1655		update = FALSE;
1656	}
1657	/*
1658	 * If we resized the attribute then we do not care for the old contents
1659	 * so we grab the page instead of mapping it (@update is false in this
1660	 * case).
1661	 */
1662	err = ntfs_page_map_ext(afp_ni, 0, &upl, &pl, (u8**)&afp, update, TRUE);
1663	if (err) {
1664		ntfs_error(ni->vol->mp, "Failed to map AfpInfo data of "
1665				"$DATA/AFP_AfpInfo attribute inode mft_no "
1666				"0x%llx (error %d).",
1667				(unsigned long long)ni->mft_no, err);
1668		goto unl_err;
1669	}
1670	if (!update) {
1671		/*
1672		 * We need to rewrite the AfpInfo from scratch so for
1673		 * simplicity start with a clean slate.
1674		 */
1675		bzero(afp, PAGE_SIZE);
1676		afp->signature = AfpInfo_Signature;
1677		afp->version = AfpInfo_Version;
1678		afp->backup_time = const_cpu_to_sle32(INT32_MIN);
1679	}
1680	ntfs_inode_afpinfo_sync(afp, afp_size, ni);
1681	ntfs_page_unmap(afp_ni, upl, pl, TRUE);
1682done:
1683	lck_rw_unlock_exclusive(&afp_ni->lock);
1684	(void)vnode_put(afp_ni->vn);
1685	ntfs_debug("Done.");
1686	return 0;
1687unl_err:
1688	lck_rw_unlock_exclusive(&afp_ni->lock);
1689	(void)vnode_put(afp_ni->vn);
1690err:
1691	NInoClearDirtyBackupTime(ni);
1692	NInoClearDirtyFinderInfo(ni);
1693	return err;
1694}
1695
1696/**
1697 * ntfs_inode_read - read an inode from its device
1698 * @ni:		ntfs inode to read
1699 *
1700 * ntfs_inode_read() is called from ntfs_inode_get() to read the inode
1701 * described by @ni into memory from the device.
1702 *
1703 * The only fields in @ni that we need to/can look at when the function is
1704 * called are @ni->vol, pointing to the mounted ntfs volume, and @ni->mft_no,
1705 * the number of the inode to load.
1706 *
1707 * ntfs_inode_read() maps, pins and locks the mft record number @ni->mft_no and
1708 * sets up the ntfs inode.
1709 *
1710 * Return 0 on success and errno on error.
1711 */
1712static errno_t ntfs_inode_read(ntfs_inode *ni)
1713{
1714	ntfs_volume *vol = ni->vol;
1715	MFT_RECORD *m;
1716	ntfs_attr_search_ctx *ctx;
1717	ATTR_RECORD *a;
1718	STANDARD_INFORMATION *si;
1719	errno_t err;
1720
1721	ntfs_debug("Entering for mft_no 0x%llx.",
1722			(unsigned long long)ni->mft_no);
1723	err = ntfs_mft_record_map(ni, &m);
1724	if (err) {
1725		ntfs_error(vol->mp, "Failed to map mft record.");
1726		m = NULL;
1727		ctx = NULL;
1728		goto err;
1729	}
1730	ctx = ntfs_attr_search_ctx_get(ni, m);
1731	if (!ctx) {
1732		ntfs_error(vol->mp, "Failed to get attribute search context.");
1733		err = ENOMEM;
1734		goto err;
1735	}
1736	if (!(m->flags & MFT_RECORD_IN_USE)) {
1737		ntfs_error(vol->mp, "Inode is not in use.");
1738		err = ENOENT;
1739		goto err;
1740	}
1741	if (m->base_mft_record) {
1742		ntfs_error(vol->mp, "Inode is an extent inode.");
1743		err = ENOENT;
1744		goto err;
1745	}
1746	/* Cache information from mft record in ntfs inode. */
1747	ni->seq_no = le16_to_cpu(m->sequence_number);
1748	/*
1749	 * FIXME: Keep in mind that link_count is two for files which have both
1750	 * a long filename and a short filename as separate entries, so if
1751	 * we are hiding short filenames this will be too high.  Either we
1752	 * need to account for the short filenames by subtracting them or we
1753	 * need to make sure we delete files even though the number of links
1754	 * is not zero which might be tricky due to vfs interactions.  Need to
1755	 * think about this some more when implementing the unlink call.
1756	 */
1757	ni->link_count = le16_to_cpu(m->link_count);
1758	if (!ni->link_count) {
1759		ntfs_error(vol->mp, "Inode had been deleted.");
1760		err = ENOENT;
1761		goto err;
1762	}
1763	/* Everyone gets all permissions. */
1764	ni->mode |= ACCESSPERMS;
1765	/*
1766	 * FIXME: Reparse points can have the directory bit set even though
1767	 * they should really be S_IFLNK.  For now we do not support reparse
1768	 * points so this does not matter.
1769	 */
1770	if (m->flags & MFT_RECORD_IS_DIRECTORY) {
1771		ni->mode |= S_IFDIR;
1772		/*
1773		 * Apply the directory permissions mask set in the mount
1774		 * options.
1775		 */
1776		ni->mode &= ~vol->dmask;
1777	} else {
1778		/*
1779		 * We set S_IFREG and apply the permissions mask for files even
1780		 * though it could be a symbolic link, socket, fifo, or block
1781		 * or character device special file for example.
1782		 *
1783		 * We will update the mode if/when we determine that this inode
1784		 * is not a regular file.
1785		 */
1786		ni->mode |= S_IFREG;
1787		/* Apply the file permissions mask set in the mount options. */
1788		ni->mode &= ~vol->fmask;
1789	}
1790	/*
1791	 * Find the standard information attribute in the mft record.  At this
1792	 * stage we have not setup the attribute list stuff yet, so this could
1793	 * in fact fail if the standard information is in an extent record, but
1794	 * this is not allowed hence not a problem.
1795	 */
1796	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, AT_UNNAMED, 0, 0, NULL,
1797			0, ctx);
1798	a = ctx->a;
1799	if (err || a->non_resident || a->flags) {
1800		if (err) {
1801			if (err == ENOENT) {
1802				/*
1803				 * TODO: We should be performing a hot fix here
1804				 * (if the recover mount option is set) by
1805				 * creating a new attribute.
1806				 */
1807				ntfs_error(vol->mp, "Standard information "
1808						"attribute is missing.");
1809			} else
1810				ntfs_error(vol->mp, "Failed to lookup "
1811						"standard information "
1812						"attribute.");
1813		} else {
1814info_err:
1815			ntfs_error(vol->mp, "Standard information attribute "
1816					"is corrupt.");
1817		}
1818		goto err;
1819	}
1820	si = (STANDARD_INFORMATION*)((u8*)a + le16_to_cpu(a->value_offset));
1821	/* Some bounds checks. */
1822	if ((u8*)si < (u8*)a || (u8*)si + le32_to_cpu(a->value_length) >
1823			(u8*)a + le32_to_cpu(a->length) ||
1824			(u8*)a + le32_to_cpu(a->length) > (u8*)ctx->m +
1825			vol->mft_record_size)
1826		goto info_err;
1827	/* Cache the file attributes in the ntfs inode. */
1828	ni->file_attributes = si->file_attributes;
1829	/*
1830	 * Cache the create, the last data and mft modified, and the last
1831	 * access times in the ntfs inode.
1832	 */
1833	ni->creation_time = ntfs2utc(si->creation_time);
1834	ni->last_data_change_time = ntfs2utc(si->last_data_change_time);
1835	ni->last_mft_change_time = ntfs2utc(si->last_mft_change_time);
1836	ni->last_access_time = ntfs2utc(si->last_access_time);
1837	/* Find the attribute list attribute if present. */
1838	ntfs_attr_search_ctx_reinit(ctx);
1839	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, AT_UNNAMED, 0, 0, NULL, 0,
1840			ctx);
1841	a = ctx->a;
1842	if (err) {
1843		if (err != ENOENT) {
1844			ntfs_error(vol->mp, "Failed to lookup attribute list "
1845					"attribute.");
1846			goto err;
1847		}
1848	} else /* if (!err) */ {
1849		ntfs_debug("Attribute list found in inode 0x%llx.",
1850				(unsigned long long)ni->mft_no);
1851		NInoSetAttrList(ni);
1852		if (a->flags & ATTR_COMPRESSION_MASK) {
1853			ntfs_error(vol->mp, "Attribute list attribute is "
1854					"compressed.  Not allowed.");
1855			goto err;
1856		}
1857		if (a->flags & (ATTR_IS_ENCRYPTED | ATTR_IS_SPARSE)) {
1858			if (a->non_resident) {
1859				ntfs_error(vol->mp, "Non-resident attribute "
1860						"list attribute is encrypted/"
1861						"sparse.  Not allowed.");
1862				goto err;
1863			}
1864			ntfs_warning(vol->mp, "Resident attribute list "
1865					"attribute is marked encrypted/sparse "
1866					"which is not true.  However, Windows "
1867					"allows this and chkdsk does not "
1868					"detect or correct it so we will just "
1869					"ignore the invalid flags and pretend "
1870					"they are not set.");
1871		}
1872		/* Now allocate memory for the attribute list. */
1873		ni->attr_list_size = (u32)ntfs_attr_size(a);
1874		ni->attr_list_alloc = (ni->attr_list_size + NTFS_ALLOC_BLOCK -
1875				1) & ~(NTFS_ALLOC_BLOCK - 1);
1876		ni->attr_list = OSMalloc(ni->attr_list_alloc, ntfs_malloc_tag);
1877		if (!ni->attr_list) {
1878			ni->attr_list_alloc = 0;
1879			ntfs_error(vol->mp, "Not enough memory to allocate "
1880					"buffer for attribute list.");
1881			err = ENOMEM;
1882			goto err;
1883		}
1884		if (a->non_resident) {
1885			NInoSetAttrListNonResident(ni);
1886			if (a->lowest_vcn) {
1887				ntfs_error(vol->mp, "Attribute list has non "
1888						"zero lowest_vcn.");
1889				goto err;
1890			}
1891			/*
1892			 * Setup the runlist.  No need for locking as we have
1893			 * exclusive access to the inode at this time.
1894			 */
1895			err = ntfs_mapping_pairs_decompress(vol, a,
1896					&ni->attr_list_rl);
1897			if (err) {
1898				ntfs_error(vol->mp, "Mapping pairs "
1899						"decompression failed.");
1900				goto err;
1901			}
1902			/* Now load the attribute list. */
1903			err = ntfs_rl_read(vol, &ni->attr_list_rl,
1904					ni->attr_list, ni->attr_list_size,
1905					sle64_to_cpu(a->initialized_size));
1906			if (err) {
1907				ntfs_error(vol->mp, "Failed to load attribute "
1908						"list attribute.");
1909				goto err;
1910			}
1911		} else /* if (!a->non_resident) */ {
1912			u8 *a_end, *al;
1913			u32 al_len;
1914
1915			a_end = (u8*)a + le32_to_cpu(a->length);
1916			al = (u8*)a + le16_to_cpu(a->value_offset);
1917			al_len = le32_to_cpu(a->value_length);
1918			if (al < (u8*)a || al + al_len > a_end || (u8*)a_end >
1919					(u8*)ctx->m + vol->mft_record_size) {
1920				ntfs_error(vol->mp, "Resident attribute list "
1921						"attribute is corrupt.");
1922				goto err;
1923			}
1924			/* Now copy the attribute list attribute. */
1925			memcpy(ni->attr_list, al, al_len);
1926		}
1927	}
1928	/*
1929	 * If an attribute list is present we now have the attribute list value
1930	 * in @ni->attr_list and it is @ni->attr_list_size bytes in size.
1931	 */
1932	if (S_ISDIR(ni->mode)) {
1933		/* It is a directory. */
1934		NInoSetMstProtected(ni);
1935		ni->type = AT_INDEX_ALLOCATION;
1936		ni->name = I30;
1937		ni->name_len = 4;
1938		ni->vcn_size = 0;
1939		ni->collation_rule = 0;
1940		ni->vcn_size_shift = 0;
1941	} else {
1942		/* It is a file. */
1943		ntfs_attr_search_ctx_reinit(ctx);
1944		/* Setup the data attribute, even if not present. */
1945		ni->type = AT_DATA;
1946		ni->name = NULL;
1947		ni->name_len = 0;
1948		/* Find first extent of the unnamed data attribute. */
1949		err = ntfs_attr_lookup(AT_DATA, AT_UNNAMED, 0, 0, NULL, 0, ctx);
1950		if (err) {
1951			BOOL is_system;
1952
1953			ni->allocated_size = ni->data_size =
1954					ni->initialized_size = 0;
1955			if (err != ENOENT) {
1956				ntfs_error(vol->mp, "Failed to lookup data "
1957						"attribute.");
1958				goto err;
1959			}
1960			/*
1961			 * FILE_Secure does not have an unnamed data attribute,
1962			 * so we special case it here.
1963			 */
1964			if (ni->mft_no == FILE_Secure)
1965				goto no_data_attr_special_case;
1966			/*
1967			 * Most if not all the system files in the $Extend
1968			 * system directory do not have unnamed data
1969			 * attributes so we need to check if the parent
1970			 * directory of the file is FILE_Extend and if it is
1971			 * ignore this error.  To do this we need to get the
1972			 * name of this inode from the mft record as the name
1973			 * contains the back reference to the parent directory.
1974			 */
1975			err = ntfs_inode_is_extended_system(ctx, &is_system);
1976			if (!err && is_system)
1977				goto no_data_attr_special_case;
1978			// FIXME: File is corrupt! Hot-fix with empty data
1979			// attribute if recovery option is set.
1980			ntfs_error(vol->mp, "Data attribute is missing.");
1981			goto err;
1982		}
1983		a = ctx->a;
1984		/* Setup the state. */
1985		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
1986			if (a->flags & ATTR_COMPRESSION_MASK) {
1987				NInoSetCompressed(ni);
1988				if (!NVolCompressionEnabled(vol)) {
1989					ntfs_error(vol->mp, "Found compressed "
1990							"data but compression "
1991							"is disabled on this "
1992							"volume and/or mount.");
1993					goto err;
1994				}
1995				if ((a->flags & ATTR_COMPRESSION_MASK)
1996						!= ATTR_IS_COMPRESSED) {
1997					ntfs_error(vol->mp, "Found unknown "
1998							"compression method "
1999							"or corrupt file.");
2000					goto err;
2001				}
2002			}
2003			if (a->flags & ATTR_IS_SPARSE)
2004				NInoSetSparse(ni);
2005		}
2006		if (a->flags & ATTR_IS_ENCRYPTED) {
2007			if (NInoCompressed(ni)) {
2008				ntfs_error(vol->mp, "Found encrypted and "
2009						"compressed data.");
2010				goto err;
2011			}
2012			NInoSetEncrypted(ni);
2013		}
2014		if (a->non_resident) {
2015			NInoSetNonResident(ni);
2016			if (NInoCompressed(ni) || NInoSparse(ni)) {
2017				if (NInoCompressed(ni) &&
2018						a->compression_unit !=
2019						NTFS_COMPRESSION_UNIT) {
2020					ntfs_error(vol->mp, "Found "
2021							"non-standard "
2022							"compression unit (%d "
2023							"instead of %d).  "
2024							"Cannot handle this.",
2025							a->compression_unit,
2026							NTFS_COMPRESSION_UNIT);
2027					err = ENOTSUP;
2028					goto err;
2029				}
2030				if (!NInoCompressed(ni) &&
2031						a->compression_unit != 0 &&
2032						a->compression_unit !=
2033						NTFS_COMPRESSION_UNIT) {
2034					ntfs_error(vol->mp, "Found "
2035							"non-standard "
2036							"compression unit (%d "
2037							"instead of 0 or %d).  "
2038							"Cannot handle this.",
2039							a->compression_unit,
2040							NTFS_COMPRESSION_UNIT);
2041					err = ENOTSUP;
2042					goto err;
2043				}
2044				if (a->compression_unit) {
2045					ni->compression_block_clusters = 1U <<
2046							a->compression_unit;
2047					ni->compression_block_size = 1U << (
2048							a->compression_unit +
2049							vol->
2050							cluster_size_shift);
2051					ni->compression_block_size_shift = ffs(
2052							ni->
2053							compression_block_size)
2054							- 1;
2055				} else {
2056					ni->compression_block_clusters = 0;
2057					ni->compression_block_size = 0;
2058					ni->compression_block_size_shift = 0;
2059				}
2060				ni->compressed_size = sle64_to_cpu(
2061						a->compressed_size);
2062			}
2063			if (a->lowest_vcn) {
2064				ntfs_error(vol->mp, "First extent of data "
2065						"attribute has non-zero "
2066						"lowest_vcn.");
2067				goto err;
2068			}
2069			ni->allocated_size = sle64_to_cpu(a->allocated_size);
2070			ni->data_size = sle64_to_cpu(a->data_size);
2071			ni->initialized_size = sle64_to_cpu(
2072					a->initialized_size);
2073		} else { /* Resident attribute. */
2074			u8 *a_end, *data;
2075			u32 data_len;
2076
2077			a_end = (u8*)a + le32_to_cpu(a->length);
2078			data = (u8*)a + le16_to_cpu(a->value_offset);
2079			data_len = le32_to_cpu(a->value_length);
2080			if (data < (u8*)a || data + data_len > a_end ||
2081					(u8*)a_end > (u8*)ctx->m +
2082					vol->mft_record_size) {
2083				ntfs_error(vol->mp, "Resident data attribute "
2084						"is corrupt.");
2085				goto err;
2086			}
2087			ni->allocated_size = a_end - data;
2088			ni->data_size = ni->initialized_size = data_len;
2089			/*
2090			 * On Services for Unix on Windows, a fifo is a system
2091			 * file with a zero-length $DATA attribute whilst a
2092			 * socket is a system file with a $DATA attribute of
2093			 * length 1.  Block and character device special files
2094			 * in turn are system files containing an INTX_FILE
2095			 * structure.
2096			 */
2097			if (ni->file_attributes & FILE_ATTR_SYSTEM) {
2098				INTX_FILE *ix;
2099
2100				ix = (INTX_FILE*)data;
2101				if (!ni->data_size) {
2102					ni->mode &= ~S_IFREG;
2103					ni->mode |= S_IFIFO;
2104				} else if (ni->data_size == 1) {
2105					ni->mode &= ~S_IFREG;
2106					ni->mode |= S_IFSOCK;
2107				} else if (data_len == offsetof(INTX_FILE,
2108						device) + sizeof(ix->device) &&
2109						(ix->magic ==
2110						INTX_BLOCK_DEVICE ||
2111						ix->magic ==
2112						INTX_CHAR_DEVICE)) {
2113					ni->mode &= ~S_IFREG;
2114					if (ix->magic == INTX_BLOCK_DEVICE)
2115						ni->mode |= S_IFBLK;
2116					else
2117						ni->mode |= S_IFCHR;
2118					ni->rdev = makedev(le64_to_cpu(
2119							ix->device.major),
2120							le64_to_cpu(
2121							ix->device.minor));
2122				}
2123			}
2124		}
2125	}
2126no_data_attr_special_case:
2127	/*
2128	 * Check if there is an AFP_AfpInfo named stream.
2129	 *
2130	 * FIXME: Note we do not bother if the inode is encrypted as we would
2131	 * not be able to understand its contents anyway.  We need to implement
2132	 * this once we support encryption.  For now we pretend the AFP_AfpInfo
2133	 * stream does not exist to make everything smooth going.
2134	 */
2135	if (NInoEncrypted(ni)) {
2136		ntfs_inode_afpinfo_cache(ni, NULL, 0);
2137		goto done;
2138	}
2139	ntfs_attr_search_ctx_reinit(ctx);
2140	err = ntfs_attr_lookup(AT_DATA, NTFS_SFM_AFPINFO_NAME, 11, 0, NULL, 0,
2141			ctx);
2142	if (err) {
2143		if (err != ENOENT) {
2144			ntfs_error(vol->mp, "Failed to lookup AfpInfo "
2145					"attribute (error %d).", err);
2146			goto err;
2147		}
2148		/* The AFP_AfpInfo attribute does not exist. */
2149		ntfs_inode_afpinfo_cache(ni, NULL, 0);
2150	} else {
2151		s64 ai_size;
2152		ntfs_runlist ai_runlist;
2153		AFPINFO ai;
2154
2155		/* The found $DATA/AFP_AfpInfo attribute is now in @ctx->a. */
2156		a = ctx->a;
2157		/*
2158		 * If the attribute is resident (as it usually will be) we have
2159		 * the data at hand so copy the backup time and Finder info
2160		 * into the ntfs_inode.
2161		 */
2162		if (!a->non_resident) {
2163			u8 *a_end, *val;
2164			unsigned val_len;
2165
2166			a_end = (u8*)a + le32_to_cpu(a->length);
2167			val = (u8*)a + le16_to_cpu(a->value_offset);
2168			val_len = le32_to_cpu(a->value_length);
2169			if (val < (u8*)a || val + val_len > a_end ||
2170					(u8*)a_end >
2171					(u8*)ctx->m + vol->mft_record_size ||
2172					a->flags & ATTR_IS_ENCRYPTED) {
2173				ntfs_error(vol->mp, "Resident AfpInfo "
2174						"attribute is corrupt.");
2175				goto err;
2176			}
2177			ntfs_inode_afpinfo_cache(ni, (AFPINFO*)val, val_len);
2178			goto done;
2179		}
2180		ai_size = sle64_to_cpu(a->data_size);
2181		if (a->lowest_vcn ||
2182				sle64_to_cpu(a->initialized_size) > ai_size ||
2183				ai_size > sle64_to_cpu(a->allocated_size)) {
2184			ntfs_error(vol->mp, "AfpInfo attribute is corrupt.");
2185			goto err;
2186		}
2187		/*
2188		 * The attribute is non-resident.  If this is a regular file
2189		 * inode and its data size is less than or equal to
2190		 * MAXPATHLEN it could actually be a symbolic link.  In this
2191		 * case we need to read the AFP_AfpInfo attribute now.
2192		 * Otherwise postpone it till later when it is actually needed.
2193		 *
2194		 * We read it in by hand as it will likely not be modified so
2195		 * no point in wasting system resources by instantiating an
2196		 * attribute inode for it.  Also we do not have a vnode for the
2197		 * base inode yet thus cannot obtain an attribute inode at this
2198		 * point in time even if we wanted to.
2199		 */
2200		if (!S_ISREG(ni->mode) || ni->data_size > MAXPATHLEN)
2201			goto done;
2202		/*
2203		 * We only need the AFPINFO structure so ignore any further
2204		 * data there may be.
2205		 */
2206		if (ai_size > (s64)sizeof(AFPINFO))
2207			ai_size = sizeof(AFPINFO);
2208		/*
2209		 * If the attribute is compressed (which it should never be as
2210		 * Windows only compresses the unnamed $DATA attribute) we
2211		 * cannot read it here so bail out.
2212		 */
2213		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED)) {
2214			if (a->flags & ATTR_COMPRESSION_MASK)
2215				ntfs_warning(vol->mp, "AfpInfo is compressed, "
2216						"ignoring it.  %s",
2217						ntfs_please_email);
2218			ntfs_inode_afpinfo_cache(ni, NULL, 0);
2219			goto done;
2220		}
2221		/*
2222		 * Setup the runlist.  No need for locking as we have exclusive
2223		 * access to the inode at this time.
2224		 */
2225		ai_runlist.rl = NULL;
2226		ai_runlist.alloc = ai_runlist.elements = 0;
2227		err = ntfs_mapping_pairs_decompress(vol, a, &ai_runlist);
2228		if (err) {
2229			ntfs_error(vol->mp, "Mapping pairs decompression "
2230					"failed for AfpInfo (error %d).", err);
2231			goto err;
2232		}
2233		/* Now load the attribute data. */
2234		err = ntfs_rl_read(vol, &ai_runlist, (u8*)&ai, ai_size,
2235				sle64_to_cpu(a->initialized_size));
2236		if (err) {
2237			ntfs_error(vol->mp, "Failed to load AfpInfo (error "
2238					"%d).", err);
2239			OSFree(ai_runlist.rl, ai_runlist.alloc,
2240					ntfs_malloc_tag);
2241			goto err;
2242		}
2243		/* We do not need the runlist any more so free it. */
2244		OSFree(ai_runlist.rl, ai_runlist.alloc, ntfs_malloc_tag);
2245		/* Finally cache the AFP_AfpInfo data in the base inode. */
2246		ntfs_inode_afpinfo_cache(ni, &ai, ai_size);
2247	}
2248done:
2249	/*
2250	 * If it is a regular file and the data size is less than or equal to
2251	 * MAXPATHLEN it could be a symbolic link so check for this case here.
2252	 */
2253	if (S_ISREG(ni->mode) && ni->data_size <= MAXPATHLEN) {
2254		if (!NInoValidFinderInfo(ni))
2255			panic("%s(): !NInoValidFinderInfo(ni)\n",
2256					__FUNCTION__);
2257		if (ni->finder_info.type == FINDER_TYPE_SYMBOLIC_LINK &&
2258				ni->finder_info.creator ==
2259				FINDER_CREATOR_SYMBOLIC_LINK) {
2260			/*
2261			 * FIXME: At present the kernel does not allow VLNK
2262			 * vnodes to use the UBC (<rdar://problem/5794900>)
2263			 * thus we need to use a shadow VREG vnode to do the
2264			 * actual read of the symbolic link data.  Fortunately
2265			 * we already implemented this functionality for
2266			 * compressed files where we need to read the
2267			 * compressed data using a shadow vnode so we use the
2268			 * same implementation here, thus our shadow vnode is a
2269			 * raw inode.
2270			 *
2271			 * Doing this has the unfortunate consequence that if
2272			 * the symbolic link inode is compressed or encrypted
2273			 * we cannot read it as we are already using the raw
2274			 * inode and we can only have one raw inode.  Thus if
2275			 * the inode is non-resident and compressed or
2276			 * encrypted we do not change the mode to S_IFLNK thus
2277			 * causing the symbolic link to appear as a regular
2278			 * file instead of a symbolic link.
2279			 */
2280			if (NInoNonResident(ni) && (NInoCompressed(ni) ||
2281					NInoEncrypted(ni)))
2282				ntfs_warning(vol->mp, "Treating %s symbolic "
2283						"link mft_no 0x%llx as a "
2284						"regular file due to "
2285						"<rdar://problem/5794900>.",
2286						NInoCompressed(ni) ?
2287						"compressed" : "encrypted",
2288						(unsigned long long)
2289						ni->mft_no);
2290			else {
2291				/*
2292				 * Change the mode to indicate this is a
2293				 * symbolic link and not a regular file.
2294				 *
2295				 * Also, symbolic links always grant all
2296				 * permissions as the real permissions checking
2297				 * is done after the symbolic link is resolved.
2298				 */
2299				ni->mode = S_IFLNK | ACCESSPERMS;
2300			}
2301		}
2302	}
2303	ntfs_attr_search_ctx_put(ctx);
2304	ntfs_mft_record_unmap(ni);
2305	ntfs_debug("Done.");
2306	return 0;
2307err:
2308	if (ctx)
2309		ntfs_attr_search_ctx_put(ctx);
2310	if (m)
2311		ntfs_mft_record_unmap(ni);
2312	if (!err)
2313		err = EIO;
2314	ntfs_error(vol->mp, "Failed (error %d) for inode 0x%llx.  Run chkdsk.",
2315			(int)err, (unsigned long long)ni->mft_no);
2316	if (err != ENOTSUP && err != ENOMEM)
2317		NVolSetErrors(vol);
2318	return err;
2319}
2320
2321/**
2322 * ntfs_attr_inode_read_or_create - read an attribute inode from its base inode
2323 * @base_ni:	base inode if @ni is not raw and non-raw inode of @ni otherwise
2324 * @ni:		attribute inode to read
2325 * @options:	options specifying the read and/or create behaviour
2326 *
2327 * ntfs_attr_inode_read_or_create() is called from
2328 * ntfs_attr_inode_get_or_create() to read the attribute inode described by @ni
2329 * into memory from the base mft record described by @base_ni possibly creating
2330 * the attribute first.
2331 *
2332 * If @ni is a raw inode @base_ni is the non-raw inode to which @ni belongs
2333 * rather than the base inode.
2334 *
 * If @options specifies neither XATTR_CREATE nor XATTR_REPLACE the attribute
 * will be created if it does not exist already and then will be opened.
2337 *
2338 * If @options specifies XATTR_CREATE the call will fail if the attribute
2339 * already exists, i.e. the existing attribute will not be opened.
2340 *
2341 * If @options specifies XATTR_REPLACE the call will fail if the attribute does
2342 * not exist, i.e. the new attribute will not be created, i.e. this is the
2343 * equivalent of ntfs_attr_inode_get().
2344 *
 * A special case is the resource fork, i.e. when @ni->name is
 * NTFS_SFM_RESOURCEFORK_NAME.  If it exists but has zero size it is treated as
 * if it does not exist when handling the XATTR_CREATE and XATTR_REPLACE flags
 * in @options.  Thus if the resource fork exists but is zero size, a call with
 * XATTR_CREATE set in @options will succeed as if it did not already exist and
 * a call with XATTR_REPLACE set in @options will fail as if it did not already
 * exist.
2351 *
2352 * ntfs_attr_inode_read_or_create() maps, pins and locks the base mft record
2353 * and looks up the attribute described by @ni before setting up the ntfs
2354 * inode.  If it is not found and creation is desired, a new attribute is
2355 * inserted into the mft record.
2356 *
2357 * Return 0 on success and errno on error.
2358 *
2359 * Note ntfs_attr_inode_read_or_create() cannot be called for
2360 * AT_INDEX_ALLOCATION, call ntfs_index_inode_read() instead.
2361 */
2362static errno_t ntfs_attr_inode_read_or_create(ntfs_inode *base_ni,
2363		ntfs_inode *ni, const int options)
2364{
2365	ntfs_volume *vol = ni->vol;
2366	MFT_RECORD *m;
2367	ntfs_attr_search_ctx *ctx;
2368	ATTR_RECORD *a;
2369	errno_t err;
2370
2371	ntfs_debug("Entering for mft_no 0x%llx, attribute type 0x%x, "
2372			"attribute name length 0x%x.",
2373			(unsigned long long)ni->mft_no,
2374			(unsigned)le32_to_cpu(ni->type),
2375			(unsigned)ni->name_len);
2376	if (!NInoAttr(ni))
2377		panic("%s(): !NInoAttr(ni)\n", __FUNCTION__);
2378	/* Mirror the values from the base inode. */
2379	ni->seq_no = base_ni->seq_no;
2380	ni->uid	= base_ni->uid;
2381	ni->gid	= base_ni->gid;
2382	/* Attributes cannot be hard-linked so link count is always 1. */
2383	ni->link_count = 1;
2384	/* Set inode type to zero but preserve permissions. */
2385	ni->mode = base_ni->mode & ~S_IFMT;
2386	/*
2387	 * If this is our special case of loading the secondary inode for
2388	 * accessing the raw data of compressed files or symbolic links, we can
2389	 * simply copy the relevant fields from the base inode rather than
2390	 * mapping the mft record and looking up the data attribute again.
2391	 */
2392	if (NInoRaw(ni)) {
2393		if (NInoCompressed(base_ni))
2394			NInoSetCompressed(ni);
2395		if (NInoSparse(base_ni))
2396			NInoSetSparse(ni);
2397		if (NInoEncrypted(base_ni))
2398			NInoSetEncrypted(ni);
2399		if (NInoNonResident(base_ni))
2400			NInoSetNonResident(ni);
2401		lck_spin_lock(&base_ni->size_lock);
2402		if (NInoCompressed(base_ni) || NInoSparse(base_ni)) {
2403			ni->compression_block_clusters =
2404					base_ni->compression_block_clusters;
2405			ni->compression_block_size =
2406					base_ni->compression_block_size;
2407			ni->compression_block_size_shift =
2408					base_ni->compression_block_size_shift;
2409			ni->compressed_size = base_ni->compressed_size;
2410		}
2411		/*
2412		 * For symbolic links we need the real sizes.  For compressed
2413		 * and encrypted files we need all values to be the same and
2414		 * equal to the allocated size so we can access the entirety of
2415		 * the compressed/encrypted data.
2416		 *
2417		 * FIXME: The symbolic link case is done this way because we
2418		 * cannot use the UBC for VLNK vnodes so we use a raw inode
2419		 * which has a VREG vnode to do the actual disk i/o (see
2420		 * <rdar://problem/5794900>).
2421		 */
2422		if (S_ISLNK(base_ni->mode)) {
2423			ni->allocated_size = base_ni->allocated_size;
2424			ni->data_size = base_ni->data_size;
2425			ni->initialized_size = base_ni->initialized_size;
2426		} else {
2427			ni->initialized_size = ni->data_size =
2428					ni->allocated_size =
2429					base_ni->allocated_size;
2430		}
2431		lck_spin_unlock(&base_ni->size_lock);
2432		if (NInoAttr(base_ni)) {
2433			/* Set @base_ni to point to the real base inode. */
2434			if (base_ni->nr_extents != -1)
2435				panic("%s(): Called for non-raw attribute "
2436						"inode which does not have a "
2437						"base inode.", __FUNCTION__);
2438			base_ni = base_ni->base_ni;
2439		}
2440		goto done;
2441	}
2442	/*
2443	 * We are looking for a real attribute.
2444	 *
2445	 * Map the mft record for the base inode.
2446	 */
2447	err = ntfs_mft_record_map(base_ni, &m);
2448	if (err) {
2449		ntfs_error(vol->mp, "Failed to map base mft record.");
2450		m = NULL;
2451		ctx = NULL;
2452		goto err;
2453	}
2454	ctx = ntfs_attr_search_ctx_get(base_ni, m);
2455	if (!ctx) {
2456		ntfs_error(vol->mp, "Failed to get attribute search context.");
2457		err = ENOMEM;
2458		goto err;
2459	}
2460	/* Find the attribute. */
2461	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
2462			ctx);
2463	a = ctx->a;
2464	if (err) {
2465		if (err != ENOENT) {
2466			ntfs_error(vol->mp, "Failed to lookup attribute "
2467					"(error %d).", err);
2468			goto err;
2469		}
2470		/*
2471		 * The attribute does not exist.  If @options specifies
2472		 * XATTR_REPLACE do not allow it to be created.
2473		 */
2474		if (options & XATTR_REPLACE) {
2475			ntfs_debug("Attribute in mft_no 0x%llx does not "
2476					"exist, returning ENOENT.",
2477					(unsigned long long)ni->mft_no);
2478			err = ENOENT;
2479			goto err;
2480		}
2481		ntfs_debug("Attribute does not exist, creating it.");
2482		/*
2483		 * FIXME: Cannot create attribute if it has to be non-resident.
2484		 * With present code this will never happen so no point in
2485		 * coding it until it is needed.
2486		 */
2487		if (ntfs_attr_can_be_resident(vol, ni->type)) {
2488			ntfs_warning(vol->mp, "Attribute type 0x%x cannot be "
2489					"resident.  Cannot create "
2490					"non-resident attributes yet.",
2491					le32_to_cpu(ni->type));
2492			err = ENOTSUP;
2493			goto err;
2494		}
2495		/*
2496		 * Create a new resident attribute.  @a now points to the
2497		 * location in the mft record at which we need to insert the
2498		 * attribute so insert it now.
2499		 */
2500		err = ntfs_resident_attr_record_insert(base_ni, ctx, ni->type,
2501				ni->name, ni->name_len, NULL, 0);
2502		if (err || ctx->is_error) {
2503			if (!err)
2504				err = ctx->error;
2505			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx "
2506					"(error %d).", ctx->is_error ?
2507					"remap extent mft record of" :
2508					"add resident attribute to",
2509					(unsigned long long)ni->mft_no, err);
2510			goto err;
2511		}
2512		a = ctx->a;
2513		ni->allocated_size = le32_to_cpu(a->length) -
2514				le16_to_cpu(a->value_offset);
2515		ni->initialized_size = ni->data_size =
2516				le32_to_cpu(a->value_length);
2517		/*
2518		 * Ensure the mft record containing the new attribute gets
2519		 * written out.
2520		 */
2521		NInoSetMrecNeedsDirtying(ctx->ni);
2522		/*
2523		 * Update the last_mft_change_time (ctime) in the inode as
2524		 * named stream/extended attribute semantics expect on OS X.
2525		 */
2526		base_ni->last_mft_change_time = ntfs_utc_current_time();
2527		NInoSetDirtyTimes(base_ni);
2528		/*
2529		 * If this is not a directory or it is an encrypted directory,
2530		 * set the needs archiving bit except for the core system
2531		 * files.
2532		 */
2533		if (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni)) {
2534			BOOL need_set_archive_bit = TRUE;
2535			if (vol->major_ver >= 2) {
2536				if (base_ni->mft_no <= FILE_Extend)
2537					need_set_archive_bit = FALSE;
2538			} else {
2539				if (base_ni->mft_no <= FILE_UpCase)
2540					need_set_archive_bit = FALSE;
2541			}
2542			if (need_set_archive_bit) {
2543				base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
2544				NInoSetDirtyFileAttributes(base_ni);
2545			}
2546		}
2547		goto put_done;
2548	}
2549	/*
2550	 * The attribute already exists.
2551	 *
2552	 * If it is the empty resource fork we need to fail if @options
2553	 * specifies XATTR_REPLACE.
2554	 *
2555	 * If @options specifies XATTR_CREATE we need to abort unless this is
2556	 * the resource fork and it is empty.
2557	 */
2558	if (ni->name == NTFS_SFM_RESOURCEFORK_NAME && !a->value_length) {
2559		if (options & XATTR_REPLACE) {
2560			ntfs_debug("Attribute mft_no 0x%llx does not exist, "
2561					"returning ENOENT.",
2562					(unsigned long long)ni->mft_no);
2563			err = ENOENT;
2564			goto err;
2565		}
2566	} else if (options & XATTR_CREATE) {
2567		ntfs_debug("Attribute mft_no 0x%llx already exists, returning "
2568				"EEXIST.", (unsigned long long)ni->mft_no);
2569		err = EEXIST;
2570		goto err;
2571	}
2572	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
2573		if (a->flags & ATTR_COMPRESSION_MASK) {
2574			NInoSetCompressed(ni);
2575			if (ni->type != AT_DATA) {
2576				ntfs_error(vol->mp, "Found compressed "
2577						"non-data attribute.  Please "
2578						"report you saw this message "
2579						"to %s.", ntfs_dev_email);
2580				goto err;
2581			}
2582			if (!NVolCompressionEnabled(vol)) {
2583				ntfs_error(vol->mp, "Found compressed data "
2584						"but compression is disabled "
2585						"on this volume and/or "
2586						"mount.");
2587				goto err;
2588			}
2589			if ((a->flags & ATTR_COMPRESSION_MASK) !=
2590					ATTR_IS_COMPRESSED) {
2591				ntfs_error(vol->mp, "Found unknown "
2592						"compression method or "
2593						"corrupt file.");
2594				goto err;
2595			}
2596		}
2597		if (a->flags & ATTR_IS_SPARSE)
2598			NInoSetSparse(ni);
2599		if (NInoMstProtected(ni)) {
2600			ntfs_error(vol->mp, "Found mst protected attribute "
2601					"but the attribute is %s.  Please "
2602					"report you saw this message to %s.",
2603					NInoCompressed(ni) ?
2604					"compressed" : "sparse",
2605					ntfs_dev_email);
2606			goto err;
2607		}
2608	}
2609	if (a->flags & ATTR_IS_ENCRYPTED) {
2610		if (ni->type != AT_DATA) {
2611			ntfs_error(vol->mp, "Found encrypted non-data "
2612					"attribute.  Please report you saw "
2613					"this message to %s.", ntfs_dev_email);
2614			goto err;
2615		}
2616		if (NInoMstProtected(ni)) {
2617			ntfs_error(vol->mp, "Found mst protected attribute "
2618					"but the attribute is encrypted.  "
2619					"Please report you saw this message "
2620					"to %s.", ntfs_dev_email);
2621			goto err;
2622		}
2623		if (NInoCompressed(ni)) {
2624			ntfs_error(vol->mp, "Found encrypted and compressed "
2625					"data.");
2626			goto err;
2627		}
2628		NInoSetEncrypted(ni);
2629	}
2630	if (!a->non_resident) {
2631		u8 *a_end, *val;
2632		u32 val_len;
2633
2634		/* Ensure the attribute name is placed before the value. */
2635		if (a->name_length && (le16_to_cpu(a->name_offset) >=
2636				le16_to_cpu(a->value_offset))) {
2637			ntfs_error(vol->mp, "Attribute name is placed after "
2638					"the attribute value.");
2639			goto err;
2640		}
2641		if (NInoMstProtected(ni)) {
2642			ntfs_error(vol->mp, "Found mst protected attribute "
2643					"but the attribute is resident.  "
2644					"Please report you saw this message "
2645					"to %s.", ntfs_dev_email);
2646			goto err;
2647		}
2648		a_end = (u8*)a + le32_to_cpu(a->length);
2649		val = (u8*)a + le16_to_cpu(a->value_offset);
2650		val_len = le32_to_cpu(a->value_length);
2651		if (val < (u8*)a || val + val_len > a_end || (u8*)a_end >
2652				(u8*)ctx->m + vol->mft_record_size) {
2653			ntfs_error(vol->mp, "Resident attribute is corrupt.");
2654			goto err;
2655		}
2656		ni->allocated_size = a_end - val;
2657		ni->data_size = ni->initialized_size = val_len;
2658	} else {
2659		NInoSetNonResident(ni);
2660		/*
2661		 * Ensure the attribute name is placed before the mapping pairs
2662		 * array.
2663		 */
2664		if (a->name_length && (le16_to_cpu(a->name_offset) >=
2665				le16_to_cpu(a->mapping_pairs_offset))) {
2666			ntfs_error(vol->mp, "Attribute name is placed after "
2667					"the mapping pairs array.");
2668			goto err;
2669		}
2670		if (NInoCompressed(ni) || NInoSparse(ni)) {
2671			if (NInoCompressed(ni) && a->compression_unit !=
2672					NTFS_COMPRESSION_UNIT) {
2673				ntfs_error(vol->mp, "Found non-standard "
2674						"compression unit (%d instead "
2675						"of %d).  Cannot handle this.",
2676						a->compression_unit,
2677						NTFS_COMPRESSION_UNIT);
2678				err = ENOTSUP;
2679				goto err;
2680			}
2681			if (!NInoCompressed(ni) && a->compression_unit != 0 &&
2682					a->compression_unit !=
2683					NTFS_COMPRESSION_UNIT) {
2684				ntfs_error(vol->mp, "Found non-standard "
2685						"compression unit (%d instead "
2686						"of 0 or %d).  Cannot handle "
2687						"this.", a->compression_unit,
2688						NTFS_COMPRESSION_UNIT);
2689				err = ENOTSUP;
2690				goto err;
2691			}
2692			if (a->compression_unit) {
2693				ni->compression_block_clusters = 1U <<
2694						a->compression_unit;
2695				ni->compression_block_size = 1U << (
2696						a->compression_unit +
2697						vol->cluster_size_shift);
2698				ni->compression_block_size_shift = ffs(
2699						ni->compression_block_size) - 1;
2700			} else {
2701				ni->compression_block_clusters = 0;
2702				ni->compression_block_size = 0;
2703				ni->compression_block_size_shift = 0;
2704			}
2705			ni->compressed_size = sle64_to_cpu(a->compressed_size);
2706		}
2707		if (a->lowest_vcn) {
2708			ntfs_error(vol->mp, "First extent of attribute has "
2709					"non-zero lowest_vcn.");
2710			goto err;
2711		}
2712		ni->allocated_size = sle64_to_cpu(a->allocated_size);
2713		ni->data_size = sle64_to_cpu(a->data_size);
2714		ni->initialized_size = sle64_to_cpu(a->initialized_size);
2715	}
2716put_done:
2717	ntfs_attr_search_ctx_put(ctx);
2718	ntfs_mft_record_unmap(base_ni);
2719done:
2720	/*
2721	 * Attach the base inode to the attribute inode and vice versa.  Note
2722	 * we do not need to lock the new inode as we still have exclusive
2723	 * access to it.
2724	 */
2725	lck_mtx_lock(&base_ni->attr_nis_lock);
2726	if (NInoDeleted(base_ni)) {
2727		lck_mtx_unlock(&base_ni->attr_nis_lock);
2728		return EDEADLK;
2729	}
2730	if ((base_ni->nr_attr_nis + 1) * sizeof(ntfs_inode *) >
2731			base_ni->attr_nis_alloc) {
2732		ntfs_inode **tmp;
2733		int new_size;
2734
2735		new_size = base_ni->attr_nis_alloc + 4 * sizeof(ntfs_inode *);
2736		tmp = OSMalloc(new_size, ntfs_malloc_tag);
2737		if (!tmp) {
2738			ntfs_error(vol->mp, "Failed to allocated internal "
2739					"buffer.");
2740			lck_mtx_unlock(&base_ni->attr_nis_lock);
2741			return ENOMEM;
2742		}
2743		if (base_ni->attr_nis_alloc) {
2744			if (base_ni->nr_attr_nis > 0)
2745				memcpy(tmp, base_ni->attr_nis,
2746						base_ni->nr_attr_nis *
2747						sizeof(ntfs_inode *));
2748			OSFree(base_ni->attr_nis, base_ni->attr_nis_alloc,
2749					ntfs_malloc_tag);
2750		}
2751		base_ni->attr_nis_alloc = new_size;
2752		base_ni->attr_nis = tmp;
2753	}
2754	base_ni->attr_nis[base_ni->nr_attr_nis++] = ni;
2755	ni->nr_extents = -1;
2756	ni->base_ni = base_ni;
2757	ni->base_attr_nis_lock = &base_ni->attr_nis_lock;
2758	lck_mtx_unlock(&base_ni->attr_nis_lock);
2759	ntfs_debug("Done.");
2760	return 0;
2761err:
2762	if (ctx)
2763		ntfs_attr_search_ctx_put(ctx);
2764	if (m)
2765		ntfs_mft_record_unmap(base_ni);
2766	if (!err)
2767		err = EIO;
2768	if (err != ENOENT) {
2769		ntfs_error(vol->mp, "Failed (error %d) for attribute inode "
2770				"0x%llx, attribute type 0x%x, name_len 0x%x.  "
2771				"Run chkdsk.", (int)err,
2772				(unsigned long long)ni->mft_no,
2773				(unsigned)le32_to_cpu(ni->type),
2774				(unsigned)ni->name_len);
2775		if (err != ENOTSUP && err != ENOMEM)
2776			NVolSetErrors(vol);
2777	}
2778	return err;
2779}
2780
2781/**
2782 * ntfs_index_inode_read - read an index inode from its base inode
2783 * @base_ni:	base inode
2784 * @ni:		index inode to read
2785 *
2786 * ntfs_index_inode_read() is called from ntfs_index_inode_get() to read the
2787 * index inode described by @ni into memory from the base mft record described
2788 * by @base_ni.
2789 *
2790 * ntfs_index_inode_read() maps, pins and locks the base mft record and looks
2791 * up the attributes relating to the index described by @ni before setting up
2792 * the ntfs inode.
2793 *
2794 * Return 0 on success and errno on error.
2795 *
2796 * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
2797 * with the attribute type set to AT_INDEX_ALLOCATION.  Most importantly, for
2798 * small indices the index allocation attribute might not actually exist.
2799 * However, the index root attribute always exists but this does not need to
2800 * have an inode associated with it and this is why we define a new inode type
2801 * index.  Also, we need to have an attribute inode for the bitmap attribute
2802 * corresponding to the index allocation attribute and we can store this in the
2803 * appropriate field of the inode.
2804 */
2805static errno_t ntfs_index_inode_read(ntfs_inode *base_ni, ntfs_inode *ni)
2806{
2807	ntfs_volume *vol = ni->vol;
2808	MFT_RECORD *m;
2809	ATTR_RECORD *a;
2810	ntfs_attr_search_ctx *ctx;
2811	INDEX_ROOT *ir;
2812	u8 *ir_end, *index_end;
2813	ntfs_inode *bni;
2814	errno_t err;
2815	BOOL is_dir_index = (S_ISDIR(base_ni->mode) && ni->name == I30);
2816
2817	ntfs_debug("Entering for mft_no 0x%llx, index name length 0x%x.",
2818			(unsigned long long)ni->mft_no,
2819			(unsigned)ni->name_len);
2820	/* Mirror the values from the base inode. */
2821	ni->seq_no = base_ni->seq_no;
2822	ni->uid	= base_ni->uid;
2823	ni->gid	= base_ni->gid;
2824	/* Indices cannot be hard-linked so link count is always 1. */
2825	ni->link_count = 1;
2826	/* Set inode type to zero but preserve permissions. */
2827	ni->mode = base_ni->mode & ~S_IFMT;
2828	/* Map the mft record for the base inode. */
2829	err = ntfs_mft_record_map(base_ni, &m);
2830	if (err) {
2831		ntfs_error(vol->mp, "Failed to map base mft record.");
2832		m = NULL;
2833		ctx = NULL;
2834		goto err;
2835	}
2836	ctx = ntfs_attr_search_ctx_get(base_ni, m);
2837	if (!ctx) {
2838		ntfs_error(vol->mp, "Failed to get attribute search context.");
2839		err = ENOMEM;
2840		goto err;
2841	}
2842	/* Find the index root attribute. */
2843	err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, 0, NULL,
2844			0, ctx);
2845	if (err) {
2846		if (err == ENOENT)
2847			ntfs_error(vol->mp, "$INDEX_ROOT attribute is "
2848					"missing.");
2849		else
2850			ntfs_error(vol->mp, "Failed to lookup index root "
2851					"attribute.");
2852		goto err;
2853	}
2854	a = ctx->a;
2855	/* Set up the state. */
2856	if (a->non_resident) {
2857		ntfs_error(vol->mp, "Index root attribute is not resident.");
2858		goto err;
2859	}
2860	/* Ensure the attribute name is placed before the value. */
2861	if (a->name_length && (le16_to_cpu(a->name_offset) >=
2862			le16_to_cpu(a->value_offset))) {
2863		ntfs_error(vol->mp, "Index root attribute name is placed "
2864				"after the attribute value.");
2865		goto err;
2866	}
2867	/*
2868	 * Compressed/encrypted/sparse index root is not allowed, except for
2869	 * directories, where the flags just mean that newly created files in
2870	 * that directory should be created compressed/encrytped.  However,
2871	 * index root cannot be both compressed and encrypted.
2872	 */
2873	if (is_dir_index) {
2874		if (a->flags & ATTR_COMPRESSION_MASK)
2875			NInoSetCompressed(ni);
2876		if (a->flags & ATTR_IS_ENCRYPTED) {
2877			if (a->flags & ATTR_COMPRESSION_MASK) {
2878				ntfs_error(vol->mp, "Found encrypted and "
2879						"compressed index root "
2880						"attribute.");
2881				goto err;
2882			}
2883		}
2884		if (a->flags & ATTR_IS_SPARSE)
2885			NInoSetSparse(ni);
2886	} else if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
2887			ATTR_IS_SPARSE)) {
2888		ntfs_error(vol->mp, "Found compressed/encrypted/sparse index "
2889				"root attribute on non-directory index.");
2890		goto err;
2891	}
2892	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->value_offset));
2893	ir_end = (u8*)ir + le32_to_cpu(a->value_length);
2894	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
2895	if (ir_end > (u8*)ctx->m + vol->mft_record_size ||
2896			index_end > ir_end ||
2897			ir->index.index_length != ir->index.allocated_size) {
2898		ntfs_error(vol->mp, "Index root attribute is corrupt.");
2899		goto err;
2900	}
2901	if (is_dir_index) {
2902		if (ir->type != AT_FILENAME) {
2903			ntfs_error(vol->mp, "Indexed attribute is not the "
2904					"filename attribute.");
2905			goto err;
2906		}
2907		if (ir->collation_rule != COLLATION_FILENAME) {
2908			ntfs_error(vol->mp, "Index collation rule is not "
2909					"COLLATION_FILENAME.");
2910			goto err;
2911		}
2912	} else if (ir->type) {
2913		ntfs_error(vol->mp, "Index type is not 0 (type is 0x%x).",
2914				(unsigned)le32_to_cpu(ir->type));
2915		goto err;
2916	}
2917	ntfs_debug("Index collation rule is 0x%x.",
2918			(unsigned)le32_to_cpu(ir->collation_rule));
2919	ni->collation_rule = ir->collation_rule;
2920	ni->block_size = le32_to_cpu(ir->index_block_size);
2921	if (ni->block_size & (ni->block_size - 1)) {
2922		ntfs_error(vol->mp, "Index block size (%u) is not a power of "
2923				"two.", (unsigned)ni->block_size);
2924		goto err;
2925	}
2926	if (ni->block_size > PAGE_SIZE) {
2927		ntfs_error(vol->mp, "Index block size (%u) > PAGE_SIZE (%u) "
2928				"is not supported.  Sorry.",
2929				(unsigned)ni->block_size, PAGE_SIZE);
2930		err = ENOTSUP;
2931		goto err;
2932	}
2933	if (ni->block_size < NTFS_BLOCK_SIZE) {
2934		ntfs_error(vol->mp, "Index block size (%u) < NTFS_BLOCK_SIZE "
2935				"(%d) is not supported.  Sorry.",
2936				(unsigned)ni->block_size, NTFS_BLOCK_SIZE);
2937		err = ENOTSUP;
2938		goto err;
2939	}
2940	ni->block_size_shift = ffs(ni->block_size) - 1;
2941	/* Determine the size of a vcn in the index. */
2942	if (vol->cluster_size <= ni->block_size) {
2943		ni->vcn_size = vol->cluster_size;
2944		ni->vcn_size_shift = vol->cluster_size_shift;
2945	} else {
2946		ni->vcn_size = vol->sector_size;
2947		ni->vcn_size_shift = vol->sector_size_shift;
2948	}
2949	/* Check for presence of index allocation attribute. */
2950	err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, 0,
2951			NULL, 0, ctx);
2952	if (err) {
2953		if (err != ENOENT) {
2954			ntfs_error(vol->mp, "Failed to lookup index "
2955					"allocation attribute.");
2956			goto err;
2957		}
2958		if (ir->index.flags & LARGE_INDEX) {
2959			ntfs_error(vol->mp, "Index allocation attribute is "
2960					"not present but the index root "
2961					"attribute indicated it is.");
2962			goto err;
2963		}
2964		/* No index allocation. */
2965		ni->allocated_size = ni->data_size = ni->initialized_size = 0;
2966		/* We are done with the mft record, so we release it. */
2967		ntfs_attr_search_ctx_put(ctx);
2968		ntfs_mft_record_unmap(base_ni);
2969	} else {
2970		unsigned block_mask;
2971
2972		/* Index allocation present.  Setup state. */
2973		NInoSetIndexAllocPresent(ni);
2974		a = ctx->a;
2975		if (!a->non_resident) {
2976			ntfs_error(vol->mp, "Index allocation attribute is "
2977					"resident.");
2978			goto err;
2979		}
2980		/*
2981		 * Ensure the attribute name is placed before the mapping pairs
2982		 * array.
2983		 */
2984		if (a->name_length && (le16_to_cpu(a->name_offset) >=
2985				le16_to_cpu(a->mapping_pairs_offset))) {
2986			ntfs_error(vol->mp, "Index allocation attribute name "
2987					"is placed after the mapping pairs "
2988					"array.");
2989			goto err;
2990		}
2991		if (a->flags & ATTR_IS_ENCRYPTED) {
2992			ntfs_error(vol->mp, "Index allocation attribute is "
2993					"encrypted.");
2994			goto err;
2995		}
2996		if (a->flags & ATTR_IS_SPARSE) {
2997			ntfs_error(vol->mp, "Index allocation attribute is "
2998					"sparse.");
2999			goto err;
3000		}
3001		if (a->flags & ATTR_COMPRESSION_MASK) {
3002			ntfs_error(vol->mp, "Index allocation attribute is "
3003					"compressed.");
3004			goto err;
3005		}
3006		if (a->lowest_vcn) {
3007			ntfs_error(vol->mp, "First extent of index allocation "
3008					"attribute has non-zero lowest_vcn.");
3009			goto err;
3010		}
3011		ni->allocated_size = sle64_to_cpu(a->allocated_size);
3012		ni->data_size = sle64_to_cpu(a->data_size);
3013		ni->initialized_size = sle64_to_cpu(a->initialized_size);
3014		/*
3015		 * Verify the sizes are sane.  In particular both the data size
3016		 * and the initialized size must be multiples of the index
3017		 * block size or we will panic() when reading the boundary in
3018		 * ntfs_cluster_iodone().
3019		 *
3020		 * Also the allocated size must be a multiple of the volume
3021		 * cluster size.
3022		 */
3023		block_mask = ni->block_size - 1;
3024		if (ni->allocated_size & vol->cluster_size_mask ||
3025				ni->data_size & block_mask ||
3026				ni->initialized_size & block_mask) {
3027			ntfs_error(vol->mp, "$INDEX_ALLOCATION attribute "
3028					"contains invalid size.  Inode 0x%llx "
3029					"is corrupt.  Run chkdsk.",
3030					(unsigned long long)ni->mft_no);
3031			goto err;
3032		}
3033		/*
3034		 * We are done with the mft record, so we release it.
3035		 * Otherwise we would deadlock in ntfs_attr_inode_get().
3036		 */
3037		ntfs_attr_search_ctx_put(ctx);
3038		ntfs_mft_record_unmap(base_ni);
3039		m = NULL;
3040		ctx = NULL;
3041		/* Get the index bitmap attribute inode. */
3042		err = ntfs_attr_inode_get(base_ni, AT_BITMAP, ni->name,
3043				ni->name_len, FALSE, LCK_RW_TYPE_SHARED, &bni);
3044		if (err) {
3045			ntfs_error(vol->mp, "Failed to get bitmap attribute.");
3046			goto err;
3047		}
3048		if (NInoCompressed(bni) || NInoEncrypted(bni) ||
3049				NInoSparse(bni)) {
3050			ntfs_error(vol->mp, "Bitmap attribute is compressed "
3051					"and/or encrypted and/or sparse.");
3052			lck_rw_unlock_shared(&bni->lock);
3053			(void)vnode_put(bni->vn);
3054			goto err;
3055		}
3056		/* Consistency check bitmap size vs. index allocation size. */
3057		if ((bni->data_size << 3) < (ni->data_size >>
3058				ni->block_size_shift)) {
3059			ntfs_error(vol->mp, "Index bitmap too small (0x%llx) "
3060					"for index allocation (0x%llx).",
3061					(unsigned long long)bni->data_size,
3062					(unsigned long long)ni->data_size);
3063			lck_rw_unlock_shared(&bni->lock);
3064			(void)vnode_put(bni->vn);
3065			goto err;
3066		}
3067		lck_rw_unlock_shared(&bni->lock);
3068		(void)vnode_put(bni->vn);
3069	}
3070	/*
3071	 * Attach the base inode to the attribute inode and vice versa.  Note
3072	 * we do not need to lock the new inode as we still have exclusive
3073	 * access to it.
3074	 */
3075	lck_mtx_lock(&base_ni->attr_nis_lock);
3076	if (NInoDeleted(base_ni)) {
3077		lck_mtx_unlock(&base_ni->attr_nis_lock);
3078		return EDEADLK;
3079	}
3080	if ((base_ni->nr_attr_nis + 1) * sizeof(ntfs_inode *) >
3081			base_ni->attr_nis_alloc) {
3082		ntfs_inode **tmp;
3083		int new_size;
3084
3085		new_size = base_ni->attr_nis_alloc + 4 * sizeof(ntfs_inode *);
3086		tmp = OSMalloc(new_size, ntfs_malloc_tag);
3087		if (!tmp) {
3088			ntfs_error(vol->mp, "Failed to allocated internal "
3089					"buffer.");
3090			lck_mtx_unlock(&base_ni->attr_nis_lock);
3091			return ENOMEM;
3092		}
3093		if (base_ni->attr_nis_alloc) {
3094			if (base_ni->nr_attr_nis > 0)
3095				memcpy(tmp, base_ni->attr_nis,
3096						base_ni->nr_attr_nis *
3097						sizeof(ntfs_inode *));
3098			OSFree(base_ni->attr_nis, base_ni->attr_nis_alloc,
3099					ntfs_malloc_tag);
3100		}
3101		base_ni->attr_nis_alloc = new_size;
3102		base_ni->attr_nis = tmp;
3103	}
3104	base_ni->attr_nis[base_ni->nr_attr_nis++] = ni;
3105	ni->nr_extents = -1;
3106	ni->base_ni = base_ni;
3107	ni->base_attr_nis_lock = &base_ni->attr_nis_lock;
3108	lck_mtx_unlock(&base_ni->attr_nis_lock);
3109	ntfs_debug("Done.");
3110	return 0;
3111err:
3112	if (ctx)
3113		ntfs_attr_search_ctx_put(ctx);
3114	if (m)
3115		ntfs_mft_record_unmap(base_ni);
3116	if (!err)
3117		err = EIO;
3118	ntfs_error(vol->mp, "Failed (error %d) for index inode 0x%llx, "
3119			"index name_len 0x%x.  Run chkdsk.", (int)err,
3120			(unsigned long long)ni->mft_no,
3121			(unsigned)ni->name_len);
3122	if (err != ENOTSUP && err != ENOMEM)
3123		NVolSetErrors(vol);
3124	return err;
3125}
3126
3127/**
3128 * ntfs_inode_free - free an ntfs inode
3129 * @ni:		ntfs inode to free
3130 *
3131 * Free the resources used by the ntfs inode @ni as well as @ni itself, which
3132 * is NInoReclaim(), unhashed, and all waiters have been woken up.
3133 */
3134static inline void ntfs_inode_free(ntfs_inode *ni)
3135{
3136	ntfs_volume *vol = ni->vol;
3137	BOOL do_release;
3138
3139	/* No need to lock at this stage as no one else has a reference. */
3140	if (ni->nr_extents > 0) {
3141		int i;
3142
3143		for (i = 0; i < ni->nr_extents; i++)
3144			ntfs_inode_reclaim(ni->extent_nis[i]);
3145		OSFree(ni->extent_nis, ni->extent_alloc, ntfs_malloc_tag);
3146	}
3147	/*
3148	 * If this is an attribute or index inode, detach it from the base
3149	 * inode if it is attached.
3150	 */
3151	if (NInoAttr(ni) && ni->nr_extents == -1) {
3152		ntfs_inode *base_ni, **attr_nis;
3153		int i;
3154
3155		/* Lock the base inode. */
3156		lck_mtx_lock(ni->base_attr_nis_lock);
3157		base_ni = ni->base_ni;
3158		/* Find the current inode in the base inode array. */
3159		attr_nis = base_ni->attr_nis;
3160		for (i = 0; i < base_ni->nr_attr_nis; i++) {
3161			if (attr_nis[i] != ni)
3162				continue;
3163			/*
3164			 * Delete the inode from the array and move any
3165			 * following entries forward over the current entry.
3166			 */
3167			if (i + 1 < base_ni->nr_attr_nis)
3168				memmove(attr_nis + i, attr_nis + i + 1,
3169						(base_ni->nr_attr_nis -
3170						(i + 1)) *
3171						sizeof(ntfs_inode *));
3172			base_ni->nr_attr_nis--;
3173			break;
3174		}
3175		ni->nr_extents = 0;
3176		ni->base_ni = NULL;
3177		lck_mtx_unlock(ni->base_attr_nis_lock);
3178		ni->base_attr_nis_lock = NULL;
3179	}
3180	if (ni->rl.alloc)
3181		OSFree(ni->rl.rl, ni->rl.alloc, ntfs_malloc_tag);
3182	if (ni->attr_list_alloc)
3183		OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag);
3184	if (ni->attr_list_rl.alloc)
3185		OSFree(ni->attr_list_rl.rl, ni->attr_list_rl.alloc,
3186				ntfs_malloc_tag);
3187	ntfs_dirhints_put(ni, 0);
3188	if (ni->name_len && ni->name != I30 &&
3189			ni->name != NTFS_SFM_RESOURCEFORK_NAME &&
3190			ni->name != NTFS_SFM_AFPINFO_NAME)
3191		OSFree(ni->name, (ni->name_len + 1) * sizeof(ntfschar),
3192				ntfs_malloc_tag);
3193	/* Remove the inode from the list of inodes in the volume. */
3194	lck_mtx_lock(&vol->inodes_lock);
3195	LIST_REMOVE(ni, inodes);
3196	/*
3197	 * If this was the last inode and the release of the volume was
3198	 * postponed then release the volume now.
3199	 */
3200	do_release = FALSE;
3201	if (LIST_EMPTY(&vol->inodes) && NVolPostponedRelease(vol)) {
3202		NVolClearPostponedRelease(vol);
3203		do_release = TRUE;
3204	}
3205	lck_mtx_unlock(&vol->inodes_lock);
3206	/* Destroy all the locks before finally discarding the ntfs inode. */
3207	lck_rw_destroy(&ni->lock, ntfs_lock_grp);
3208	lck_spin_destroy(&ni->size_lock, ntfs_lock_grp);
3209	ntfs_rl_deinit(&ni->rl);
3210	ntfs_rl_deinit(&ni->attr_list_rl);
3211	lck_mtx_destroy(&ni->extent_lock, ntfs_lock_grp);
3212	OSFree(ni, sizeof(ntfs_inode), ntfs_malloc_tag);
3213	/* If the volume release was postponed, perform it now. */
3214	if (do_release)
3215		ntfs_do_postponed_release(vol);
3216}
3217
3218/**
3219 * ntfs_inode_reclaim - destroy an ntfs inode freeing all its resources
3220 * @ni:		ntfs inode to destroy
3221 *
3222 * Destroy the ntfs inode @ni freeing all its resources in the process.  We are
3223 * assured that no-one can get the inode because to do that they would have to
3224 * take a reference on the corresponding vnode and that is not possible because
3225 * the vnode is flagged for termination thus the vnode_get() will return an
3226 * error.
3227 *
3228 * Note: When called from reclaim, the vnode of the ntfs inode has a zero
3229 *	 v_iocount and v_usecount and vnode_isrecycled() is true.
3230 *
3231 * This function cannot fail and always returns 0.
3232 */
3233errno_t ntfs_inode_reclaim(ntfs_inode *ni)
3234{
3235	vnode_t vn;
3236
3237	/* If @ni is NULL, do not do anything. */
3238	if (!ni)
3239		return 0;
3240	ntfs_debug("Entering for mft_no 0x%llx.",
3241			(unsigned long long)ni->mft_no);
3242	/*
3243	 * If this is a base inode and there are attribute/index inodes loaded,
3244	 * then recycle them now.
3245	 *
3246	 * FIXME: For a forced unmount where something is genuinely kept busy
3247	 * this will cause the system to hang but this is the only way to avoid
3248	 * a crash in NTFS...
3249	 */
3250	if (!NInoAttr(ni)) {
3251		int count = 0;
3252
3253		lck_mtx_lock(&ni->attr_nis_lock);
3254		while (ni->nr_attr_nis > 0) {
3255			ntfs_inode *attr_ni;
3256			int err;
3257
3258			attr_ni = ni->attr_nis[ni->nr_attr_nis - 1];
3259			err = 1;
3260			if (!NInoDeleted(attr_ni))
3261				err = vnode_get(attr_ni->vn);
3262			lck_mtx_unlock(&ni->attr_nis_lock);
3263			if (!err) {
3264				vnode_recycle(attr_ni->vn);
3265				vnode_put(attr_ni->vn);
3266			}
3267			/* Give it a chance to go away... */
3268			(void)thread_block(THREAD_CONTINUE_NULL);
3269			if (count < 1000)
3270				count++;
3271			else if (count == 1000) {
3272				ntfs_warning(ni->vol->mp, "Failed to reclaim "
3273						"inode 0x%llx because it has "
3274						"a busy attribute/index "
3275						"inode.  Going to keep "
3276						"trying for ever...",
3277						(unsigned long long)
3278						ni->mft_no);
3279				count = 1001;
3280			}
3281			lck_mtx_lock(&ni->attr_nis_lock);
3282		}
3283		lck_mtx_unlock(&ni->attr_nis_lock);
3284	}
3285	lck_mtx_lock(&ntfs_inode_hash_lock);
3286	NInoSetReclaim(ni);
3287	/*
3288	 * If the inode has been deleted then it has been removed from the ntfs
3289	 * inode hash already.
3290	 */
3291	if (!NInoDeleted(ni)) {
3292		NInoSetDeleted(ni);
3293		ntfs_inode_hash_rm_nolock(ni);
3294	}
3295	/*
3296	 * Need this for the error handling code paths but is ok for normal
3297	 * code path, too.
3298	 */
3299	NInoClearAllocLocked(ni);
3300	lck_mtx_unlock(&ntfs_inode_hash_lock);
3301	/* In case someone is waiting on the inode do a wakeup. */
3302	ntfs_inode_wakeup(ni);
3303	/* Detach the ntfs inode from its vnode, if there is one. */
3304	vn = ni->vn;
3305	if (vn)
3306		vnode_clearfsnode(vn);
3307	/*
3308	 * We now have exclusive access to the ntfs inode and as it is unhashed
3309	 * no-one else can ever find it thus we can finally destroy it.
3310	 */
3311	if (ni->nr_refs > 0)
3312		ntfs_debug("Called for mft_no 0x%llx, attribute type 0x%x, "
3313				"nr_refs %d.", (unsigned long long)ni->mft_no,
3314				(unsigned)le32_to_cpu(ni->type), ni->nr_refs);
3315	ntfs_inode_free(ni);
3316	ntfs_debug("Done.");
3317	return 0;
3318}
3319
3320/**
3321 * ntfs_inode_data_sync - synchronize an inode's in-core data
3322 * @ni:		ntfs inode the data of which to synchronize to disk
3323 * @ioflags:	flags describing the i/o request
3324 *
3325 * Sync all dirty cached data belonging/related to the ntfs inode @ni.
3326 *
3327 * If @ioflags has the IO_SYNC bit set, wait for all i/o to complete before
3328 * returning.
3329 *
3330 * If @ioflags has the IO_CLOSE bit set, this signals cluster_push() that the
3331 * i/o is issued from the close path (or in our case more precisely from the
3332 * VNOP_INACTIVE() path).
3333 *
3334 * Note: When called from reclaim (via VNOP_FSYNC() and hence ntfs_vnop_fsync()
3335 *	 and ntfs_inode_sync(), the vnode has a zero v_iocount and v_usecount
3336 *	 and vnode_isrecycled() is true.  Thus we cannot obtain any
3337 *	 attribute/raw inodes inside ntfs_inode_data_sync() or the vnode_ref()
3338 *	 on the base vnode that is done as part of getting an attribute/raw
3339 *	 inode causes a panic() to trigger as both the iocount and usecount are
3340 *	 zero on the base vnode.
3341 *
3342 * Return 0 on success and the error code on error.
3343 */
3344static errno_t ntfs_inode_data_sync(ntfs_inode *ni, const int ioflags)
3345{
3346	ntfs_volume *vol = ni->vol;
3347	vnode_t vn = ni->vn;
3348	errno_t err = 0;
3349
3350	/*
3351	 * $MFT/$DATA and $MFTMirr/$DATA are accessed exclusively via
3352	 * buf_meta_bread(), etc, i.e. they do not use the UBC, thus to write
3353	 * them we only need to worry about writing out any dirty buffers.
3354	 */
3355	lck_rw_lock_shared(&ni->lock);
3356	if (ni == vol->mft_ni || ni == vol->mftmirr_ni) {
3357		/* Flush all dirty buffers associated with the vnode. */
3358		ntfs_debug("Calling buf_flushdirtyblks() for $MFT%s/$DATA.",
3359				ni == vol->mft_ni ? "" : "Mirr");
3360		buf_flushdirtyblks(vn, ioflags & IO_SYNC, 0 /* lock flags */,
3361				"ntfs_inode_sync");
3362		lck_rw_unlock_shared(&ni->lock);
3363		return 0;
3364	}
3365	if (NInoNonResident(ni)) {
3366		int (*callback)(buf_t, void *) = NULL;
3367
3368		if (ni->type != AT_INDEX_ALLOCATION) {
3369			if (NInoCompressed(ni) && !NInoRaw(ni)) {
3370#if 0
3371				err = ntfs_inode_sync_compressed(ni, uio,
3372						ubc_getsize(vn), ioflags);
3373				if (!err)
3374					ntfs_debug("Done (ntfs_inode_sync_"
3375							"compressed()).");
3376				else
3377					ntfs_error(vol->mp, "Failed (ntfs_"
3378							"inode_sync_"
3379							"compressed(), error "
3380							"%d).", err);
3381#endif
3382				lck_rw_unlock_shared(&ni->lock);
3383				ntfs_error(vol->mp, "Syncing compressed file "
3384						"inodes is not implemented "
3385						"yet, sorry.");
3386				return ENOTSUP;
3387			}
3388			if (NInoEncrypted(ni)) {
3389#if 0
3390				callback = ntfs_cluster_iodone;
3391#endif
3392				lck_rw_unlock_shared(&ni->lock);
3393				ntfs_error(vol->mp, "Syncing encrypted file "
3394						"inodes is not implemented "
3395						"yet, sorry.");
3396				return ENOTSUP;
3397			}
3398		}
3399		/*
3400		 * Write any dirty clusters.  We are guaranteed not to have any
3401		 * for mst protected attributes.
3402		 */
3403		if (!NInoMstProtected(ni)) {
3404			/* Write out any dirty clusters. */
3405			ntfs_debug("Calling cluster_push_ext().");
3406			(void)cluster_push_ext(vn, ioflags, callback, NULL);
3407		}
3408		/* Flush all dirty buffers associated with the vnode. */
3409		ntfs_debug("Calling buf_flushdirtyblks().");
3410		buf_flushdirtyblks(vn, ioflags & IO_SYNC, 0 /* lock flags */,
3411				"ntfs_inode_sync");
3412#ifdef DEBUG
3413	} else /* if (!NInoNonResident(ni)) */ {
3414		if (vnode_hasdirtyblks(vn))
3415			ntfs_warning(vol->mp, "resident and "
3416					"vnode_hasdirtyblks!");
3417#endif /* DEBUG */
3418	}
3419	/* ubc_msync() cannot be called with the inode lock held. */
3420	lck_rw_unlock_shared(&ni->lock);
3421	/*
3422	 * If we have any dirty pages in the VM page cache, write them out now.
3423	 * For a resident attribute this will push the data into the mft record
3424	 * which needs to be pushed to disk later/elsewhere.
3425	 */
3426	ntfs_debug("Calling ubc_msync() for inode data.");
3427	err = ubc_msync(vn, 0, ubc_getsize(vn), NULL, UBC_PUSHDIRTY |
3428			(ioflags & IO_SYNC ? UBC_SYNC : 0));
3429	if (err)
3430		ntfs_error(vol->mp, "ubc_msync() of data for mft_no 0x%llx "
3431				"failed (error %d).",
3432				(unsigned long long)ni->mft_no, err);
3433	return err;
3434}
3435
3436struct fn_list_entry {
3437	SLIST_ENTRY(fn_list_entry) list_entry;
3438	unsigned alloc, size;
3439	FILENAME_ATTR fn;
3440};
3441
3442/**
3443 * ntfs_inode_sync_to_mft_record - update metadata with changes to ntfs inode
3444 * @ni:		ntfs inode the changes of which to update the metadata with
3445 *
3446 * Sync all dirty cached data belonging/related to the ntfs inode @ni.
3447 *
3448 * Note: When called from reclaim (via VNOP_FSYNC() and hence ntfs_vnop_fsync()
 *	 and ntfs_inode_sync()), the vnode has a zero v_iocount and v_usecount
3450 *	 and vnode_isrecycled() is true.  Thus we cannot obtain any
3451 *	 attribute/raw inodes inside ntfs_inode_sync_to_mft_record() or the
3452 *	 vnode_ref() on the base vnode that is done as part of getting an
3453 *	 attribute/raw inode causes a panic() to trigger as both the iocount
3454 *	 and usecount are zero on the base vnode.
3455 *
3456 * Return 0 on success and the error code on error.
3457 */
3458static errno_t ntfs_inode_sync_to_mft_record(ntfs_inode *ni)
3459{
3460	sle64 creation_time, last_data_change_time, last_mft_change_time,
3461			last_access_time, allocated_size, data_size;
3462	ino64_t dir_mft_no;
3463	ntfs_volume *vol = ni->vol;
3464	MFT_RECORD *m;
3465	ntfs_attr_search_ctx *actx;
3466	ATTR_RECORD *a;
3467	SLIST_HEAD(, fn_list_entry) fn_list;
3468	struct fn_list_entry *next;
3469	ntfs_index_context *ictx;
3470	ntfs_inode *dir_ni, *dir_ia_ni;
3471	FILENAME_ATTR *fn;
3472	errno_t err;
3473	FILE_ATTR_FLAGS file_attributes = 0;
3474	BOOL ignore_errors, dirty_times, dirty_file_attributes, dirty_sizes;
3475	BOOL dirty_set_file_bits, modified;
3476	static const char ies[] = "Failed to update directory index entry(ies) "
3477			"of inode 0x%llx because %s (error %d).  Run chkdsk "
3478			"or touch the inode again to retry the update.";
3479
3480	/*
3481	 * There is nothing to do for attribute inodes and raw inodes.  Note
3482	 * raw inodes are always attribute inodes so no need to check for them.
3483	 *
3484	 * There is nothing to do for clean inodes.
3485	 */
3486	if (NInoAttr(ni) || !NInoDirty(ni))
3487		return 0;
3488	lck_rw_lock_shared(&ni->lock);
3489	err = ntfs_mft_record_map(ni, &m);
3490	if (err) {
3491		lck_rw_unlock_shared(&ni->lock);
3492		ntfs_error(vol->mp, "Failed to map mft record.");
3493		return err;
3494	}
3495	actx = ntfs_attr_search_ctx_get(ni, m);
3496	if (!actx) {
3497		ntfs_mft_record_unmap(ni);
3498		lck_rw_unlock_shared(&ni->lock);
3499		ntfs_error(vol->mp, "Failed to get attribute search context.");
3500		return ENOMEM;
3501	}
3502	ignore_errors = FALSE;
3503	dirty_times = NInoTestClearDirtyTimes(ni);
3504	dirty_file_attributes = NInoTestClearDirtyFileAttributes(ni);
3505	dirty_sizes = NInoTestClearDirtySizes(ni);
3506	/* Directories always have their sizes set to zero. */
3507	if (S_ISDIR(ni->mode))
3508		dirty_sizes = FALSE;
3509	dirty_set_file_bits = NInoTestClearDirtySetFileBits(ni);
3510	/*
3511	 * Update the access times/file attributes in the standard information
3512	 * attribute.
3513	 */
3514	modified = FALSE;
3515	creation_time = last_data_change_time = last_mft_change_time =
3516			last_access_time = 0;
3517	if (dirty_times || dirty_file_attributes) {
3518		STANDARD_INFORMATION *si;
3519
3520		err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, AT_UNNAMED, 0,
3521				0, NULL, 0, actx);
3522		if (err)
3523			goto err;
3524		si = (STANDARD_INFORMATION*)((u8*)actx->a +
3525				le16_to_cpu(actx->a->value_offset));
3526		if (dirty_file_attributes) {
3527			file_attributes = ni->file_attributes;
3528			if (si->file_attributes != file_attributes) {
3529				ntfs_debug("Updating file attributes for "
3530						"inode 0x%llx: old = 0x%x, "
3531						"new = 0x%x",
3532						(unsigned long long)ni->mft_no,
3533						(unsigned)le32_to_cpu(
3534						si->file_attributes),
3535						(unsigned)le32_to_cpu(
3536						file_attributes));
3537				si->file_attributes = file_attributes;
3538				modified = TRUE;
3539			}
3540			/*
3541			 * We have updated the standard information attribute.
3542			 * Now need to update the file attributes for the
3543			 * directory entries which also have the
3544			 * FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set on all
3545			 * directory inodes.
3546			 */
3547			if (S_ISDIR(ni->mode))
3548				file_attributes |=
3549					FILE_ATTR_DUP_FILENAME_INDEX_PRESENT;
3550		}
3551		creation_time = utc2ntfs(ni->creation_time);
3552		if (si->creation_time != creation_time) {
3553			ntfs_debug("Updating creation_time for inode 0x%llx: "
3554					"old = 0x%llx, new = 0x%llx",
3555					(unsigned long long)ni->mft_no,
3556					(unsigned long long)
3557					sle64_to_cpu(si->creation_time),
3558					(unsigned long long)
3559					sle64_to_cpu(creation_time));
3560			si->creation_time = creation_time;
3561			modified = TRUE;
3562		}
3563		last_data_change_time = utc2ntfs(ni->last_data_change_time);
3564		if (si->last_data_change_time != last_data_change_time) {
3565			ntfs_debug("Updating last_data_change_time for inode "
3566					"0x%llx: old = 0x%llx, new = 0x%llx",
3567					(unsigned long long)ni->mft_no,
3568					(unsigned long long)
3569					sle64_to_cpu(si->last_data_change_time),
3570					(unsigned long long)
3571					sle64_to_cpu(last_data_change_time));
3572			si->last_data_change_time = last_data_change_time;
3573			modified = TRUE;
3574		}
3575		last_mft_change_time = utc2ntfs(ni->last_mft_change_time);
3576		if (si->last_mft_change_time != last_mft_change_time) {
3577			ntfs_debug("Updating last_mft_change_time for inode "
3578					"0x%llx: old = 0x%llx, new = 0x%llx",
3579					(unsigned long long)ni->mft_no,
3580					(unsigned long long)
3581					sle64_to_cpu(si->last_mft_change_time),
3582					(unsigned long long)
3583					sle64_to_cpu(last_mft_change_time));
3584			si->last_mft_change_time = last_mft_change_time;
3585			modified = TRUE;
3586		}
3587		last_access_time = utc2ntfs(ni->last_access_time);
3588		if (si->last_access_time != last_access_time) {
3589			ntfs_debug("Updating last_access_time for inode "
3590					"0x%llx: old = 0x%llx, new = 0x%llx",
3591					(unsigned long long)ni->mft_no,
3592					(unsigned long long)
3593					sle64_to_cpu(si->last_access_time),
3594					(unsigned long long)
3595					sle64_to_cpu(last_access_time));
3596			si->last_access_time = last_access_time;
3597			modified = TRUE;
3598		}
3599	}
3600	/*
3601	 * If we just modified the standard information attribute we need to
3602	 * mark the mft record it is in dirty.
3603	 */
3604	if (modified)
3605		NInoSetMrecNeedsDirtying(actx->ni);
3606	/*
3607	 * If the special mode bits S_ISUID, S_ISGID, and/or S_ISVTX need to be
3608	 * updated, do it now..
3609	 */
3610	if (dirty_set_file_bits) {
3611		modified = FALSE;
3612		// TODO: Lookup $EA_INFORMATION and $EA and if not there create
3613		// them, then if the SETFILEBITS EA is not present, create it,
3614		// then if the bits in the EA do not match the new ones update
3615		// the EA with the new bits.
3616		if (modified)
3617			NInoSetMrecNeedsDirtying(actx->ni);
3618		ntfs_attr_search_ctx_reinit(actx);
3619	}
3620	/* We ensure above that this never triggers for directory inodes. */
3621	if (dirty_sizes) {
3622		lck_spin_lock(&ni->size_lock);
3623		allocated_size = cpu_to_sle64(NInoNonResident(ni) &&
3624				(NInoSparse(ni) || NInoCompressed(ni)) ?
3625				ni->compressed_size : ni->allocated_size);
3626		data_size = cpu_to_sle64(ni->data_size);
3627		lck_spin_unlock(&ni->size_lock);
3628	} else
3629		allocated_size = data_size = 0;
3630	/*
3631	 * If the directory index entries need updating, do it now.  Note we
3632	 * use goto to skip this section to reduce indentation.
3633	 *
3634	 * Note, there is one special case; unlinked but not yet deleted inodes
3635	 * (POSIX semantics of being able to access an opened file/directory
3636	 * after unlinking it until it is closed when it is really deleted).
3637	 * The special thing here is that ntfs_unlink() has removed all
3638	 * directory entries pointing to the inode we are writing out but it
3639	 * has left the last filename attribute in the mft record of the inode
3640	 * thus we would find a filename attribute for which we would then fail
3641	 * to lookup the directory entry as it does not exist any more.  Thus
3642	 * we skip directory index entry updates completely for all unlinked
3643	 * inodes.  Even if this problem did not exist, it would still make
3644	 * sense to skip directory index entry updates for unlinked files as
3645	 * they by definition do not have any directory entries so we are just
	 * wasting cpu cycles trying to find some.
3647	 *
3648	 * Note: Any non-serious errors during the update of the index entries
3649	 * can be ignored because having not up-to-date index entries wrt the
3650	 * inode times and/or sizes does not actually make anything not work
3651	 * and even chkdsk /f does not report it as an error and the verbose
3652	 * chkdsk /f/v only reports it as a "cleanup of a minor inconsistency".
3653	 * Further, any transient errors get automatically corrected the next
3654	 * time an update happens as the updates simply overwrite the old
3655	 * values each time.
3656	 */
3657	if ((!dirty_file_attributes && !dirty_times && !dirty_sizes) ||
3658			!ni->link_count) {
3659		ntfs_attr_search_ctx_put(actx);
3660		ntfs_mft_record_unmap(ni);
3661		lck_rw_unlock_shared(&ni->lock);
3662		goto done;
3663	}
3664	ictx = NULL;
3665	ignore_errors = TRUE;
3666	/*
3667	 * Enumerate all filename attributes.  We do not reset the search
3668	 * context as we will be enumerating the filename attributes which come
3669	 * after the standard information attribute.
3670	 *
3671	 * Note that whilst from an NTFS point of view it would be perfectly
3672	 * safe to mix the attribute lookups with the index lookups because
3673	 * NTFS does not allow hard links to directories thus we are guaranteed
3674	 * not to be working on the directory inode that we will be using to do
3675	 * index lookups in, thus no danger of deadlock exists, we cannot
3676	 * actually do that as explained below.  There is only one special case
3677	 * we need to deal with where this is not true and this is the root
3678	 * directory of the volume which contains an entry for itself with the
3679	 * name ".".
3680	 *
3681	 * The reason we cannot mix the attribute lookups with the index
3682	 * lookups is that the mft record(s) for the directory can be in the
3683	 * same page as the mft record(s) for the file we are currently working
3684	 * on and when this happens we deadlock when ntfs_index_lookup() tries
3685	 * to map the mft record for the directory as we are holding the page
3686	 * it is in locked already due to the mapped mft record(s) of the file.
3687	 *
3688	 * Thus we go over all the filename attributes and copy them one by one
3689	 * into a temporary buffer, then release the mft record of the file and
3690	 * only then do the index lookups for each copied filename attribute.
3691	 *
3692	 * This is ugly but still a lot more efficient than having to drop and
3693	 * re-map the mft record for the file for each filename!  And it does
3694	 * have one advantage and that is that the root directory "." update
3695	 * does not need to be treated specially.
3696	 */
3697	SLIST_INIT(&fn_list);
3698	do {
3699		unsigned size, alloc;
3700
3701		err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0,
3702				actx);
3703		if (err) {
3704			/*
3705			 * ENOENT means that there are no more filenames in the
3706			 * mft record, i.e. we are done.
3707			 */
3708			if (err == ENOENT)
3709				break;
3710			/* Real error. */
3711			ntfs_error(vol->mp, ies,
3712					(unsigned long long)ni->mft_no,
3713					"looking up a filename attribute in "
3714					"the inode failed", err);
3715			ignore_errors = FALSE;
3716			goto list_err;
3717		}
3718		a = actx->a;
3719		if (a->non_resident) {
3720			ntfs_error(vol->mp, "Non-resident filename attribute "
3721					"found.  Run chkdsk.");
3722			err = EIO;
3723			ignore_errors = FALSE;
3724			goto list_err;
3725		}
3726		/*
3727		 * Allocate a new list entry, copy the current filename
3728		 * attribute value into it, and attach it to the end of the
3729		 * list.
3730		 */
3731		size = le32_to_cpu(a->value_length);
3732		alloc = offsetof(struct fn_list_entry, fn) + size;
3733		next = OSMalloc(alloc, ntfs_malloc_tag);
3734		if (!next) {
3735			ntfs_error(vol->mp, ies,
3736					(unsigned long long)ni->mft_no,
3737					"there was not enough memory to "
3738					"allocate a temporary filename buffer",
3739					ENOMEM);
3740			err = ENOMEM;
3741			goto list_err;
3742		}
3743		next->alloc = alloc;
3744		next->size = size;
3745		memcpy(&next->fn, (u8*)a + le16_to_cpu(a->value_offset), size);
3746		/*
3747		 * It makes no difference in what order we process the names so
		 * we just insert them all at the list head thus effectively
3749		 * processing them in LIFO order.
3750		 */
3751		SLIST_INSERT_HEAD(&fn_list, next, list_entry);
3752	} while (1);
3753	/* We are done with the mft record so release it. */
3754	ntfs_attr_search_ctx_put(actx);
3755	ntfs_mft_record_unmap(ni);
3756	lck_rw_unlock_shared(&ni->lock);
3757	actx = NULL;
3758	m = NULL;
3759	/*
3760	 * We have now gathered all the filenames into the @fn_list list and
3761	 * are ready to start looking up each filename in its parent directory
3762	 * index and updating the matching directory entry.
3763	 *
3764	 * Note that because we currently hold no locks any of the filenames
3765	 * we gathered can be unlinked() before we try to update them.  And
3766	 * they can even be re-created with a different target mft record or
3767	 * even with the same one but with an incremented sequence number.  We
3768	 * need to take this into consideration when handling errors below.
3769	 *
3770	 * Start by allocating an index context for doing the index lookups.
3771	 */
3772	ictx = ntfs_index_ctx_alloc();
3773	if (!ictx) {
3774		ntfs_debug(ies, (unsigned long long)ni->mft_no, "there was "
3775				"not enough memory to allocate an index "
3776				"context", ENOMEM);
3777		err = ENOMEM;
3778		goto list_err;
3779	}
3780	/*
3781	 * We cannot use SLIST_FOREACH() as that is not safe wrt to removal of
3782	 * the current element and we want to free each element as we go along
3783	 * so we do not have to traverse the list a second time just to do the
3784	 * freeing.
3785	 */
3786	dir_ni = NULL;
3787	while (!SLIST_EMPTY(&fn_list)) {
3788		next = SLIST_FIRST(&fn_list);
3789		/*
3790		 * We now have the next filename in @next->fn and
3791		 * @next->size.
3792		 */
3793		fn = &next->fn;
3794		dir_mft_no = MREF_LE(fn->parent_directory);
3795		/*
3796		 * Obtain the inode of the parent directory in which the
3797		 * current name is indexed if we do not have it already.
3798		 */
3799		if (!dir_ni || dir_ni->mft_no != dir_mft_no) {
3800			if (dir_ni) {
3801				lck_rw_unlock_exclusive(&dir_ia_ni->lock);
3802				lck_rw_unlock_exclusive(&dir_ni->lock);
3803				(void)vnode_put(dir_ia_ni->vn);
3804				(void)vnode_put(dir_ni->vn);
3805			}
3806			err = ntfs_inode_get(vol, dir_mft_no, FALSE,
3807					LCK_RW_TYPE_EXCLUSIVE, &dir_ni, NULL,
3808					NULL);
3809			if (err) {
3810				if (err != ENOENT) {
3811					ntfs_error(vol->mp, ies,
3812							(unsigned long long)
3813							ni->mft_no, "opening "
3814							"the parent directory "
3815							"inode failed", err);
3816					goto list_err;
3817				}
3818				/*
3819				 * Someone deleted the directory (and possibly
3820				 * recreated a new inode) under our feet.
3821				 * This is not an error so simply ignore this
3822				 * name and continue to the next one.
3823				 */
3824do_skip_name:
3825				ntfs_debug("Skipping name as it and its "
3826						"parent directory were "
3827						"unlinked under our feet.");
3828				dir_ni = NULL;
3829				goto skip_name;
3830			}
3831			/*
3832			 * If the directory has changed identity it has been
3833			 * deleted and recreated which means the directory
3834			 * entry we want to update has been removed so skip
3835			 * this name.
3836			 */
3837			if (dir_ni->seq_no != MSEQNO_LE(fn->parent_directory)) {
3838				lck_rw_unlock_exclusive(&dir_ni->lock);
3839				vnode_put(dir_ni->vn);
3840				goto do_skip_name;
3841			}
3842			err = ntfs_index_inode_get(dir_ni, I30, 4, FALSE,
3843					&dir_ia_ni);
3844			if (err) {
3845				ntfs_debug(ies, (unsigned long long)ni->mft_no,
3846						"opening the parent directory "
3847						"index inode failed", err);
3848				lck_rw_unlock_exclusive(&dir_ni->lock);
3849				(void)vnode_put(dir_ni->vn);
3850				goto list_err;
3851			}
3852			lck_rw_lock_exclusive(&dir_ia_ni->lock);
3853		}
3854		ntfs_index_ctx_init(ictx, dir_ia_ni);
3855		/* Get the index entry matching the current filename. */
3856		err = ntfs_index_lookup(fn, next->size, &ictx);
3857		if (err || ictx->entry->indexed_file !=
3858				MK_LE_MREF(ni->mft_no, ni->seq_no)) {
3859			if (err && err != ENOENT) {
3860				ntfs_error(vol->mp, ies,
3861						(unsigned long long)ni->mft_no,
3862						"looking up the name in the "
3863						"parent directory inode "
3864						"failed", err);
3865				if (err != ENOMEM)
3866					ignore_errors = FALSE;
3867				ntfs_index_ctx_put_reuse(ictx);
3868				lck_rw_unlock_exclusive(&dir_ia_ni->lock);
3869				lck_rw_unlock_exclusive(&dir_ni->lock);
3870				(void)vnode_put(dir_ia_ni->vn);
3871				(void)vnode_put(dir_ni->vn);
3872				goto list_err;
3873			}
3874			/*
3875			 * Someone unlinked the name (and possibly recreated a
3876			 * new inode) under our feet.  This is not an error so
3877			 * simply ignore this name and continue to the next
3878			 * one.
3879			 */
3880			ntfs_debug("Skipping name as it was unlinked under "
3881					"our feet.");
3882			goto put_skip_name;
3883		}
3884		/* Update the found index entry. */
3885		fn = &ictx->entry->key.filename;
3886		modified = FALSE;
3887		if (dirty_file_attributes && fn->file_attributes !=
3888				file_attributes) {
3889			fn->file_attributes = file_attributes;
3890			modified = TRUE;
3891		}
3892		if (dirty_times && (fn->creation_time != creation_time ||
3893				fn->last_data_change_time !=
3894				last_data_change_time ||
3895				fn->last_mft_change_time !=
3896				last_mft_change_time ||
3897				fn->last_access_time != last_access_time)) {
3898			fn->creation_time = creation_time;
3899			fn->last_data_change_time = last_data_change_time;
3900			fn->last_mft_change_time = last_mft_change_time;
3901			fn->last_access_time = last_access_time;
3902			modified = TRUE;
3903		}
3904		if (dirty_sizes && (fn->allocated_size != allocated_size ||
3905				fn->data_size != data_size)) {
3906			fn->allocated_size = allocated_size;
3907			fn->data_size = data_size;
3908			modified = TRUE;
3909		}
3910		/*
3911		 * If we changed anything, ensure the updates are written to
3912		 * disk.
3913		 */
3914		if (modified)
3915			ntfs_index_entry_mark_dirty(ictx);
3916put_skip_name:
3917		ntfs_index_ctx_put_reuse(ictx);
3918skip_name:
3919		SLIST_REMOVE_HEAD(&fn_list, list_entry);
3920		OSFree(next, next->alloc, ntfs_malloc_tag);
3921	}
3922	if (dir_ni) {
3923		lck_rw_unlock_exclusive(&dir_ia_ni->lock);
3924		lck_rw_unlock_exclusive(&dir_ni->lock);
3925		(void)vnode_put(dir_ia_ni->vn);
3926		(void)vnode_put(dir_ni->vn);
3927	}
3928	ntfs_index_ctx_free(ictx);
3929done:
3930	ntfs_debug("Done.");
3931	return 0;
3932list_err:
3933	/* Free all the copied filenames. */
3934	while (!SLIST_EMPTY(&fn_list)) {
3935		next = SLIST_FIRST(&fn_list);
3936		SLIST_REMOVE_HEAD(&fn_list, list_entry);
3937		OSFree(next, next->alloc, ntfs_malloc_tag);
3938	}
3939	if (ictx)
3940		ntfs_index_ctx_free(ictx);
3941err:
3942	if (actx)
3943		ntfs_attr_search_ctx_put(actx);
3944	if (m) {
3945		ntfs_mft_record_unmap(ni);
3946		lck_rw_unlock_shared(&ni->lock);
3947	}
3948	if (ignore_errors || err == ENOMEM) {
3949		ntfs_debug("Failed to sync ntfs inode.  Marking it dirty "
3950				"again, so that we try again later.");
3951		if (dirty_times)
3952			NInoSetDirtyTimes(ni);
3953		if (dirty_file_attributes)
3954			NInoSetDirtyFileAttributes(ni);
3955		if (dirty_sizes)
3956			NInoSetDirtySizes(ni);
3957		if (dirty_set_file_bits)
3958			NInoSetDirtySetFileBits(ni);
3959		if (ignore_errors)
3960			err = 0;
3961	} else {
3962		NVolSetErrors(vol);
3963		ntfs_error(vol->mp, "Failed (error %d).  Run chkdsk.", err);
3964	}
3965	return err;
3966}
3967
3968/**
3969 * ntfs_inode_sync - synchronize an inode's in-core state with that on disk
3970 * @ni:				ntfs inode to synchronize to disk
3971 * @ioflags:			flags describing the i/o request
3972 * @skip_mft_record_sync:	do not sync the mft record(s) to disk
3973 *
3974 * Write all dirty cached data belonging/related to the ntfs inode @ni to disk.
3975 *
3976 * If @ioflags has the IO_SYNC bit set, wait for all i/o to complete before
3977 * returning.
3978 *
3979 * If @ioflags has the IO_CLOSE bit set, this signals cluster_push() that the
3980 * i/o is issued from the close path (or in our case more precisely from the
3981 * VNOP_INACTIVE() path).
3982 *
 * Note: When called from reclaim (via VNOP_FSYNC() and hence
 *	 ntfs_vnop_fsync()), the vnode has a zero v_iocount and v_usecount and
 *	 vnode_isrecycled() is true.  Thus we cannot obtain any attribute/raw
 *	 inodes inside ntfs_inode_sync() or the vnode_ref() on the base vnode
 *	 that is done as part of getting an attribute/raw inode causes a
 *	 panic() to trigger as both the iocount and usecount are zero on the
 *	 base vnode.
3989 *
3990 * Return 0 on success and the error code on error.
3991 *
3992 * Locking: @ni->lock must be unlocked.
3993 *
3994 * TODO:/FIXME: For directory vnodes this currently does not sync much.  We
3995 * really need to sync the index allocation vnode and the bitmap vnode for
3996 * directories.
3997 *
3998 * TODO:/FIXME: For symbolic link vnodes this currently does not sync much.  We
3999 * really need to sync the raw vnode for symbolic links.
4000 *
4001 * TODO:/FIXME: At present we do not sync the AFP_AfpInfo named stream inode
4002 * when syncing the base inode.
4003 *
4004 * TODO:/FIXME: In general when a vnode is being synced we should ensure that
4005 * all associated (loaded) vnodes are synced also, i.e. not just the extent
4006 * inodes but also all the attribute/index inodes as well and once that is all
4007 * done we should cause all associated mft records to be synced.
4008 *
4009 * Theory of operation:
4010 *
4011 * Only base inodes (i.e. real files/unnamed $DATA/S_ISREG(ni->mode) and
 * directories/S_ISDIR(ni->mode)) need to have their information synced with the
4013 * standard information attribute and hence with all directory entries pointing
4014 * to those inodes also.
4015 *
4016 * Further, all changes to the ntfs inode structure @ni happen exclusively
4017 * through the ntfs driver thus we can mark @ni dirty on any modification and
4018 * then we can check and clear the flag here and only if it was set do we need
4019 * to go through and check what needs to be updated and to update it in the
4020 * standard information attribute (and all directory entries pointing to the
4021 * inode).
4022 *
4023 * However, changes to the contents of an attribute, can happen to all
4024 * attributes, i.e. base inodes, attribute inodes, and index inodes all alike.
4025 * Further changes to the contents can happen both under control of the ntfs
4026 * driver and outside its control via mmap() based writes for example.  Thus we
4027 * have no mechanism for determining whether file data is dirty or not and thus
4028 * we have to unconditionally perform an msync() on the entire file data.
4029 *
4030 * The msync() can in turn cause the mft record containing the attribute to be
4031 * dirtied, for example because the attribute is resident and the msync()
4032 * caused the data to go from the VM page cache into the mft record thus
4033 * dirtying it.
4034 *
4035 * So we at the end need to sync all mft records associated with the attribute.
4036 * Once again, the only way mft records are modified is through the ntfs driver
4037 * so we could set a flag each time we modify an mft record and check it and
4038 * only write if it is set.  However we do not do this as such flags would
4039 * invariably be out of date with reality because the mft records are stored as
4040 * the contents of the system file $MFT (S_ISREG()) which we access using
4041 * buf_meta_bread() and buf_bdwrite(), etc, thus they are governed by the
4042 * buffer layer and their dirtyness is tracked at a buffer (i.e. per mft
4043 * record) level by the buffer layer.  And the buffer layer can cause a buffer
4044 * to be written out without the ntfs driver having an easy means to go and
4045 * clear the putative dirty bit in the ntfs inode @ni.  Thus we do not use a
4046 * dirty flag for the mft records and instead buf_getblk() all cached buffers
4047 * containing loaded mft records belonging to the base ntfs inode of @ni and
4048 * for the ones that are dirty we cause them to be written out by calling
4049 * buf_bwrite().  We determine which mft records are loaded by iterating
4050 * through the @extent_nis array of the base ntfs inode of @ni.  This will skip
4051 * any mft records that are dirty but have been freed/deallocated from the
4052 * inode but this is irrelevant as for all intents and purposes they no longer
4053 * belong to the inode @ni.  They will still be synced to disk when the $MFT
4054 * inode is synced or the buffer layer pushes the dirty buffer containing the
4055 * freed mft record to disk.
4056 *
4057 * As a speed optimization when ntfs_inode_sync() is called from VFS_SYNC() and
4058 * thus from ntfs_sync(), we do not sync the mft records at all as ntfs_sync()
4059 * will as the last thing call ntfs_inode_sync() for $MFT itself and then all
4060 * dirty mft records can be synced in one single go via a single
4061 * buf_flushdirtyblks() on the entire data content of $MFT.  This massively
4062 * reduces disk head seeking and nicely streamlines and batches writes to the
4063 * $MFT.
4064 */
4065errno_t ntfs_inode_sync(ntfs_inode *ni, const int ioflags,
4066		const BOOL skip_mft_record_sync)
4067{
4068	ntfs_inode *base_ni;
4069	errno_t err;
4070
4071	ntfs_debug("Entering for %sinode 0x%llx, %ssync i/o, ioflags 0x%04x.",
4072			NInoAttr(ni) ? "attr " : "",
4073			(unsigned long long)ni->mft_no,
4074			(ioflags & IO_SYNC) ? "a" : "", ioflags);
4075	base_ni = ni;
4076	if (NInoAttr(ni)) {
4077		base_ni = ni->base_ni;
4078		if (ni != base_ni)
4079			lck_rw_lock_shared(&base_ni->lock);
4080	}
4081	/* Do not allow messing with the inode once it has been deleted. */
4082	lck_rw_lock_shared(&ni->lock);
4083	if (NInoDeleted(ni)) {
4084		/* Remove the inode from the name cache. */
4085		cache_purge(ni->vn);
4086		lck_rw_unlock_shared(&ni->lock);
4087		if (ni != base_ni)
4088			lck_rw_unlock_shared(&base_ni->lock);
4089		ntfs_debug("Inode is deleted.");
4090		return ENOENT;
4091	}
4092	/*
4093	 * This cannot happen as the attribute/raw inode holds a reference on
4094	 * the vnode of its base inode.
4095	 */
4096	if (ni != base_ni && NInoDeleted(base_ni))
4097		panic("%s(): Called for attribute inode whose base inode is "
4098				"NInoDeleted()!\n", __FUNCTION__);
4099	lck_rw_unlock_shared(&ni->lock);
4100	if (ni != base_ni)
4101		lck_rw_unlock_shared(&base_ni->lock);
4102	/*
4103	 * First of all, flush any dirty data.  This is done for all attribute
4104	 * inodes as well as for regular file base inodes.
4105	 * There is no need to do it for directory inodes, symbolic links
4106	 * fifos, sockets, or block and character device special files as they
4107	 * do not contain any data.
4108	 * We actually check for the vnode type being VREG as that is the case
4109	 * for all attribute inodes as well as for all regular files.
4110	 *
4111	 * Further, we do not yet support writing data for non-resident
4112	 * encrypted/compressed attributes so silently skip those here.  We do
4113	 * not want to fail completely because we want to allow access times
4114	 * and other flags/attributes to be updated.
4115	 */
4116	if (vnode_vtype(ni->vn) == VREG && (!NInoNonResident(ni) ||
4117			ni->type == AT_INDEX_ALLOCATION || NInoRaw(ni) ||
4118			(!NInoEncrypted(ni) && !NInoCompressed(ni)))) {
4119		err = ntfs_inode_data_sync(ni, ioflags);
4120		if (err)
4121			return err;
4122	}
4123	/*
4124	 * If this is a base inode and it contains any dirty fields that have
4125	 * not been synced to the standard information attribute in the mft
4126	 * record yet, update the standard information attribute and update all
4127	 * directory entries pointing to the inode if any affected fields were
4128	 * modified.
4129	 */
4130	if (ni == base_ni && NInoDirty(ni)) {
4131		err = ntfs_inode_sync_to_mft_record(ni);
4132		if (err)
4133			return err;
4134	}
4135	/*
4136	 * If we are called from ntfs_sync() we want to skip writing the mft
4137	 * records as that will happen at the end of the ntfs_sync() call.
4138	 */
4139	if (skip_mft_record_sync) {
4140		ntfs_debug("Done (skipped mft record(s) sync).");
4141		return 0;
4142	}
4143	/*
4144	 * If this inode does not have an attribute list attribute there is
4145	 * only one mft record associated with the inode thus we can write it
4146	 * now if it is dirty and we are finished if not.
4147	 *
4148	 * If the inode does have an attribute list attribute then we need to
4149	 * go through all loaded mft records, starting with the base inode and
4150	 * looking at all its attached extent inodes and we need to write the
4151	 * ones that have dirty mft records out one by one.
4152	 */
4153	err = ntfs_mft_record_sync(base_ni);
4154	if (NInoAttrList(base_ni)) {
4155		int nr_extents;
4156
4157		lck_mtx_lock(&base_ni->extent_lock);
4158		nr_extents = base_ni->nr_extents;
4159		if (nr_extents > 0) {
4160			ntfs_inode **extent_nis = base_ni->extent_nis;
4161			errno_t err2;
4162			int i;
4163
4164			ntfs_debug("Syncing %d extent inodes.", nr_extents);
4165			for (i = 0; i < nr_extents; i++) {
4166				err2 = ntfs_mft_record_sync(extent_nis[i]);
4167				if (err2 && (!err || err == ENOMEM))
4168					err = err2;
4169			}
4170		}
4171		lck_mtx_unlock(&base_ni->extent_lock);
4172	}
4173	if (!err) {
4174		ntfs_debug("Done.");
4175		return 0;
4176	}
4177	if (err == ENOMEM)
4178		ntfs_warning(ni->vol->mp, "Not enough memory to sync inode.");
4179	else {
4180		NVolSetErrors(ni->vol);
4181		ntfs_error(ni->vol->mp, "Failed to sync mft_no 0x%llx (error "
4182				"%d).  Run chkdsk.",
4183				(unsigned long long)ni->mft_no, err);
4184	}
4185	return err;
4186}
4187
4188/**
4189 * ntfs_inode_get_name_and_parent_mref - get the name and parent mft reference
4190 * @ni:			ntfs inode whose name and parent mft reference to find
4191 * @have_parent:	true if @parent_mref already contains an mft reference
4192 * @parent_mref:	destination to return the parent mft reference in
4193 * @name:		destination to return the name in or NULL
4194 *
4195 * If @have_parent is false, look up the first, non-DOS filename attribute in
4196 * the mft record(s) of the ntfs inode @ni and return the name contained in the
4197 * filename attribute in @name as well as the parent mft reference contained in
4198 * the filename attribute in *@parent_mref.  If @name is NULL the name is not
4199 * returned.
4200 *
4201 * If @name is NULL, check if there is a name cached in the vnode of the inode
4202 * @ni, and if so, look for the filename attribute matching this name and if
4203 * one is found, return its parent id.  If one is not found return the first,
4204 * non-DOS filename attribute as described above.  Note there is no point in
4205 * doing this unless the link count of the inode @ni is larger than one as
4206 * otherwise there is only one filename attribute and thus we do not need to
 * bother doing any comparisons as it is the only name we can return thus we
4208 * return it.
4209 *
4210 * If @have_parent is true, iterate over the filename attributes in the mft
4211 * record until we find the one matching the parent mft reference @parent_mref
4212 * and return the corresponding name in @name.  If such a name is not found,
4213 * revert to the previous case where @have_parent is false, i.e. return the
4214 * first, non-DOS filename in @name and the corresponding parent mft reference
4215 * in *@parent_mref.  Note that as above in the @name is NULL case, there is no
4216 * point in doing this unless the link count of the inode @ni is larger than
4217 * one as otherwise there is only one filename attribute and thus we return it.
4218 *
4219 * If @have_parent is true @name must not be NULL as it makes no sense to look
4220 * only for the parent mft reference when the caller already has it.
4221 *
4222 * Return 0 on success and the error code on error.
4223 */
4224errno_t ntfs_inode_get_name_and_parent_mref(ntfs_inode *ni, BOOL have_parent,
4225		MFT_REF *parent_mref, const char *name)
4226{
4227	MFT_REF mref;
4228	ntfs_inode *base_ni;
4229	ntfschar *ntfs_name;
4230	MFT_RECORD *m;
4231	ntfs_attr_search_ctx *ctx;
4232	ATTR_RECORD *a;
4233	FILENAME_ATTR *fn;
4234	size_t name_size;
4235	unsigned link_count = ni->link_count;
4236	signed res_size = 0;
4237	errno_t err;
4238	BOOL name_present;
4239	ntfschar ntfs_name_buf[link_count > 1 ? NTFS_MAX_NAME_LEN : 0];
4240
4241	ntfs_debug("Entering for mft_no 0x%llx.",
4242			(unsigned long long)ni->mft_no);
4243	if (have_parent && !name)
4244		panic("%s(): have_parent && !name\n", __FUNCTION__);
4245	/*
4246	 * As explained above do not bother doing anything fancy unless the
4247	 * link count of the inode @ni is greater than one.
4248	 */
4249	ntfs_name = NULL;
4250	if (link_count > 1) {
4251		if (!name) {
4252			const char *vn_name;
4253
4254			vn_name = vnode_getname(ni->vn);
4255			if (vn_name) {
4256				/* Convert the name from utf8 to Unicode. */
4257				ntfs_name = ntfs_name_buf;
4258				name_size = sizeof(ntfs_name_buf);
4259				res_size = utf8_to_ntfs(ni->vol, (u8*)vn_name,
4260						strlen(vn_name), &ntfs_name,
4261						&name_size);
4262				(void)vnode_putname(vn_name);
4263				/*
4264				 * If we failed to convert the name, warn the
4265				 * user about it and then continue execution
4266				 * pretending that there is no cached name,
4267				 * i.e. ignoring the potentially corrupt name.
4268				 */
4269				if (res_size < 0) {
4270					ntfs_warning(ni->vol->mp, "Failed to "
4271							"convert cached name "
4272							"to Unicode (error "
4273							"%d).  This may "
4274							"indicate "
4275							"corruption.  You "
4276							"should unmount and "
4277							"run chkdsk.",
4278							-res_size);
4279					NVolSetErrors(ni->vol);
4280					ntfs_name = NULL;
4281				}
4282			}
4283		}
4284	} else
4285		have_parent = FALSE;
4286	base_ni = ni;
4287	if (NInoAttr(ni))
4288		base_ni = ni->base_ni;
4289	if (!link_count || (ni != base_ni && !base_ni->link_count))
4290		goto deleted;
4291	/* Map the mft record. */
4292	err = ntfs_mft_record_map(base_ni, &m);
4293	if (err) {
4294		ntfs_error(ni->vol->mp, "Failed to map mft record (error %d).",
4295				err);
4296		return err;
4297	}
4298	/* Verify the mft record has not been deleted. */
4299	if (!(m->flags & MFT_RECORD_IN_USE))
4300		goto unm_deleted;
4301	/* Find the first filename attribute in the mft record. */
4302	ctx = ntfs_attr_search_ctx_get(base_ni, m);
4303	if (!ctx) {
4304		ntfs_error(ni->vol->mp, "Failed to allocate search context "
4305				"(error %d).", err);
4306		err = ENOMEM;
4307		goto err;
4308	}
4309	name_present = FALSE;
4310try_next:
4311	err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, ctx);
4312	if (err) {
4313		if (err == ENOENT && name_present) {
4314			have_parent = name_present = FALSE;
4315			ntfs_name = NULL;
4316			ntfs_attr_search_ctx_reinit(ctx);
4317			goto try_next;
4318		}
4319		ntfs_error(ni->vol->mp, "Failed to find a valid filename "
4320				"attribute (error %d).", err);
4321		goto put_err;
4322	}
4323	a = ctx->a;
4324	fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
4325	/* If the filename attribute is invalid/corrupt abort. */
4326	if (a->non_resident || (u8*)fn + le32_to_cpu(a->value_length) >
4327			(u8*)a + le32_to_cpu(a->length)) {
4328		ntfs_error(ni->vol->mp, "Found corrupt filename attribute in "
4329				"mft_no 0x%llx.  Unmount and run chkdsk.",
4330				(unsigned long long)ni->mft_no);
4331		NVolSetErrors(ni->vol);
4332		err = EIO;
4333		goto put_err;
4334	}
4335	/*
4336	 * Do not return the DOS name.  If it exists there must also be a
4337	 * matching WIN32 name or the inode is corrupt.
4338	 */
4339	if (fn->filename_type == FILENAME_DOS)
4340		goto try_next;
4341	mref = le64_to_cpu(fn->parent_directory);
4342	/*
4343	 * If we have a cached name, check if the current filename attribute
4344	 * matches this name and if not try the next name.
4345	 *
4346	 * We can do a case sensitive comparison because we only ever cache
4347	 * correctly cased names in the vnode.
4348	 */
4349	if (ntfs_name && (res_size != fn->filename_length ||
4350			bcmp(ntfs_name, fn->filename, res_size))) {
4351		name_present = TRUE;
4352		goto try_next;
4353	}
4354	/*
4355	 * If we already have a parent mft reference and the current filename
4356	 * attribute has a different parent mft reference try the next name.
4357	 *
4358	 * Note we have to only compare the sequence number if one is passed to
4359	 * us in *@parent_mref, i.e. if MSEQNO(*@parent_mref) is not zero.
4360	 */
4361	if (have_parent && (MREF(*parent_mref) != MREF(mref) ||
4362			(MSEQNO(*parent_mref) &&
4363			MSEQNO(*parent_mref) != MSEQNO(mref)))) {
4364		name_present = TRUE;
4365		goto try_next;
4366	}
4367	/*
4368	 * If we are looking for the name, convert it from NTFS Unicode to
4369	 * UTF-8 OS X string format and save it in @name.
4370	 */
4371	if (name) {
4372		name_size = MAXPATHLEN;
4373		res_size = ntfs_to_utf8(ni->vol, (ntfschar*)&fn->filename,
4374				fn->filename_length << NTFSCHAR_SIZE_SHIFT,
4375				(u8**)&name, &name_size);
4376		if (res_size < 0) {
4377			ntfs_warning(ni->vol->mp, "Failed to convert name of "
4378					"mft_no 0x%llx to UTF8 (error %d).",
4379					(unsigned long long)ni->mft_no,
4380					-res_size);
4381			goto try_next;
4382		}
4383	}
4384	/* Get the inode number of the parent directory into *@parent_mref. */
4385	*parent_mref = mref;
4386	/*
4387	 * Release the search context and the mft record of the inode as we do
4388	 * not need them any more.
4389	 */
4390	ntfs_attr_search_ctx_put(ctx);
4391	ntfs_mft_record_unmap(base_ni);
4392	if (name)
4393		ntfs_debug("Done (mft_no 0x%llx has parent mft_no 0x%llx and "
4394				"name %.*s).", (unsigned long long)ni->mft_no,
4395				(unsigned long long)MREF(mref), res_size, name);
4396	else
4397		ntfs_debug("Done (mft_no 0x%llx has parent mft_no 0x%llx "
4398				"(name was not requested and was %scached)).",
4399				(unsigned long long)ni->mft_no,
4400				(unsigned long long)MREF(mref),
4401				ntfs_name ? "" : "not ");
4402	return 0;
4403unm_deleted:
4404	ntfs_mft_record_unmap(base_ni);
4405deleted:
4406	ntfs_debug("Inode 0x%llx has been deleted, returning ENOENT.",
4407			(unsigned long long)ni->mft_no);
4408	return ENOENT;
4409put_err:
4410	ntfs_attr_search_ctx_put(ctx);
4411err:
4412	ntfs_mft_record_unmap(base_ni);
4413	return err;
4414}
4415
4416/**
4417 * ntfs_inode_is_parent - test if an inode is a parent of another inode
4418 * @parent_ni:	ntfs inode to check for being a parent of @child_ni
4419 * @child_ni:	ntfs inode to check for being a child of @parent_ni
4420 * @is_parent:	pointer in which to return the result of the test
4421 * @forbid_ni:	ntfs inode that may not be encountered on the path or NULL
4422 *
4423 * Starting with @child_ni, walk up the file system directory tree until the
4424 * root directory of the volume is reached.  Compare the inodes found along the
4425 * way, i.e. the parent inodes of @child_ni, against @parent_ni and if one of
4426 * them matches @parent_ni we know that @parent_ni is indeed a parent of
4427 * @child_ni thus we return true in *@is_parent.  If we reach the root
4428 * directory without matching @parent_ni we know that @parent_ni is definitely
4429 * not a parent of @child_ni thus we return false in *@is_parent.
4430 *
4431 * If @forbid_ni is NULL it is ignored.  If it is not NULL it is an ntfs inode
4432 * which may not be located on the path traversed during the parent lookup.  If
4433 * it is present, return EINVAL.  This is used in ntfs_vnop_rename() where the
4434 * source inode may not be a parent directory of the destination directory or a
4435 * loop would be created if the rename was allowed to continue.
4436 *
4437 * Return 0 on success and the error code on error.  On error *@is_parent is
4438 * not defined.
4439 *
4440 * Locking: - The volume rename lock must be held by the caller to ensure that
4441 *	      the relationship between the inodes cannot change under our feet.
4442 *	    - The caller must hold an iocount reference on both @parent_ni and
4443 *	      @child_ni.
4444 *
4445 * Note both @parent_ni and @child_ni must be directory inodes.
4446 */
4447errno_t ntfs_inode_is_parent(ntfs_inode *parent_ni, ntfs_inode *child_ni,
4448		BOOL *is_parent, ntfs_inode *forbid_ni)
4449{
4450	ntfs_volume *vol;
4451	ntfs_inode *root_ni, *ni;
4452	vnode_t vn, prev_vn;
4453
4454	if (forbid_ni)
4455		ntfs_debug("Entering for parent mft_no 0x%llx, child mft_no "
4456				"0x%llx, and forbidden mft_no 0x%llx.",
4457				(unsigned long long)parent_ni->mft_no,
4458				(unsigned long long)child_ni->mft_no,
4459				(unsigned long long)forbid_ni->mft_no);
4460	else
4461		ntfs_debug("Entering for parent mft_no 0x%llx and child "
4462				"mft_no 0x%llx.",
4463				(unsigned long long)parent_ni->mft_no,
4464				(unsigned long long)child_ni->mft_no);
4465	vol = child_ni->vol;
4466	root_ni = vol->root_ni;
4467	ni = child_ni;
4468	prev_vn = NULL;
4469	vn = child_ni->vn;
4470	/*
4471	 * Iterate over the parent inodes until we reach the root directory
4472	 * inode @root_ni of the volume.
4473	 */
4474	while (ni != root_ni) {
4475		if (ni == forbid_ni) {
4476			ntfs_debug("Forbidden mft_no 0x%llx is a parent of "
4477					"child mft_no 0x%llx.  Returning "
4478					"EINVAL.",
4479					(unsigned long long)forbid_ni->mft_no,
4480					(unsigned long long)child_ni->mft_no);
4481			if (prev_vn) {
4482				lck_rw_unlock_shared(&ni->lock);
4483				(void)vnode_put(prev_vn);
4484			}
4485			return EINVAL;
4486		}
4487		/*
4488		 * Try to find the parent vnode of the current inode in the
4489		 * current vnode and if it is not present try to get it by hand
4490		 * by looking up the filename attribute in the mft record of
4491		 * the inode.
4492		 */
4493		vn = vnode_getparent(vn);
4494		if (vn) {
4495			if (prev_vn) {
4496				lck_rw_unlock_shared(&ni->lock);
4497				(void)vnode_put(prev_vn);
4498			}
4499			ni = NTFS_I(vn);
4500			lck_rw_lock_shared(&ni->lock);
4501			if (NInoDeleted(ni))
4502				panic("%s(): vnode_getparent() returned "
4503						"NInoDeleted() inode!\n",
4504						__FUNCTION__);
4505			/* Check the inode has not been deleted. */
4506			if (!ni->link_count)
4507				goto deleted;
4508		} else {
4509			MFT_REF mref;
4510			s64 mft_no;
4511			errno_t err;
4512			u16 seq_no;
4513
4514			/*
4515			 * The vnode of the parent is not attached to the vnode
4516			 * of the current inode thus find the parent mft
4517			 * reference by hand.
4518			 */
4519			err = ntfs_inode_get_name_and_parent_mref(ni, FALSE,
4520					&mref, NULL);
4521			mft_no = ni->mft_no;
4522			if (prev_vn) {
4523				lck_rw_unlock_shared(&ni->lock);
4524				(void)vnode_put(prev_vn);
4525			}
4526			if (err) {
4527				ntfs_error(vol->mp, "Failed to determine "
4528						"parent mft reference of "
4529						"mft_no 0x%llx (error %d).",
4530						(unsigned long long)mft_no,
4531						err);
4532				return err;
4533			}
4534			/* Get the inode with mft reference @mref. */
4535			err = ntfs_inode_get(vol, MREF(mref), FALSE,
4536					LCK_RW_TYPE_SHARED, &ni, NULL, NULL);
4537			if (err) {
4538				ntfs_error(vol->mp, "Failed to obtain parent "
4539						"mft_no 0x%llx of mft_no "
4540						"0x%llx (error %d).",
4541						(unsigned long long)MREF(mref),
4542						(unsigned long long)mft_no,
4543						err);
4544				return err;
4545			}
4546			vn = ni->vn;
4547			/* Check the inode has not been deleted and reused. */
4548			seq_no = MSEQNO(mref);
4549			if (seq_no && seq_no != ni->seq_no)
4550				goto deleted;
4551		}
4552		/*
4553		 * We found the parent inode.  If it equals @parent_ni it means
4554		 * that our test is successful and @parent_ni is indeed a
4555		 * parent directory of @child_ni thus set *@is_parent to true
4556		 * and return success.
4557		 */
4558		if (ni == parent_ni) {
4559			lck_rw_unlock_shared(&ni->lock);
4560			(void)vnode_put(ni->vn);
4561			*is_parent = TRUE;
4562			ntfs_debug("Parent mft_no 0x%llx is a parent of "
4563					"child mft_no 0x%llx.",
4564					(unsigned long long)parent_ni->mft_no,
4565					(unsigned long long)child_ni->mft_no);
4566			return 0;
4567		}
4568		prev_vn = vn;
4569	}
4570	if (prev_vn) {
4571		lck_rw_unlock_shared(&ni->lock);
4572		(void)vnode_put(prev_vn);
4573	}
4574	/*
4575	 * We reached the root directory of the volume without encountering
4576	 * @parent_ni thus it is not a parent of @child_ni so set *@is_parent
4577	 * to false and return success.
4578	 */
4579	*is_parent = FALSE;
4580	ntfs_debug("Parent mft_no 0x%llx is not a parent of child mft_no "
4581			"0x%llx.", (unsigned long long)parent_ni->mft_no,
4582			(unsigned long long)child_ni->mft_no);
4583	return 0;
4584deleted:
4585	ntfs_error(ni->vol->mp, "Parent mft_no 0x%llx has been deleted.  "
4586			"Returning ENOENT.", (unsigned long long)ni->mft_no);
4587	lck_rw_unlock_shared(&ni->lock);
4588	(void)vnode_put(vn);
4589	return ENOENT;
4590}
4591