1/*
2 * ntfs_vnops.c - NTFS kernel vnode operations.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/attr.h>
39#include <sys/buf.h>
40#include <sys/errno.h>
41#include <sys/param.h>
42#include <sys/stat.h>
43#include <sys/syslimits.h>
44#include <sys/time.h>
45#include <sys/ubc.h>
46#include <sys/ucred.h>
47#include <sys/uio.h>
48#include <sys/unistd.h>
49#include <sys/vnode.h>
50#include <sys/vnode_if.h>
51#include <sys/xattr.h>
52
53#include <string.h>
54
55#include <mach/kern_return.h>
56#include <mach/memory_object_types.h>
57
58#include <kern/debug.h>
59#include <kern/locks.h>
60
61#include <vfs/vfs_support.h>
62
63#include "ntfs.h"
64#include "ntfs_attr.h"
65#include "ntfs_bitmap.h"
66#include "ntfs_compress.h"
67#include "ntfs_debug.h"
68#include "ntfs_dir.h"
69#include "ntfs_endian.h"
70#include "ntfs_hash.h"
71#include "ntfs_inode.h"
72#include "ntfs_layout.h"
73#include "ntfs_lcnalloc.h"
74#include "ntfs_mft.h"
75#include "ntfs_mst.h"
76#include "ntfs_page.h"
77#include "ntfs_sfm.h"
78#include "ntfs_time.h"
79#include "ntfs_unistr.h"
80#include "ntfs_vnops.h"
81#include "ntfs_volume.h"
82
83/* Global ntfs vnode operations. */
84vnop_t **ntfs_vnodeop_p;
85
86/**
87 * ntfs_cluster_iodone - complete i/o on a memory region
88 * @cbp:	cluster head buffer for which i/o is being completed
89 * @arg:	callback argument, we do not use it at present
90 *
91 * In the read case:
92 *
93 * For an mst protected attribute we do the post read mst deprotection and for
94 * an encrypted attribute we do the decryption (not supported at present).
95 * Note we ignore mst fixup errors as those are detected when
96 * ntfs_mft_record_map() is called later which gives us per record granularity.
97 *
98 * In the write case:
99 *
100 * For an mst protected attribute we do the post write mst deprotection.
101 * Writing to encrypted attributes is not supported at present.
102 *
103 * Return 0 on success and errno on error.
104 */
105int ntfs_cluster_iodone(buf_t cbp, void *arg __unused)
106{
107	long size;
108	ntfs_inode *ni;
109	u8 *kend, *kaddr;
110	errno_t err, err2;
111	BOOL is_read = buf_flags(cbp) & B_READ;
112
113	ni = NTFS_I(buf_vnode(cbp));
114	size = buf_count(cbp);
115	if (size & (ni->block_size - 1))
116		panic("%s(): Called with size not a multiple of the inode "
117				"block size.\n", __FUNCTION__);
118	err = buf_map(cbp, (caddr_t*)&kaddr);
119	if (err) {
120		ntfs_error(ni->vol->mp, "Failed to map buffer (error %d).",
121				err);
122		goto err;
123	}
124	kend = kaddr + size;
125	if (NInoMstProtected(ni)) {
126		s64 ofs, data_size, init_size;
127		u32 rec_size = ni->block_size;
128		NTFS_RECORD_TYPE magic = 0;
129
130		if (!is_read) {
131			if (ni->type == AT_INDEX_ALLOCATION)
132				magic = magic_INDX;
133			else
134				panic("%s(): Unknown mst protected inode "
135						"0x%llx, type 0x%x, name_len "
136						"0x%x.", __FUNCTION__,
137						(unsigned long long)ni->mft_no,
138						(unsigned)le32_to_cpu(ni->type),
139						(unsigned)ni->name_len);
140		}
141		/* The offset in the attribute at which this buffer begins. */
142		ofs = (s64)buf_lblkno(cbp) << PAGE_SHIFT;
143		lck_spin_lock(&ni->size_lock);
144		data_size = ni->data_size;
145		init_size = ni->initialized_size;
146		lck_spin_unlock(&ni->size_lock);
147		/*
148		 * Limit mst deprotection to the initialized size as beyond
149		 * that the data is zero and deprotection will fail.  And worse
150		 * in the write case it will lead to a kernel panic.
151		 */
152		if (ofs + size > init_size) {
153			if (ofs > data_size) {
154				ntfs_error(ni->vol->mp, "Buffer begins past "
155						"the end of the data of the "
156						"attribute (mft_no 0x%llx).",
157						(unsigned long long)ni->mft_no);
158				err = EINVAL;
159				goto unm_err;
160			}
161			if (ofs > init_size) {
162				ntfs_debug("Buffer begins past the end of the "
163						"initialized data of the "
164						"attribute (mft_no 0x%llx).",
165						(unsigned long long)ni->mft_no);
166				goto unm_err;
167			}
168			size = init_size - ofs;
169			kend = kaddr + size;
170		}
171		/*
172		 * Do the mst deprotection ignoring errors and make sure we do
173		 * not go past the initialized size should an error somehow
174		 * have caused the last record to straddle the initialized
175		 * size.
176		 */
177		while (kaddr + rec_size <= kend) {
178			if (is_read)
179				(void)ntfs_mst_fixup_post_read(
180						(NTFS_RECORD*)kaddr, rec_size);
181			else if (__ntfs_is_magic(((NTFS_RECORD*)kaddr)->magic,
182					magic))
183				ntfs_mst_fixup_post_write((NTFS_RECORD*)kaddr);
184			kaddr += rec_size;
185		}
186	} else if (NInoEncrypted(ni)) {
187		// TODO: Need to decrypt the encrypted sectors here.  This
188		// cannot happen at present as we deny opening/reading/writing/
189		// paging encrypted vnodes.
190		panic("%s(): Called for encrypted vnode.\n", __FUNCTION__);
191	} else
192		panic("%s(): Called for normal vnode.\n", __FUNCTION__);
193unm_err:
194	err2 = buf_unmap(cbp);
195	if (err2) {
196		if (!err)
197			err = err2;
198		ntfs_error(ni->vol->mp, "Failed to unmap buffer (error %d).",
199				err2);
200	}
201err:
202	return err;
203}
204
205/**
206 * ntfs_buf_iodone - remove the MST fixups when i/o is complete on a buffer
207 * @buf:	buffer for which to remove the MST fixups
208 * @arg:	unused, always NULL
209 *
210 * ntfs_buf_iodone() is an i/o completion handler which is called when i/o is
211 * completed on a buffer belonging to $MFT/$DATA.  It removes the MST fixups
212 * and returns after which the buffer busy state (BL_BUSY flag) is cleared and
213 * others can access the buffer again.
214 *
215 * ntfs_buf_iodone() is called both when the i/o was successful and when it
216 * failed thus we have to deal with that as appropriate.
217 *
218 * Note that ntfs_buf_iodone() is called deep from within the driver stack and
219 * thus there are limitations on what it is allowed to do.  In particular it is
220 * not allowed to initiate new i/o operations nor to allocate/free memory.
221 *
222 * WARNING: This function can be called whilst an unmount is in progress and
223 * thus it may not look up nor use the ntfs_volume structure to which the inode
224 * belongs.
225 */
226static void ntfs_buf_iodone(buf_t buf, void *arg __unused)
227{
228	s64 ofs, data_size, init_size;
229	vnode_t vn;
230	mount_t mp;
231	ntfs_inode *ni;
232	unsigned size, b_flags;
233	errno_t err;
234
235	vn = buf_vnode(buf);
236	mp = vnode_mount(vn);
237	ni = NTFS_I(vn);
238	ntfs_debug("Entering for mft_no 0x%llx, lblkno 0x%llx.",
239			(unsigned long long)ni->mft_no,
240			(unsigned long long)buf_lblkno(buf));
241	if (!NInoMstProtected(ni) || ni->mft_no || NInoAttr(ni))
242		panic("%s(): Called not for $MFT!\n", __FUNCTION__);
243	/* The size and offset in the attribute at which this buffer begins. */
244	size = buf_count(buf);
245	if (size != ni->block_size)
246		panic("%s(): size != ni->block_size\n", __FUNCTION__);
247	ofs = (s64)buf_lblkno(buf) << ni->block_size_shift;
248	lck_spin_lock(&ni->size_lock);
249	data_size = ni->data_size;
250	init_size = ni->initialized_size;
251	lck_spin_unlock(&ni->size_lock);
252	b_flags = buf_flags(buf);
253	/*
254	 * Limit mst deprotection to the initialized size as beyond that the
255	 * data is zero and deprotection will fail.  And worse in the write
256	 * case it will lead to a kernel panic.
257	 */
258	if (ofs + size > init_size) {
259		if (ofs > data_size) {
260			ntfs_error(mp, "Buffer begins past the end of the "
261					"data of the attribute (mft_no "
262					"0x%llx).",
263					(unsigned long long)ni->mft_no);
264			err = EINVAL;
265			goto err;
266		}
267		if (ofs > init_size) {
268			ntfs_error(mp, "Buffer begins past the end of the "
269					"initialized data of the attribute "
270					"(mft_no 0x%llx).",
271					(unsigned long long)ni->mft_no);
272			err = EINVAL;
273			goto err;
274		}
275	}
276	/*
277	 * Do not try to remove the fixups if a read failed as there will be
278	 * nothing to remove.
279	 */
280	if (!buf_error(buf) || !(b_flags & B_READ)) {
281		NTFS_RECORD *rec;
282
283		err = buf_map(buf, (caddr_t*)&rec);
284		if (err) {
285			ntfs_error(mp, "Failed to map buffer (error %d).",
286					err);
287			goto err;
288		}
289		if (b_flags & B_READ) {
290			err = ntfs_mst_fixup_post_read(rec, size);
291			if (err) {
292				ntfs_error(mp, "Multi sector transfer error "
293						"detected in mft_no 0x%llx "
294						"(error %d).  Run chkdsk",
295						(unsigned long long)ni->mft_no,
296						err);
297				buf_seterror(buf, err);
298			}
299		} else
300			ntfs_mst_fixup_post_write(rec);
301		err = buf_unmap(buf);
302		if (err) {
303			ntfs_error(mp, "Failed to unmap buffer (error %d).",
304					err);
305			goto err;
306		}
307	}
308	ntfs_debug("Done.");
309	return;
310err:
311	if (!buf_error(buf))
312		buf_seterror(buf, err);
313	ntfs_debug("Failed.");
314	return;
315}
316
317/**
318 * ntfs_vnop_strategy - prepare and issue the i/o described by a buffer
319 * @a:		arguments to strategy function
320 *
321 * @a contains:
322 *	buf_t a_bp;	buffer for which to prepare and issue the i/o
323 *
324 * Prepare and issue the i/o described by the buffer @a->a_bp.  Adapted from
325 * buf_strategy().
326 *
327 * In NTFS, we only ever get called for buffers which have a page list
328 * attached.  The page list is mapped and the address of the mapping is stored
329 * in (u8*)buf_dataptr(@a->a_bp).  The exception to this is i/o for $MFT/$DATA
330 * and $MFTMirr/$DATA which is issued via buf_meta_bread(), etc, and thus does
331 * not involve a page list at all.
332 *
333 * Return 0 on success and errno on error.
334 */
335static int ntfs_vnop_strategy(struct vnop_strategy_args *a)
336{
337	s64 ofs, max_end_io;
338	daddr64_t lblkno;
339	buf_t buf = a->a_bp;
340	vnode_t vn = buf_vnode(buf);
341	ntfs_inode *ni;
342	ntfs_volume *vol;
343	void (*old_iodone)(buf_t, void *);
344	void *old_transact;
345	unsigned b_flags;
346	errno_t err, err2;
347	BOOL do_fixup;
348
349	/* Same checks as in buf_strategy(). */
350	if (!vn || vnode_ischr(vn) || vnode_isblk(vn))
351		panic("%s(): !vn || vnode_ischr(vn) || vnode_isblk(vn)\n",
352				__FUNCTION__);
353	ni = NTFS_I(vn);
354	if (!ni) {
355		err = EIO;
356		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
357		goto err;
358	}
359	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, "
360			"logical block 0x%llx.", (unsigned long long)ni->mft_no,
361			le32_to_cpu(ni->type), (unsigned)ni->name_len,
362			(unsigned long long)buf_lblkno(buf));
363	if (S_ISDIR(ni->mode))
364		panic("%s(): Called for directory vnode.\n", __FUNCTION__);
365	vol = ni->vol;
366	b_flags = buf_flags(buf);
367	/*
368	 * If we are called from cluster_io() then pass the request down to the
369	 * underlying device containing the NTFS volume.  We have no KPI way of
370	 * doing this directly so we invoke buf_strategy() and rely on the fact
371	 * that it does not do anything other than associate the physical
372	 * device with the buffer and then pass the buffer down to the device.
373	 */
374	if (b_flags & B_CLUSTER)
375		goto done;
376	/*
377	 * If this i/o is for $MFTMirr/$DATA send it through straight without
378	 * modifications.  This is because we keep the $MFTMirr/$DATA buffers
379	 * in memory with the fixups applied for simplicity.
380	 */
381	if (ni->mft_no == FILE_MFTMirr && !NInoAttr(ni))
382		goto done;
383	/*
384	 * Except for $MFT/$DATA we never do i/o via file system buffers thus
385	 * we should never get here.
386	 */
387	if (ni->mft_no != FILE_MFT || NInoAttr(ni))
388		panic("%s(): Called for non-cluster i/o buffer.\n",
389				__FUNCTION__);
390	/*
391	 * We are reading/writing $MFT/$DATA.
392	 *
393	 * For reads, i/o is allowed up to the data_size whilst for writes, i/o
394	 * is only allowed up to the initialized_size.
395	 *
396	 * Further when reading past the initialized size we do not need to do
397	 * i/o at all as we can simply clear the buffer and return success.
398	 */
399	lblkno = buf_lblkno(buf);
400	ofs = lblkno << ni->block_size_shift;
401	lck_spin_lock(&ni->size_lock);
402	max_end_io = ni->initialized_size;
403	do_fixup = FALSE;
404	if (b_flags & B_READ) {
405		if (ofs >= max_end_io) {
406			if (max_end_io > ni->data_size)
407				panic("%s() initialized_size > data_size\n",
408						__FUNCTION__);
409			if (ofs < ni->data_size) {
410				lck_spin_unlock(&ni->size_lock);
411				buf_clear(buf);
412				buf_biodone(buf);
413				ntfs_debug("Read past initialized size.  "
414						"Clearing buffer.");
415				return 0;
416			}
417		}
418		max_end_io = ni->data_size;
419		do_fixup = TRUE;
420	}
421	lck_spin_unlock(&ni->size_lock);
422	if (ofs >= max_end_io) {
423		/* I/o is out of range.  This should never happen. */
424		ntfs_error(vol->mp, "Trying to %s buffer for $MFT/$DATA which "
425				"is out of range, aborting.",
426				b_flags & B_READ ? "read" : "write");
427		err = EIO;
428		goto err;
429	}
430	/*
431	 * For writes we need to apply the MST fixups before calling
432	 * buf_strategy() which will perform the i/o and if the write is for an
433	 * mft record that is also in the mft mirror we now need to write it to
434	 * the mft mirror as well.
435	 *
436	 * Note B_WRITE is a pseudo flag and cannot be used for checking thus
437	 * check that B_READ is not set which implies it is a write.
438	 */
439	if (!(b_flags & B_READ)) {
440		NTFS_RECORD *rec;
441		NTFS_RECORD_TYPE magic;
442		BOOL need_mirr_sync;
443
444		err = buf_map(buf, (caddr_t*)&rec);
445		if (err) {
446			ntfs_error(vol->mp, "Failed to map buffer (error %d).",
447					err);
448			goto err;
449		}
450		if (!rec)
451			panic("%s(): buf_map() returned NULL.\n", __FUNCTION__);
452#if 0
453		need_mirr_sync = FALSE;
454		if (ni->type == AT_INDEX_ALLOCATION)
455			magic = magic_INDX;
456		else if (ni == mft_ni || ni == vol->mftmirr_ni) {
457			magic = magic_FILE;
458			if (ni == mft_ni)
459				need_mirr_sync = (lblkno < vol->mftmirr_size);
460		} else
461			panic("%s(): Unknown mst protected inode 0x%llx, type "
462					"0x%x, name_len 0x%x.", __FUNCTION__,
463					(unsigned long long)ni->mft_no,
464					(unsigned)le32_to_cpu(ni->type),
465					(unsigned)ni->name_len);
466#else
467		need_mirr_sync = (lblkno < vol->mftmirr_size);
468		magic = magic_FILE;
469#endif
470		/*
471		 * Only apply fixups if the record has the correct magic.  We
472		 * may have detected a multi sector transfer error and are thus
473		 * now writing a BAAD record in which case we do not want to
474		 * touch its contents.
475		 *
476		 * Further, if there is an error do not sync the record to the
477		 * mft mirror as that may still be intact and we do not want to
478		 * overwrite the correct data with corrupt data.
479		 */
480		if (__ntfs_is_magic(rec->magic, magic)) {
481			err = ntfs_mst_fixup_pre_write(rec, ni->block_size);
482			if (err) {
483				/* The record is corrupt, do not write it. */
484				ntfs_error(vol->mp, "Failed to apply mst "
485						"fixups (mft_no 0x%llx, type "
486						"0x%x, offset 0x%llx).",
487						(unsigned long long)ni->mft_no,
488						(unsigned)le32_to_cpu(ni->type),
489						(unsigned long long)ofs);
490				err = EIO;
491				goto unm_err;
492			}
493			do_fixup = TRUE;
494			if (need_mirr_sync) {
495				/*
496				 * Note we continue despite an error as we may
497				 * succeed to write the actual mft record.
498				 */
499				err = ntfs_mft_mirror_sync(vol, lblkno,
500						(MFT_RECORD*)rec,
501						!(b_flags & B_ASYNC));
502				if (err)
503					ntfs_error(vol->mp, "Failed to sync "
504							"mft mirror (error "
505							"%d).  Run chkdsk.",
506							err);
507			}
508		}
509		err = buf_unmap(buf);
510		if (err)
511			ntfs_error(vol->mp, "Failed to unmap buffer (error "
512					"%d).", err);
513	}
514	/*
515	 * For both reads and writes we need to register our i/o completion
516	 * handler which will be called after i/o is complete (including on i/o
517	 * failure) and in which we will remove the MST fixups so the buffer in
518	 * memory never has MST fixups applied unless it is under i/o in which
519	 * case it is BL_BUSY and thus cannot be accessed by anyone so it is
520	 * safe to have the MST fixups applied whilst i/o is in flight.
521	 */
522	if (do_fixup) {
523		buf_setfilter(buf, ntfs_buf_iodone, NULL, &old_iodone,
524				&old_transact);
525		if (old_iodone || old_transact)
526			panic("%s(): Buffer for $MFT/$DATA already had an i/o "
527					"completion handler assigned!\n",
528					__FUNCTION__);
529	}
530	/*
531	 * Everything is set up.  Pass the i/o onto the buffer layer.
532	 *
533	 * When the i/o is done it will call our i/o completion handler which
534	 * will remove the mst fixups.
535	 */
536done:
537	return buf_strategy(vol->dev_vn, a);
538unm_err:
539	err2 = buf_unmap(buf);
540	if (err2)
541		ntfs_error(vol->mp, "Failed to unmap buffer in error code "
542				"path (error %d).", err2);
543err:
544	buf_seterror(buf, err);
545	buf_biodone(buf);
546	return err;
547}
548
549/**
550 * ntfs_vnop_lookup - find a vnode inside an ntfs directory given its name
551 * @a:		arguments to lookup function
552 *
553 * @a contains:
554 *	vnode_t a_dvp;			directory vnode in which to search
555 *	vnode_t *a_vpp;			destination pointer for the found vnode
556 *	struct componentname *a_cnp;	name to find in the directory vnode
557 *	vfs_context_t a_context;
558 *
559 * In short, ntfs_vnop_lookup() looks for the vnode represented by the name
560 * @a->a_cnp in the directory vnode @a->a_dvp and if found returns the vnode in
561 * *@a->a_vpp.
562 *
563 * Return 0 on success and the error code on error.  A return value of ENOENT
564 * does not signify an error as such but merely the fact that the name
565 * @a->a_cnp is not present in the directory @a->a_dvp.  When the lookup is
566 * done for purposes of create, including for the destination of a rename, we
 * return EJUSTRETURN instead of ENOENT when the name is not found.  This
568 * allows the VFS to proceed with the create/rename.
569 *
570 * To simplify matters for us, we do not treat the DOS and WIN32 filenames as
571 * two hard links but instead if the lookup matches a DOS filename, we return
572 * the corresponding WIN32 filename instead.
573 *
574 * There are three cases we need to distinguish here:
575 *
576 * 1) The name perfectly matches (i.e. including case) a directory entry with a
577 *    filename in the WIN32 or POSIX namespaces.  In this case
578 *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
579 *    just use the name as supplied in @a->a_cnp.
580 * 2) The name matches (not including case) a directory entry with a filename
581 *    in the WIN32 or POSIX namespaces.  In this case
582 *    ntfs_lookup_inode_by_name() will return with name set to point to an
583 *    allocated ntfs_dir_lookup_name structure containing the properly cased
584 *    little endian Unicode name.  We convert the name to decomposed UTF-8 and
585 *    use that name.
586 * 3) The name matches either perfectly or not (i.e. we do not care about case)
587 *    a directory entry with a filename in the DOS namespace.  In this case
588 *    ntfs_lookup_inode_by_name() will return with name set to point to an
589 *    allocated ntfs_dir_lookup_name structure which just tells us that the
590 *    name is in the DOS namespace.  We read the inode and find the filename in
591 *    the WIN32 namespace corresponding to the matched DOS name.  We then
592 *    convert the name to decomposed UTF-8 and use that name to update the
593 *    vnode identity with.
594 */
595static int ntfs_vnop_lookup(struct vnop_lookup_args *a)
596{
597	MFT_REF mref;
598	ino64_t mft_no;
599	unsigned long op;
600	struct componentname *name_cn, *cn;
601	ntfs_inode *ni, *dir_ni = NTFS_I(a->a_dvp);
602	vnode_t vn;
603	ntfs_volume *vol;
604	ntfschar *ntfs_name;
605	ntfs_dir_lookup_name *name = NULL;
606	u8 *utf8_name = NULL;
607	size_t ntfs_name_size, utf8_size;
608	signed ntfs_name_len;
609	int err;
610	/*
611	 * This is rather gross but several other file systems do it so perhaps
612	 * the large stack (16kiB I believe) in the OS X kernel is big enough.
613	 * If we do not want to do the static allocation then simply set
614	 * ntfs_name to NULL and utf8_to_ntfs() will allocate the memory for
615	 * us.  (We then have to free it, see utf8_to_ntfs() description for
616	 * details.)
617	 */
618	ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN];
619	struct componentname cn_buf;
620#ifdef DEBUG
621	static const char *ops[4] = { "LOOKUP", "CREATE", "DELETE", "RENAME" };
622#endif
623
624	if (!dir_ni) {
625		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
626		return EINVAL;
627	}
628	vol = dir_ni->vol;
629	name_cn = cn = a->a_cnp;
630	op = cn->cn_nameiop;
631	ntfs_debug("Looking up %.*s in directory inode 0x%llx for %s, flags "
632			"0x%lx.", (int)cn->cn_namelen, cn->cn_nameptr,
633			(unsigned long long)dir_ni->mft_no,
634			op < 4 ? ops[op] : "UNKNOWN",
635			(unsigned long)cn->cn_flags);
636	/*
637	 * Ensure we are being called for a directory in case we are not being
638	 * called from the VFS.
639	 */
640	if (!S_ISDIR(dir_ni->mode)) {
641		ntfs_error(vol->mp, "Not a directory.");
642		return ENOTDIR;
643	}
644	lck_rw_lock_shared(&dir_ni->lock);
645	/* Do not allow messing with the inode once it has been deleted. */
646	if (NInoDeleted(dir_ni)) {
647		/* Remove the inode from the name cache. */
648		cache_purge(dir_ni->vn);
649		lck_rw_unlock_shared(&dir_ni->lock);
650		ntfs_debug("Parent directory is deleted.");
651		return ENOENT;
652	}
653	/*
654	 * First, look for the name in the name cache.  cache_lookup() returns
655	 * -1 if found and @vn is set to the vnode, ENOENT if found and it is a
656	 * negative entry thus @vn is not set to anything, or 0 if the lookup
657	 * failed in which case we need to do a file system based lookup.
658	 *
659	 * Note that if @op is CREATE and there is a negative entry in the name
660	 * cache cache_lookup() will discard that name and return 0, i.e. the
661	 * lookup failed.  In this case we will automatically fall through and
662	 * do the right thing during the real lookup.
663	 */
664	err = cache_lookup(dir_ni->vn, &vn, cn);
665	if (err) {
666		if (err == -1) {
667			ni = NTFS_I(vn);
668			lck_rw_lock_shared(&ni->lock);
669			/*
670			 * Do not allow messing with the inode once it has been
671			 * deleted.
672			 */
673			if (!NInoDeleted(ni)) {
674				lck_rw_unlock_shared(&ni->lock);
675				lck_rw_unlock_shared(&dir_ni->lock);
676				*a->a_vpp = vn;
677				ntfs_debug("Done (cached).");
678				return 0;
679			}
680			lck_rw_unlock_shared(&ni->lock);
681			/* Remove the inode from the name cache. */
682			cache_purge(vn);
683			vnode_put(vn);
684			ntfs_warning(vol->mp, "Cached but deleted vnode "
685					"found, purged from cache and doing "
686					"real lookup.");
687		} else {
688			lck_rw_unlock_shared(&dir_ni->lock);
689			if (err == ENOENT) {
690				ntfs_debug("Done (cached, negative).");
691				return err;
692			}
693			ntfs_error(vol->mp, "cache_lookup() failed (error "
694					"%d).", err);
695			return err;
696		}
697	}
698	/* We special case "." and ".." as they are emulated on NTFS. */
699	if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.') {
700		/* "." is not cached. */
701		cn->cn_flags &= ~MAKEENTRY;
702		if (op == RENAME) {
703			lck_rw_unlock_shared(&dir_ni->lock);
704			ntfs_debug("Op is RENAME but name is \".\", returning "
705					"EISDIR.");
706			return EISDIR;
707		}
708		err = vnode_get(dir_ni->vn);
709		lck_rw_unlock_shared(&dir_ni->lock);
710		if (err) {
711			ntfs_error(vol->mp, "Failed to get iocount reference "
712					"on current directory (error %d).",
713					err);
714			return err;
715		}
716		ntfs_debug("Got \".\" directory 0x%llx.",
717				(unsigned long long)dir_ni->mft_no);
718		*a->a_vpp = dir_ni->vn;
719		return 0;
720	} else if (cn->cn_flags & ISDOTDOT) {
721		/* ".." is not cached. */
722		cn->cn_flags &= ~MAKEENTRY;
723		vn = vnode_getparent(dir_ni->vn);
724		if (vn) {
725			lck_rw_unlock_shared(&dir_ni->lock);
726			ntfs_debug("Got \"..\" directory 0x%llx of directory "
727					"0x%llx.",
728					(unsigned long long)NTFS_I(vn)->mft_no,
729					(unsigned long long)dir_ni->mft_no);
730			*a->a_vpp = vn;
731			return 0;
732		}
733		/*
734		 * Look up a filename attribute in the mft record of the
735		 * directory @dir_ni and use its parent mft reference to run an
736		 * ntfs_inode_get() on it to obtain an inode for "..".
737		 */
738		err = ntfs_inode_get_name_and_parent_mref(dir_ni, FALSE, &mref,
739				NULL);
740		lck_rw_unlock_shared(&dir_ni->lock);
741		if (err) {
742			ntfs_error(vol->mp, "Failed to obtain parent mft "
743					"reference for directory 0x%llx "
744					"(error %d).",
745					(unsigned long long)dir_ni->mft_no,
746					err);
747			return err;
748		}
749		mft_no = MREF(mref);
750		err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED,
751				&ni, NULL, NULL);
752		if (err) {
753			ntfs_error(vol->mp, "Failed to obtain parent inode "
754					"0x%llx for directory 0x%llx (error "
755					"%d).", (unsigned long long)mft_no,
756					(unsigned long long)dir_ni->mft_no,
757					err);
758			return err;
759		}
760		/* Consistency check. */
761		if (MSEQNO(mref) != ni->seq_no) {
762			lck_rw_unlock_shared(&ni->lock);
763			(void)vnode_put(ni->vn);
764			ntfs_error(vol->mp, "Found stale parent mft reference "
765					"in filename of directory 0x%llx.  "
766					"Volume is corrupt.  Run chkdsk.",
767					(unsigned long long)dir_ni->mft_no);
768			return EIO;
769		}
770		if (!S_ISDIR(ni->mode)) {
771			lck_rw_unlock_shared(&ni->lock);
772			(void)vnode_put(ni->vn);
773			ntfs_error(vol->mp, "Found non-directory parent for "
774					"filename of directory 0x%llx.  "
775					"Volume is corrupt.  Run chkdsk.",
776					(unsigned long long)dir_ni->mft_no);
777			return EIO;
778		}
779		ntfs_debug("Got \"..\" directory 0x%llx of directory 0x%llx.",
780				(unsigned long long)mft_no,
781				(unsigned long long)dir_ni->mft_no);
782		*a->a_vpp = ni->vn;
783		lck_rw_unlock_shared(&ni->lock);
784		return 0;
785	}
786	/* Convert the name from utf8 to Unicode. */
787	ntfs_name = ntfs_name_buf;
788	ntfs_name_size = sizeof(ntfs_name_buf);
789	ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
790			&ntfs_name, &ntfs_name_size);
791	if (ntfs_name_len < 0) {
792		lck_rw_unlock_shared(&dir_ni->lock);
793		err = -ntfs_name_len;
794		if (err == ENAMETOOLONG)
795			ntfs_debug("Failed (name is too long).");
796		else
797			ntfs_error(vol->mp, "Failed to convert name to "
798					"Unicode (error %d).", err);
799		return err;
800	}
801	/* Look up the converted name in the directory index. */
802	err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len,
803			&mref, &name);
804	if (err) {
805		lck_rw_unlock_shared(&dir_ni->lock);
806		if (err != ENOENT) {
807			ntfs_error(vol->mp, "Failed to find name in directory "
808					"(error %d).", err);
809			return err;
810		}
811not_found:
812		/*
813		 * The name does not exist in the directory @dir_ni.
814		 *
815		 * If creating (or renaming and the name is the destination
816		 * name) and we are at the end of a pathname we can consider
817		 * allowing the file to be created so return EJUSTRETURN
818		 * instead of ENOENT.
819		 */
820		if (cn->cn_flags & ISLASTCN && (op == CREATE || op == RENAME)) {
821			ntfs_debug("Done (not found but for CREATE or RENAME, "
822					"returning EJUSTRETURN).");
823			return EJUSTRETURN;
824		}
825		/*
826		 * Insert a negative entry into the name cache if caching of
827		 * this name is desired unless this is a create operation in
828		 * which case we do not want to do that.
829		 */
830		if (cn->cn_flags & MAKEENTRY && op != CREATE)
831			cache_enter(dir_ni->vn, NULL, cn);
832		 /*
833		  * Prevent the caller from trying to add the name to the cache
834		  * as well.
835		  */
836		cn->cn_flags &= ~MAKEENTRY;
837		ntfs_debug("Done (not found%s).", cn->cn_flags & MAKEENTRY ?
838				"adding negative name cache entry" : "");
839		return err;
840	}
841	/* The lookup succeeded. */
842	mft_no = MREF(mref);
843	ntfs_debug("Name matches inode number 0x%llx.",
844			(unsigned long long)mft_no);
845	/*
846	 * Remove all NTFS core system files from the name space so we do not
847	 * need to worry about users damaging a volume by writing to them or
848	 * deleting/renaming them and so that we can return fsRtParID (1) as
849	 * the inode number of the parent of the volume root directory and
850	 * fsRtDirID (2) as the inode number of the volume root directory which
851	 * are both expected by Carbon and various applications.
852	 */
853	if (mft_no < FILE_first_user) {
854		lck_rw_unlock_shared(&dir_ni->lock);
855		if (name)
856			OSFree(name, sizeof(*name), ntfs_malloc_tag);
857		ntfs_debug("Removing core NTFS system file (mft_no 0x%x) "
858				"from name space.", (unsigned)mft_no);
859		err = ENOENT;
860		goto not_found;
861	}
862	/*
863	 * If the name is at the end of a pathname and is about to be deleted
864	 * either directly or as a consequence of a rename with the name as the
865	 * target, do not cache it.
866	 */
867	if (cn->cn_flags & ISLASTCN && (op == DELETE || op == RENAME))
868		cn->cn_flags &= ~MAKEENTRY;
869	/*
870	 * If a name was returned from the lookup and it is in the POSIX or
871	 * WIN32 namespaces we need to convert it into a componentname so we
872	 * can use it instead of the existing componentname @cn when getting
873	 * the inode.
874	 *
875	 * If the returned name is in the DOS namespace we have to get the
876	 * inode without a name as we need the inode in order to be able to
877	 * find the WIN32 name corresponding to the DOS name.  Once we have the
878	 * name we will update the vnode identity with it.
879	 *
880	 * If no name was returned, the match was perfect and we just use the
881	 * componentname that was passed in by the caller.
882	 */
883	if (name) {
884		if (name->type == FILENAME_DOS) {
885			name_cn = NULL;
886			/*
887			 * We do not need @name any more but do not set it to
888			 * NULL because we use that fact to distinguish between
889			 * the DOS and WIN32/POSIX cases.
890			 */
891			OSFree(name, sizeof(*name), ntfs_malloc_tag);
892		} else {
893			signed res_size;
894
895			res_size = ntfs_to_utf8(vol, name->name, name->len <<
896					NTFSCHAR_SIZE_SHIFT, &utf8_name,
897					&utf8_size);
898			OSFree(name, sizeof(*name), ntfs_malloc_tag);
899			if (res_size < 0) {
900				lck_rw_unlock_shared(&dir_ni->lock);
901				/* Failed to convert name. */
902				err = -res_size;
903				ntfs_error(vol->mp, "Failed to convert inode "
904						"name to decomposed UTF-8 "
905						"(error %d).", err);
906				return err;
907			}
908			name = NULL;
909			cn_buf = (struct componentname) {
910				.cn_flags = cn->cn_flags,
911				.cn_nameptr = (char*)utf8_name,
912				.cn_namelen = res_size,
913			};
914			name_cn = &cn_buf;
915		}
916	}
917	/*
918	 * @name_cn now contains the correct name of the inode or is NULL.
919	 *
920	 * If @name_cn is not NULL and its cn_flags indicate that the name is
921	 * to be entered into the name cache, ntfs_inode_get() will do this and
922	 * clear the MAKEENTRY bit in the cn_flags.
923	 *
924	 * Note we only drop the directory lock after obtaining the inode
925	 * otherwise someone could delete it under our feet.
926	 */
927	err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED, &ni,
928			dir_ni->vn, name_cn);
929	lck_rw_unlock_shared(&dir_ni->lock);
930	if (name_cn == &cn_buf) {
931		/* Pick up any modifications to the cn_flags. */
932		cn->cn_flags = cn_buf.cn_flags;
933		OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
934	}
935	if (!err) {
936		/* Consistency check. */
937		// FIXME: I cannot remember why we need the "mft_no !=
938		// FILE_MFT" test...
939		if (MSEQNO(mref) != ni->seq_no && mft_no != FILE_MFT) {
940			lck_rw_unlock_shared(&ni->lock);
941			(void)vnode_put(ni->vn);
942			ntfs_debug("Inode was deleted and reused under our "
943					"feet.");
944			err = ENOENT;
945			goto not_found;
946		}
947		/*
948		 * We found it.  Before we can return it, we have to check if
949		 * returning this inode is a valid response to the requested
950		 * lookup.  To be more specific, if the lookup was for an
951		 * intermediate path component and the inode is not a directory
952		 * or symbolic link, it is not a valid response because it
953		 * cannot be part of an intermediate path component.  In that
954		 * case return an error.
955		 */
956		if (cn->cn_flags & ISLASTCN || S_ISDIR(ni->mode) ||
957				S_ISLNK(ni->mode)) {
958			/*
959			 * Perfect WIN32/POSIX match or wrong case WIN32/POSIX
960			 * match, i.e. cases 1 and 2, respectively.
961			 */
962			if (!name) {
963				*a->a_vpp = ni->vn;
964				ntfs_debug("Done (case %d).",
965						name_cn == &cn_buf ? 2 : 1);
966				lck_rw_unlock_shared(&ni->lock);
967				return 0;
968			}
969			/*
970			 * We are too indented.  Handle DOS matches further
971			 * below.
972			 */
973			goto handle_dos_name;
974		}
975		lck_rw_unlock_shared(&ni->lock);
976		(void)vnode_put(ni->vn);
977		ntfs_debug("Done (intermediate path component requested but "
978				"found inode is not a directory or symbolic "
979				"link, returning ENOTDIR).");
980		err = ENOTDIR;
981	} else {
982		if (err == ENOENT) {
983			ntfs_debug("Inode was deleted under our feet.");
984			goto not_found;
985		}
986		ntfs_error(vol->mp, "Failed to get inode 0x%llx (error %d).",
987				(unsigned long long)mft_no, err);
988	}
989	return err;
990	// TODO: Consider moving this lot to a separate function.
991handle_dos_name:
992   {
993	MFT_RECORD *m;
994	ntfs_attr_search_ctx *ctx;
995	FILENAME_ATTR *fn;
996	const char *old_name;
997	signed res_size;
998
999	vn = ni->vn;
1000	/*
1001	 * DOS match. -- Case 3.
1002	 *
1003	 * Find the WIN32 name corresponding to the matched DOS name.
1004	 *
1005	 * At present @ni is guaranteed to be a base inode.
1006	 */
1007	err = ntfs_mft_record_map(ni, &m);
1008	if (err) {
1009		ntfs_error(vol->mp, "Failed to map mft record (error %d).",
1010				err);
1011		goto err;
1012	}
1013	ctx = ntfs_attr_search_ctx_get(ni, m);
1014	if (!ctx) {
1015		ntfs_error(vol->mp, "Failed to allocate search context.");
1016		err = ENOMEM;
1017		goto unm_err;
1018	}
1019	do {
1020		ATTR_RECORD *attr;
1021		u32 val_len;
1022		u16 val_ofs;
1023
1024		err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0,
1025				ctx);
1026		if (err) {
1027			if (err == ENOENT) {
1028				ntfs_error(vol->mp, "WIN32 namespace name is "
1029						"missing from inode.  Run "
1030						"chkdsk.");
1031				err = EIO;
1032			} else
1033				ntfs_error(vol->mp, "Failed to find WIN32 "
1034						"namespace name in inode "
1035						"(error %d).", err);
1036			goto put_err;
1037		}
1038		/* Consistency checks. */
1039		attr = ctx->a;
1040		if (attr->non_resident || attr->flags)
1041			goto attr_err;
1042		val_len = le32_to_cpu(attr->value_length);
1043		val_ofs = le16_to_cpu(attr->value_offset);
1044		if (val_ofs + val_len > le32_to_cpu(attr->length))
1045			goto attr_err;
1046		fn = (FILENAME_ATTR*)((u8*)attr + val_ofs);
1047		if ((u32)(sizeof(FILENAME_ATTR) + (fn->filename_length <<
1048				NTFSCHAR_SIZE_SHIFT)) > val_len)
1049			goto attr_err;
1050	} while (fn->filename_type != FILENAME_WIN32);
1051	/* Convert the name to decomposed UTF-8. */
1052	res_size = ntfs_to_utf8(vol, fn->filename, fn->filename_length <<
1053			NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size);
1054	ntfs_attr_search_ctx_put(ctx);
1055	ntfs_mft_record_unmap(ni);
1056	if (res_size < 0) {
1057		/* Failed to convert name. */
1058		err = -res_size;
1059		ntfs_error(vol->mp, "Failed to convert inode name to "
1060				"decomposed UTF-8 (error %d).", err);
1061		goto err;
1062	}
1063	/* Update the vnode with the new name if it differs from the old one. */
1064	old_name = vnode_getname(vn);
1065	if (!old_name || (ni->link_count > 1 && ((long)strlen(old_name) !=
1066			res_size || bcmp(old_name, utf8_name, res_size)))) {
1067		vnode_update_identity(vn, NULL, (char*)utf8_name, res_size, 0,
1068				VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE);
1069	}
1070	if (old_name)
1071		vnode_putname(old_name);
1072	/*
1073	 * Enter the name into the cache (if it is already there this is a
1074	 * no-op) and prevent the caller from trying to add the name to the
1075	 * cache as well.
1076	 */
1077	cn_buf = (struct componentname) {
1078		.cn_flags = cn->cn_flags,
1079		.cn_nameptr = (char*)utf8_name,
1080		.cn_namelen = res_size,
1081	};
1082	cache_enter(dir_ni->vn, vn, &cn_buf);
1083	cn->cn_flags &= ~MAKEENTRY;
1084	OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
1085	*a->a_vpp = ni->vn;
1086	lck_rw_unlock_shared(&ni->lock);
1087	ntfs_debug("Done (case 3).");
1088	return 0;
1089attr_err:
1090	ntfs_error(vol->mp, "Filename attribute is corrupt.  Run chkdsk.");
1091	err = EIO;
1092put_err:
1093	ntfs_attr_search_ctx_put(ctx);
1094unm_err:
1095	ntfs_mft_record_unmap(ni);
1096err:
1097	lck_rw_unlock_shared(&ni->lock);
1098	(void)vnode_put(vn);
1099	return err;
1100   }
1101}
1102
1103// TODO: Rename to ntfs_inode_create and move to ntfs_inode.[hc]?
1104/**
1105 * ntfs_create - create an inode on an ntfs volume
1106 * @dir_vn:	vnode of directory in which to create the new inode
1107 * @vn:		destination pointer for the vnode of the created inode
1108 * @cn:		componentname specifying name of the inode to create
1109 * @va:		vnode attributes to assign to the new inode
1110 * @lock:	if true the ntfs inode of the returned vnode *@vn is locked
1111 *
1112 * Create an inode with name as specified in @cn in the directory specified by
1113 * the vnode @dir_vn.  Assign the attributes @va to the created inode.  Finally
1114 * return the vnode of the created inode in *@vn.
1115 *
1116 * @va is used to determine which type of inode is to be created, i.e. if
1117 * @va->va_type if VDIR create a directory, etc.
1118 *
1119 * If @lock is true the ntfs inode of the returned vnode is locked for writing
1120 * (NTFS_I(@vn)->lock).
1121 *
1122 * Called by the various inode creation ntfs functions (ntfs_vnop_create(),
1123 * ntfs_vnop_mkdir(), ntfs_vnop_symlink(), ntfs_vnop_mknod(), etc) which are
1124 * called by the VFS.
1125 *
1126 * Return 0 on success and errno on error.
1127 *
1128 * Note we always create inode names in the POSIX namespace.
1129 */
1130static errno_t ntfs_create(vnode_t dir_vn, vnode_t *vn,
1131		struct componentname *cn, struct vnode_attr *va,
1132		const BOOL lock)
1133{
1134	ntfs_inode *ni, *dir_ni = NTFS_I(dir_vn);
1135	ntfs_volume *vol;
1136	FILENAME_ATTR *fn;
1137	ntfschar *ntfs_name;
1138	MFT_RECORD *m;
1139	ATTR_RECORD *a;
1140	size_t ntfs_name_size;
1141	signed ntfs_name_len;
1142	unsigned fn_alloc, fn_size;
1143	errno_t err, err2;
1144
1145	if (!dir_ni) {
1146		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
1147		return EINVAL;
1148	}
1149	vol = dir_ni->vol;
1150	if (!S_ISDIR(dir_ni->mode)) {
1151		ntfs_debug("Parent inode is not a directory, returning "
1152				"ENOTDIR.");
1153		return ENOTDIR;
1154	}
1155	if (dir_ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
1156		ntfs_error(vol->mp, "Parent inode is a reparse point and not "
1157				"a regular directory, returning ENOTSUP.");
1158		return ENOTDIR;
1159	}
1160	/*
1161	 * Create a temporary copy of the filename attribute so we can release
1162	 * the mft record before we add the directory entry.  This is needed
1163	 * because when we hold the mft record for the newly created inode and
1164	 * we call ntfs_dir_entry_add() this would cause the mft record for the
1165	 * directory to be mapped which would result in a deadlock in the event
1166	 * that both mft records are in the same page.
1167	 */
1168	fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar);
1169	fn = OSMalloc(fn_alloc, ntfs_malloc_tag);
1170	if (!fn) {
1171		ntfs_error(vol->mp, "Failed to allocate memory for temporary "
1172				"filename attribute.");
1173		return ENOMEM;
1174	}
1175	bzero(fn, fn_alloc);
1176	/* Begin setting up the temporary filename attribute. */
1177	fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no);
1178	/* FILENAME_POSIX is zero and the attribute is already zeroed. */
1179	/* fn->filename_type = FILENAME_POSIX; */
1180	/* Convert the name from utf8 to Unicode. */
1181	ntfs_name = fn->filename;
1182	ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar);
1183	ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
1184			&ntfs_name, &ntfs_name_size);
1185	if (ntfs_name_len < 0) {
1186		err = -ntfs_name_len;
1187		if (err == ENAMETOOLONG)
1188			ntfs_debug("Failed (name is too long).");
1189		else
1190			ntfs_error(vol->mp, "Failed to convert name to "
1191					"Unicode (error %d).", err);
1192		goto err;
1193	}
1194	/* Set the filename length in the temporary filename attribute. */
1195	fn->filename_length = ntfs_name_len;
1196	fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar);
1197	/* If no vnode type is specified default to VREG, i.e. regular file. */
1198	if (va->va_type == VNON)
1199		va->va_type = VREG;
1200	/*
1201	 * We support regular files, directories, symbolic links, sockets,
1202	 * fifos, and block and character device special filesr.
1203	 */
1204	switch (va->va_type) {
1205	case VBLK:
1206	case VCHR:
1207		if (!VATTR_IS_ACTIVE(va, va_rdev)) {
1208			ntfs_error(vol->mp, "va_type is %s but va_rdev is not "
1209					"specified!", va->va_type == VBLK ?
1210					"VBLK" : "VCHR");
1211			err = EINVAL;
1212			goto err;
1213		}
1214	case VREG:
1215	case VDIR:
1216	case VLNK:
1217	case VSOCK:
1218	case VFIFO:
1219		break;
1220	default:
1221		ntfs_error(vol->mp, "Tried to create inode of type 0x%x which "
1222				"is not supported at present.", va->va_type);
1223		err = ENOTSUP;
1224		goto err;
1225	}
1226	va->va_mode |= VTTOIF(va->va_type);
1227	/* If no create time is supplied default it to the current time. */
1228	if (!VATTR_IS_ACTIVE(va, va_create_time))
1229		nanotime(&va->va_create_time);
1230	/*
1231	 * Round the time down to the nearest 100-nano-second interval as
1232	 * needed for NTFS.
1233	 */
1234	va->va_create_time.tv_nsec -= va->va_create_time.tv_nsec % 100;
1235	/* Set the times in the temporary filename attribute. */
1236	fn->last_access_time = fn->last_mft_change_time =
1237			fn->last_data_change_time = fn->creation_time =
1238			utc2ntfs(va->va_create_time);
1239	/* Set the bits for all the supported fields at once. */
1240	va->va_supported |=
1241			VNODE_ATTR_BIT(va_mode) |
1242			VNODE_ATTR_BIT(va_flags) |
1243			VNODE_ATTR_BIT(va_create_time) |
1244			VNODE_ATTR_BIT(va_type);
1245again:
1246	/* Lock the target directory and check that it has not been deleted. */
1247	lck_rw_lock_exclusive(&dir_ni->lock);
1248	if (!dir_ni->link_count) {
1249		/* Remove the target directory from the name cache. */
1250		cache_purge(dir_vn);
1251		err = ENOENT;
1252		goto unl_err;
1253	}
1254	/* Allocate and map a new mft record. */
1255	err = ntfs_mft_record_alloc(vol, va, cn, dir_ni, &ni, &m, &a);
1256	if (err) {
1257		if (err != ENOSPC)
1258			ntfs_error(vol->mp, "Failed to allocate a new on-disk "
1259					"inode (error %d).", err);
1260		goto unl_err;
1261	}
1262	/*
1263	 * If requested by the caller, take the ntfs inode lock on the
1264	 * allocated ntfs inode for writing so no-one can start using it before
1265	 * it is ready.  For example if it is a symbolic link we cannot allow
1266	 * anyone to look at it until we have set the data size to the symbolic
1267	 * link target size otherwise a concurrent ntfs_vnop_readlink() would
1268	 * return EINVAL as it would see a target size of zero.
1269	 *
1270	 * Also, if the inode is a symbolic link we need to take the lock so
1271	 * that we can create the AFP_AfpInfo attribute when we have finished
1272	 * setting up the inode.
1273	 */
1274	if (lock || S_ISLNK(ni->mode))
1275		lck_rw_lock_exclusive(&ni->lock);
1276	/*
1277	 * @a now points to the location in the allocated mft record at which
1278	 * we need to insert the filename attribute so we can insert it without
1279	 * having to do a lookup first.
1280	 *
1281	 * Insert the filename attribute and initialize the value to zero.
1282	 * This cannot fail as we are dealing with a newly allocated mft record
1283	 * so there must be enough space for a filename attribute even if the
1284	 * filename is of the maximum allowed length.
1285	 */
1286	err = ntfs_resident_attr_record_insert_internal(m, a, AT_FILENAME,
1287			NULL, 0, fn_size);
1288	if (err)
1289		panic("%s(): err\n", __FUNCTION__);
1290	/* Finish setting up the filename attribute value. */
1291	fn->file_attributes = ni->file_attributes;
1292	/*
1293	 * Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set
1294	 * in their filename attributes both in their mft records and in the
1295	 * index entries pointing to them but not in the standard information
1296	 * attribute which is why it is not set in @ni->file_attributes.
1297	 */
1298	if (va->va_type == VDIR)
1299		fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT;
1300	/*
1301	 * Update the data_size in the temporary filename attribute from the
1302	 * created ntfs inode.  This will not be zero for fifos and block and
1303	 * character device special files for example.
1304	 */
1305	fn->data_size = ni->data_size;
1306	/*
1307	 * Copy the created filename attribute into place in the attribute
1308	 * record.
1309	 */
1310	memcpy((u8*)a + le16_to_cpu(a->value_offset), fn, fn_size);
1311	/*
1312	 * Set the link count to one to indicate there is one filename
1313	 * attribute inside the mft record.
1314	 */
1315	m->link_count = const_cpu_to_le16(1);
1316	ni->link_count = 1;
1317	/*
1318	 * Ensure the mft record is written to disk.
1319	 *
1320	 * Note we do not set any of the NInoDirty*() flags because we have
1321	 * just created the inode thus all the fields are in sync between the
1322	 * ntfs_inode @ni and its mft record @m.
1323	 */
1324	NInoSetMrecNeedsDirtying(ni);
1325	/*
1326	 * Release the mft record.  It is safe to do so even though the
1327	 * directory entry has not been added yet because the inode is still
1328	 * locked and marked new thus it is not a candidate for syncing yet.
1329	 */
1330	ntfs_mft_record_unmap(ni);
1331	/*
1332	 * If the inode is a symbolic link now create the AFP_AfpInfo attribute
1333	 * with the Finder Info specifying that this is a symbolic link.
1334	 */
1335	if (S_ISLNK(ni->mode)) {
1336		err = ntfs_inode_afpinfo_write(ni);
1337		/*
1338		 * If the caller has not requested that the inode be returned
1339		 * locked unlock it now.
1340		 */
1341		if (!lock)
1342			lck_rw_unlock_exclusive(&ni->lock);
1343		if (err) {
1344			ntfs_error(vol->mp, "Failed to create AFP_AfpInfo "
1345					"attribute in allocated inode 0x%llx "
1346					"(error %d).",
1347					(unsigned long long)ni->mft_no, err);
1348			goto rm_err;
1349		}
1350	}
1351	/* Add the created filename attribute to the parent directory index. */
1352	err = ntfs_dir_entry_add(dir_ni, fn, fn_size,
1353			MK_LE_MREF(ni->mft_no, ni->seq_no));
1354	if (!err) {
1355		/* Free the temporary filename attribute. */
1356		OSFree(fn, fn_alloc, ntfs_malloc_tag);
1357		/*
1358		 * Invalidate negative cache entries in the directory.  We need
1359		 * to do this because there may be negative cache entries
1360		 * which would match the name of the just created inode but in
1361		 * a different case.  Such negative cache entries would now be
1362		 * incorrect thus we need to throw away all negative cache
1363		 * entries to ensure there cannot be any incorrectly negative
1364		 * entries in the name cache.
1365		 */
1366		cache_purge_negatives(dir_vn);
1367		/*
1368		 * Add the inode to the name cache.  Note that
1369		 * ntfs_vnop_lookup() will have caused the name to not be
1370		 * cached because it will have cleared the MAKEENTRY flag.
1371		 */
1372		cache_enter(dir_ni->vn, ni->vn, cn);
1373		/* We are done with the directory so unlock it. */
1374		lck_rw_unlock_exclusive(&dir_ni->lock);
1375		/*
1376		 * We can finally unlock and unmark as new the new ntfs inode
1377		 * thus rendering the inode a full member of society.
1378		 */
1379		ntfs_inode_unlock_alloc(ni);
1380		ntfs_debug("Done (new mft_no 0x%llx).",
1381				(unsigned long long)ni->mft_no);
1382		*vn = ni->vn;
1383		return 0;
1384	}
1385	/*
1386	 * We failed to add the directory entry thus we have to effectively
1387	 * delete the created inode again.  To do this we need to map the mft
1388	 * record and mark it as no longer in use.
1389	 *
1390	 * We then also need to set the link count in the ntfs inode to zero to
1391	 * reflect that it is deleted and to ensure that the subsequent
1392	 * vnode_put() results in ntfs_delete_inode() being called (via
1393	 * VNOP_INACTIVE() and ntfs_vnop_inactive() respectively).
1394	 *
1395	 * But first, unlock the allocated ntfs inode if we locked it above.
1396	 * No-one can get to it now as it does not have a directory entry
1397	 * pointing to it.
1398	 */
1399rm_err:
1400	if (lock)
1401		lck_rw_unlock_exclusive(&ni->lock);
1402	err2 = ntfs_mft_record_map(ni, &m);
1403	if (err2) {
1404		ntfs_error(vol->mp, "Failed to map mft record in error code "
1405				"path (error %d).  Run chkdsk to recover the "
1406				"lost mft record.", err2);
1407		NVolSetErrors(vol);
1408	} else {
1409		m->flags &= ~MFT_RECORD_IN_USE;
1410		NInoSetMrecNeedsDirtying(ni);
1411		ntfs_mft_record_unmap(ni);
1412	}
1413	ni->link_count = 0;
1414	lck_rw_unlock_exclusive(&dir_ni->lock);
1415	ntfs_inode_unlock_alloc(ni);
1416	cache_purge(ni->vn);
1417	(void)vnode_put(ni->vn);
1418	if (err == EEXIST) {
1419		/*
1420		 * There are two possible reasons why the directory entry
1421		 * already exists.  Either someone created it under our feet in
1422		 * which case we try to look up the existing vnode and retrn
1423		 * that instead and failing that we try to create the inode
1424		 * again or the name really does exist but we have removed it
1425		 * from the name space thus ntfs_vnop_lookup() will always
1426		 * return ENOENT/EJUSTRETURN for it.  This is the case for the
1427		 * core system files for example.  This would cause an infinite
1428		 * loop thus we need to check for this case by checking that
1429		 * the name being created does not match one of the core system
1430		 * filenames and if it does we return EEXIST.
1431		 */
1432		if (dir_ni == vol->root_ni) {
1433			/* Catch the "." entry. */
1434			if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.')
1435				goto is_system;
1436			/*
1437			 * Catch the core system files which all start with the
1438			 * '$' character.
1439			 */
1440			if (cn->cn_nameptr[0] == '$') {
1441				char *n = (char*)cn->cn_nameptr + 1;
1442				int l = cn->cn_namelen;
1443
1444				if ((l == 4 && !strncmp(n, "MFT", 3)) ||
1445						(l == 5 && !strncmp(n, "Boot",
1446						4)) ||
1447						(l == 6 && !strncmp(n, "Quota",
1448						5)) ||
1449						(l == 7 && (
1450						!strncmp(n, "Volume", 6) ||
1451						!strncmp(n, "Bitmap", 6) ||
1452						!strncmp(n, "Secure", 6) ||
1453						!strncmp(n, "UpCase", 6) ||
1454						!strncmp(n, "Extend", 6))) ||
1455						(l == 8 && (
1456						!strncmp(n, "MFTMirr", 7) ||
1457						!strncmp(n, "LogFile", 7) ||
1458						!strncmp(n, "AttrDef", 7) ||
1459						!strncmp(n, "BadClus", 7))))
1460					goto is_system;
1461			}
1462		}
1463		ntfs_debug("Inode was created under our feet.");
1464		/*
1465		 * If the inode was created under our feet, we are creating a
1466		 * regular file, and the caller did not want an exclusive
1467		 * create, simply look up the inode and return that.
1468		 */
1469		if (va->va_type == VREG && !(va->va_vaflags & VA_EXCLUSIVE)) {
1470			struct vnop_lookup_args la;
1471
1472			cn->cn_nameiop = LOOKUP;
1473			la = (struct vnop_lookup_args) {
1474				.a_desc = &vnop_lookup_desc,
1475				.a_dvp = dir_vn,
1476				.a_vpp = vn,
1477				.a_cnp = cn,
1478			};
1479			err = ntfs_vnop_lookup(&la);
1480			cn->cn_nameiop = CREATE;
1481			/*
1482			 * If the inode that was created under our feet was
1483			 * also deleted under our feet, repeat the whole
1484			 * process.
1485			 */
1486			if (err == ENOENT || err == EJUSTRETURN) {
1487				*vn = NULL;
1488				goto again;
1489			}
1490			/*
1491			 * Make sure the vnode we looked up is a regular file
1492			 * as we would not want to return a directory instead
1493			 * of a file for example.
1494			 */
1495			if (!err && vnode_vtype(*vn) != VREG) {
1496				(void)vnode_put(*vn);
1497				*vn = NULL;
1498				err = EEXIST;
1499			}
1500		}
1501	} else
1502		ntfs_error(vol->mp, "Failed to add directory entry (error "
1503				"%d).", err);
1504err:
1505	OSFree(fn, fn_alloc, ntfs_malloc_tag);
1506	return err;
1507unl_err:
1508	lck_rw_unlock_exclusive(&dir_ni->lock);
1509	goto err;
1510is_system:
1511	ntfs_error(vol->mp, "Cannot create inode with name %.*s in the volume "
1512			"root directory as the name clashes with the name of "
1513			"a core system file.  Returning EEXIST.",
1514			(int)cn->cn_namelen, cn->cn_nameptr);
1515	err = EEXIST;
1516	*vn = NULL;
1517	goto err;
1518}
1519
1520/**
1521 * ntfs_vnop_create - create a regular file
1522 * @a:		arguments to create function
1523 *
1524 * @a contains:
1525 *	vnode_t a_dvp;			directory in which to create the file
1526 *	vnode_t *a_vpp;			destination pointer for the created file
1527 *	struct componentname *a_cnp;	name of the file to create
1528 *	struct vnode_attr *a_vap;	attributes to set on the created file
1529 *	vfs_context_t a_context;
1530 *
1531 * Create a regular file with name as specified in @a->a_cnp in the directory
1532 * specified by the vnode @a->a_dvp.  Assign the attributes @a->a_vap to the
1533 * created file.  Finally return the vnode of the created file in *@a->a_vpp.
1534 *
1535 * Return 0 on success and errno on error.
1536 *
1537 * Note we always create filenames in the POSIX namespace.
1538 */
1539static int ntfs_vnop_create(struct vnop_create_args *a)
1540{
1541	errno_t err;
1542#ifdef DEBUG
1543	ntfs_inode *ni = NTFS_I(a->a_dvp);
1544
1545	if (ni)
1546		ntfs_debug("Creating a file named %.*s in directory mft_no "
1547				"0x%llx.", (int)a->a_cnp->cn_namelen,
1548				a->a_cnp->cn_nameptr,
1549				(unsigned long long)ni->mft_no);
1550#endif
1551	err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
1552	ntfs_debug("Done (error %d).", (int)err);
1553	return err;
1554}
1555
1556/**
1557 * ntfs_vnop_mknod - create a special file node
1558 * @a:		arguments to mknod function
1559 *
1560 * @a contains:
1561 *	vnode_t a_dvp;			directory in which to create the file
1562 *	vnode_t *a_vpp;			destination pointer for the created file
1563 *	struct componentname *a_cnp;	name of the file to create
1564 *	struct vnode_attr *a_vap;	attributes to set on the created file
1565 *	vfs_context_t a_context;
1566 *
1567 * Create a special file node with name as specified in @a->a_cnp in the
1568 * directory specified by the vnode @a->a_dvp.  Assign the attributes @a->a_vap
1569 * to the created node.  Finally return the vnode of the created file in
1570 * *@a->a_vpp.
1571 *
1572 * The type of special file node to create is specified by the caller in
1573 * @a->a_vap->va_type and can be one of:
1574 *	VSOCK - create a socket
1575 *	VFIFO - create a fifo
1576 *	VBLK  - create a block special device
1577 *	VCHR  - create a character special device
1578 *
1579 * Return 0 on success and errno on error.
1580 *
1581 * Note we always create filenames in the POSIX namespace.
1582 */
1583static int ntfs_vnop_mknod(struct vnop_mknod_args *a)
1584{
1585	errno_t err;
1586#ifdef DEBUG
1587	ntfs_inode *ni = NTFS_I(a->a_dvp);
1588
1589	if (ni)
1590		ntfs_debug("Creating a special inode of type 0x%x named %.*s "
1591				"in directory mft_no 0x%llx.",
1592				a->a_vap->va_type, (int)a->a_cnp->cn_namelen,
1593				a->a_cnp->cn_nameptr,
1594				(unsigned long long)ni->mft_no);
1595#endif
1596	err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
1597	ntfs_debug("Done (error %d).", (int)err);
1598	return err;
1599}
1600
1601/**
1602 * ntfs_vnop_open - open a vnode
1603 * @a:		arguments to open function
1604 *
1605 * @a contains:
1606 *	vnode_t a_vp;		vnode to open
1607 *	int a_mode;		mode to open the file with
1608 *	vfs_context_t a_context;
1609 *
1610 * Open the vnode @a->a_vp with mode @a->a_mode.
1611 *
1612 * Note the VFS does a lot of checking before ntfs_vnop_open() is called
1613 * including permissions and checking for a read-only file system thus we do
1614 * not need to worry about the case where the driver is compiled read-only as
1615 * the volume is then mounted read-only so the vfs catches all write accesses
1616 * very early on and denies them.
1617 *
1618 * Return 0 on success and errno on error.
1619 */
1620static int ntfs_vnop_open(struct vnop_open_args *a)
1621{
1622	ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
1623	errno_t err = 0;
1624
1625	if (!ni) {
1626		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
1627		return EINVAL;
1628	}
1629	ntfs_debug("Entering for mft_no 0x%llx, mode 0x%x.",
1630			(unsigned long long)ni->mft_no, (unsigned)a->a_mode);
1631	base_ni = ni;
1632	if (NInoAttr(ni))
1633		base_ni = ni->base_ni;
1634	/*
1635	 * All the core system files cannot possibly be opened because they are
1636	 * removed from the name space thus it is impossible for a process to
1637	 * obtain a vnode to them thus VNOP_OPEN() can never be called for
1638	 * them.  The only exception is the root directory which we of course
1639	 * allow access to.
1640	 */
1641	if (ni->mft_no < FILE_first_user && ni != ni->vol->root_ni)
1642		panic("%s(): Called for a system inode.  This is not "
1643				"possible.\n", __FUNCTION__);
1644	lck_rw_lock_shared(&ni->lock);
1645	/* Do not allow messing with the inode once it has been deleted. */
1646	if (NInoDeleted(ni)) {
1647		lck_rw_unlock_shared(&ni->lock);
1648		/* Remove the inode from the name cache. */
1649		cache_purge(ni->vn);
1650		ntfs_debug("Cannot open deleted mft_no 0x%llx, returning "
1651				"ENOENT.", (unsigned long long)ni->mft_no);
1652		return ENOENT;
1653	}
1654	/*
1655	 * Do not allow opening encrpyted files as we do not support reading,
1656	 * writing, nor mmap()ing them.
1657	 */
1658	if (NInoEncrypted(ni)) {
1659		lck_rw_unlock_shared(&ni->lock);
1660		ntfs_debug("Cannot open encrypted mft_no 0x%llx, returning "
1661				"EACCES.", (unsigned long long)ni->mft_no);
1662		return EACCES;
1663	}
1664	lck_rw_unlock_shared(&ni->lock);
1665	/*
1666	 * We keep track of how many times the base vnode has been opened and
1667	 * we count other vnodes towards the base vnode open count to ensure
1668	 * we do the right thing in ntfs_unlink().
1669	 */
1670	OSIncrementAtomic(&base_ni->nr_opens);
1671	ntfs_debug("Done (error %d).", (int)err);
1672	return err;
1673}
1674
1675/**
1676 * ntfs_vnop_close - close a vnode
1677 * @a:		arguments to close function
1678 *
1679 * @a contains:
1680 *	vnode_t a_vp;		vnode to close
1681 *	int a_fflag;		close flags (FREAD and/or FWRITE for example)
1682 *	vfs_context_t a_context;
1683 *
1684 * Close the vnode @a->a_vp with flags @a->a_fflag.
1685 *
1686 * Return 0 on success and errno on error.
1687 */
1688static int ntfs_vnop_close(struct vnop_close_args *a)
1689{
1690	vnode_t vn = a->a_vp;
1691	ntfs_inode *base_ni, *ni = NTFS_I(vn);
1692
1693	if (!ni) {
1694		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
1695		return 0;
1696	}
1697	ntfs_debug("Entering for mft_no 0x%llx, fflag 0x%x.",
1698			(unsigned long long)ni->mft_no, a->a_fflag);
1699	base_ni = ni;
1700	if (NInoAttr(ni))
1701		base_ni = ni->base_ni;
1702	/*
1703	 * We keep track of how many times the base vnode has been opened and
1704	 * we count other vnodes towards the base vnode open count to ensure
1705	 * we do the right thing in ntfs_unlink().
1706	 */
1707	OSDecrementAtomic(&base_ni->nr_opens);
1708	/*
1709	 * If the vnode is still in use release any expired directory hints.
1710	 *
1711	 * If the vnode is no longer in use release all directory hints.
1712	 *
1713	 * Note we check for presence of directory hints outside the locks as
1714	 * an optimization.  It is not a disaster if we miss any as all will be
1715	 * released in ntfs_inode_free() before the inode is thrown away at the
1716	 * latest.
1717	 */
1718	if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION &&
1719			ni->nr_dirhints) {
1720		int busy;
1721
1722		busy = vnode_isinuse(vn, ni->nr_refs + 1);
1723		lck_rw_lock_exclusive(&ni->lock);
1724		ntfs_dirhints_put(ni, busy);
1725		lck_rw_unlock_exclusive(&ni->lock);
1726	}
1727	ntfs_debug("Done.");
1728	return 0;
1729}
1730
1731/**
1732 * ntfs_vnop_access -
1733 *
1734 */
1735static int ntfs_vnop_access(struct vnop_access_args *a)
1736{
1737	errno_t err;
1738
1739	ntfs_debug("Entering.");
1740	// TODO:
1741	err = ENOTSUP;
1742	ntfs_debug("Done (error %d).", (int)err);
1743	return err;
1744}
1745
1746/**
1747 * ntfs_vnop_getattr - get attributes about a vnode or about the mounted volume
1748 * @a:		arguments to getattr function
1749 *
1750 * @a contains:
1751 *	vnode_t a_vp;			vnode for which to return attributes
1752 *	struct vnode_attr *a_vap;	attributes to return and destination
1753 *	vfs_context_t a_context;
1754 *
1755 * Return the attributes described in @a_vap about the vnode @a_vp.  Some
1756 * attributes are intercepted by the VFS in getattrlist() and getvolattrlist()
1757 * so we do not bother with them.
1758 *
1759 * At present we do not support all attributes.  We declare what we support to
1760 * the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so
1761 * do not forget to update that when support for further attributes is added
1762 * here.
1763 *
1764 * Return 0 on success and errno on error.
1765 *
1766 * TODO: Implement more attributes.
1767 */
1768static int ntfs_vnop_getattr(struct vnop_getattr_args *a)
1769{
1770	MFT_REF parent_mref;
1771	ino64_t mft_no;
1772	s64 on_disk_size;
1773	struct vnode_attr *va = a->a_vap;
1774	ntfs_inode *ni, *base_ni;
1775	ntfs_volume *vol;
1776	const char *name;
1777	FILE_ATTR_FLAGS file_attributes;
1778	unsigned flags;
1779	errno_t err;
1780	lck_rw_type_t lock;
1781	BOOL is_root, name_is_done, have_parent;
1782
1783	ni = NTFS_I(a->a_vp);
1784	if (!ni) {
1785		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
1786		return 0;
1787	}
1788	vol = ni->vol;
1789	mft_no = ni->mft_no;
1790	have_parent = name_is_done = is_root = FALSE;
1791	ntfs_debug("Entering for mft_no 0x%llx.", (unsigned long long)mft_no);
1792	base_ni = ni;
1793	if (NInoAttr(ni)) {
1794		base_ni = ni->base_ni;
1795		lck_rw_lock_shared(&base_ni->lock);
1796	}
1797	lck_rw_lock_shared(&ni->lock);
1798	lock = LCK_RW_TYPE_SHARED;
1799	/* Do not allow messing with the inode once it has been deleted. */
1800	if (NInoDeleted(ni)) {
1801		/* Remove the inode from the name cache. */
1802		cache_purge(ni->vn);
1803		err = ENOENT;
1804		goto err;
1805	}
1806	/*
1807	 * If this is the root directory, leave it to the VFS to get the name
1808	 * from the mountpoint (see below).
1809	 */
1810	if (base_ni == vol->root_ni)
1811		name_is_done = is_root = TRUE;
1812	/* For directories always return a link count of 1. */
1813	va->va_nlink = 1;
1814	if (!S_ISDIR(ni->mode))
1815		va->va_nlink = ni->link_count;
1816	va->va_rdev = (dev_t)0;
1817	switch (ni->mode & S_IFMT) {
1818	case S_IFBLK:
1819	case S_IFCHR:
1820		/*
1821		 * For block and character device special inodes return the
1822		 * device.
1823		 */
1824		va->va_rdev = ni->rdev;
1825	case S_IFIFO:
1826	case S_IFSOCK:
1827		/*
1828		 * For fifos, sockets, block and character device special files
1829		 * return all sizes set to zero.
1830		 */
1831		va->va_total_alloc = va->va_data_alloc = va->va_total_size =
1832				va->va_data_size = 0;
1833		break;
1834	default:
1835		lck_spin_lock(&ni->size_lock);
1836		/*
1837		 * We cheat for both the total size and the total allocated
1838		 * size and just return the attribute size rather than looping
1839		 * over all ($DATA?) attributes and adding up their sizes.
1840		 */
1841		va->va_total_size = va->va_data_size = ni->data_size;
1842		/*
1843		 * Resident attributes reside inside the on-disk inode and thus
1844		 * have no on-disk allocation because the on-disk inode itself
1845		 * is already accounted for in the allocated size of the $MFT
1846		 * system file which contains the table of on-disk inodes.
1847		 * Perhaps more importantly, if we delete a resident file no
1848		 * space would be freed up on the volume, thus we definitely
1849		 * need to return zero for the allocated size of such resident
1850		 * files.
1851		 */
1852		on_disk_size = 0;
1853		if (NInoNonResident(ni)) {
1854			if (ni->type == AT_DATA && (NInoCompressed(ni) ||
1855					NInoSparse(ni)))
1856				on_disk_size = ni->compressed_size;
1857			else
1858				on_disk_size = ni->allocated_size;
1859		}
1860		va->va_total_alloc = va->va_data_alloc = on_disk_size;
1861		lck_spin_unlock(&ni->size_lock);
1862	}
1863	va->va_iosize = ubc_upl_maxbufsize();
1864	va->va_uid = ni->uid;
1865	va->va_gid = ni->gid;
1866	va->va_mode = ni->mode;
1867	file_attributes = base_ni->file_attributes;
1868	/*
1869	 * Do not allow the volume root directory to be read-only or hidden and
1870	 * do not allow directories in general to be read-only as Windows uses
1871	 * the read-only bit on directories for completely different purposes
1872	 * like customized/specialized folder views which are lost when you
1873	 * clear the read-only bit.
1874	 */
1875	if (S_ISDIR(base_ni->mode)) {
1876		file_attributes &= ~FILE_ATTR_READONLY;
1877		if (is_root)
1878			file_attributes &= ~FILE_ATTR_HIDDEN;
1879	}
1880	flags = 0;
1881/*
1882 *	if (NInoCompressed(ni))
1883 *		flags |= SF_COMPRESSED;
1884 */
1885	if (file_attributes & FILE_ATTR_READONLY)
1886		flags |= UF_IMMUTABLE;
1887	if (file_attributes & FILE_ATTR_HIDDEN)
1888		flags |= UF_HIDDEN;
1889	/*
1890	 * Windows does not set the "needs archiving" bit on directories
1891	 * except for encrypted directories where it does set the bit.
1892	 */
1893	if ((!S_ISDIR(base_ni->mode) ||
1894			file_attributes & FILE_ATTR_ENCRYPTED) &&
1895			!(file_attributes & FILE_ATTR_ARCHIVE))
1896		flags |= SF_ARCHIVED;
1897	va->va_flags = flags;
1898	va->va_create_time = base_ni->creation_time;
1899	va->va_access_time = base_ni->last_access_time;
1900	va->va_modify_time = base_ni->last_data_change_time;
1901	va->va_change_time = base_ni->last_mft_change_time;
1902	/*
1903	 * NTFS does not distinguish between the inode and its hard links.
1904	 *
1905	 * We have to remap the root directory inode to inode number 2, i.e.
1906	 * fsRtDirID, for compatibility with Carbon.
1907	 */
1908	if (!is_root)
1909		va->va_fileid = mft_no;
1910	else
1911		va->va_fileid = 2;
1912	va->va_fsid = vol->dev;
1913	/* FIXME: What is the difference between the below two? */
1914	va->va_filerev = base_ni->seq_no;
1915	va->va_gen = base_ni->seq_no;
1916	va->va_encoding = 0x7e; /* = kTextEncodingMacUnicode */
1917	va->va_supported |=
1918			VNODE_ATTR_BIT(va_rdev) |
1919			VNODE_ATTR_BIT(va_nlink) |
1920			VNODE_ATTR_BIT(va_total_size) |
1921			VNODE_ATTR_BIT(va_total_alloc) |
1922			VNODE_ATTR_BIT(va_data_size) |
1923			VNODE_ATTR_BIT(va_data_alloc) |
1924			VNODE_ATTR_BIT(va_iosize) |
1925			VNODE_ATTR_BIT(va_uid) |
1926			VNODE_ATTR_BIT(va_gid) |
1927			VNODE_ATTR_BIT(va_mode) |
1928			VNODE_ATTR_BIT(va_flags) |
1929			VNODE_ATTR_BIT(va_create_time) |
1930			VNODE_ATTR_BIT(va_access_time) |
1931			VNODE_ATTR_BIT(va_modify_time) |
1932			VNODE_ATTR_BIT(va_change_time) |
1933			VNODE_ATTR_BIT(va_fileid) |
1934			VNODE_ATTR_BIT(va_fsid) |
1935			VNODE_ATTR_BIT(va_filerev) |
1936			VNODE_ATTR_BIT(va_gen) |
1937			VNODE_ATTR_BIT(va_encoding) |
1938			0;
1939	/*
1940	 * Return va_parentid, i.e. the mft record number of the parent of the
1941	 * inode, if it was requested.
1942	 *
1943	 * We have to return 1, i.e. fsRtParID, for the parent inode number of
1944	 * the root directory inode for compatibility with Carbon.  Simillarly
1945	 * we have to return 2, i.e. fsRtDirID, if the parent inode is the root
1946	 * directory inode.
1947	 *
1948	 * For all other inodes we try to get the parent from the vnode and if
1949	 * it does not have the vnode cached then if the inode is an attribute
1950	 * inode we return the inode number of the base inode (in line with how
1951	 * named streams work on Mac OS X) and otherwise we obtain the parent
1952	 * mft reference by looking up a filename attribute record in the mft
1953	 * record of the inode and obtaining the parent mft record reference
1954	 * from there.
1955	 *
1956	 * There is one pitfall with this approach for files and that is that a
1957	 * file may have multiple parents and we are returning a random one but
1958	 * that is the best we can do.
1959	 *
1960	 * To make this a little better we get the name at the same time as we
1961	 * get the parent mft reference so we can at least return a parent id
1962	 * and name that match, i.e. the name is present in the parent id.
1963	 *
1964	 * And to make this even better, when the parent is requested and a
1965	 * name is cached in the vnode, we use the name in the vnode to find
1966	 * the parent that matches that name if it exists.  If it does not
1967	 * exist we revert to finding a random parent.
1968	 */
1969	if (VATTR_IS_ACTIVE(va, va_parentid)) {
1970		ino64_t parent_mft_no;
1971		vnode_t parent_vn;
1972
1973		if (is_root && base_ni == ni)
1974			VATTR_RETURN(va, va_parentid, 1);
1975		else if ((parent_vn = vnode_getparent(ni->vn))) {
1976			parent_mft_no = NTFS_I(parent_vn)->mft_no;
1977			(void)vnode_put(parent_vn);
1978			have_parent = TRUE;
1979			if (parent_mft_no == FILE_root)
1980				parent_mft_no = 2;
1981			VATTR_RETURN(va, va_parentid, parent_mft_no);
1982		} else if (ni != base_ni) {
1983			parent_mft_no = base_ni->mft_no;
1984			if (parent_mft_no == FILE_root)
1985				parent_mft_no = 2;
1986			VATTR_RETURN(va, va_parentid, parent_mft_no);
1987		} else /* if (ni == base_ni) */ {
1988			name_is_done = TRUE;
1989			name = NULL;
1990			if (VATTR_IS_ACTIVE(va, va_name))
1991				name = va->va_name;
1992			err = ntfs_inode_get_name_and_parent_mref(base_ni,
1993					FALSE, &parent_mref, name);
1994			if (err) {
1995				ntfs_error(base_ni->vol->mp, "Failed to obtain "
1996						"parent mft reference for "
1997						"mft_no 0x%llx (error %d).",
1998						(unsigned long long)
1999						base_ni->mft_no, err);
2000				goto err;
2001			}
2002			parent_mft_no = MREF(parent_mref);
2003			if (parent_mft_no == FILE_root)
2004				parent_mft_no = 2;
2005			va->va_parentid = parent_mft_no;
2006			va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
2007					(name ? VNODE_ATTR_BIT(va_name) : 0);
2008		}
2009	}
2010	/*
2011	 * Return va_name, i.e. the name of the inode, if it was requested.
2012	 *
2013	 * If this is the root directory of the volume, leave it to the VFS to
2014	 * find the mounted-on name, which is different from the real volume
2015	 * root directory name of "." (this is ensured by the fact that
2016	 * @name_is_done was set to TRUE for the root directory earlier).
2017	 *
2018	 * For all other inodes we try to get the name from the vnode and if it
2019	 * does not have the name cached we obtain the name by looking up a
2020	 * filename attribute record in the mft record of the inode and using
2021	 * that.
2022	 *
2023	 * Note we do not need to do anything if we dealt with the name as part
2024	 * of dealing with va_parentid above.  In this case @name_is_done will
2025	 * be set to true.
2026	 *
2027	 * Also we do not need to do anything if we tried to deal with
2028	 * va_parentid above and failed as we would only fail again here.  This
2029	 * means that if @err is not zero we skip the call to
2030	 * ntfs_inode_get_name_and_parent_mref().
2031	 *
2032	 * TODO: What do we return for attribute inodes?  Shall we exclude them
2033	 * from VNOP_GETATTR() altogether?  For now we simply do not return a
2034	 * name for them.
2035	 */
2036	if (!name_is_done && VATTR_IS_ACTIVE(va, va_name) && ni == base_ni) {
2037		name = vnode_getname(base_ni->vn);
2038		if (name) {
2039			(void)strlcpy(va->va_name, name, MAXPATHLEN - 1);
2040			VATTR_SET_SUPPORTED(va, va_name);
2041			(void)vnode_putname(name);
2042		} else {
2043			err = ntfs_inode_get_name_and_parent_mref(base_ni,
2044					have_parent, &parent_mref, va->va_name);
2045			if (err) {
2046				ntfs_error(base_ni->vol->mp, "Failed to obtain "
2047						"parent mft reference for "
2048						"mft_no 0x%llx (error %d).",
2049						(unsigned long long)
2050						base_ni->mft_no, err);
2051				goto err;
2052			}
2053			/*
2054			 * We forcibly overwrite the parent id with the
2055			 * possibly new parent id here to be consistent with
2056			 * the name, i.e. we want the name we return to
2057			 * actually exist in the returned parent.
2058			 *
2059			 * If we already had the parent id from before then
2060			 * ntfs_inode_get_name_and_parent_mref() will have
2061			 * found the name matching this parent id thus our
2062			 * setting of the parent id here will be a no-op.
2063			 */
2064			va->va_parentid = MREF(parent_mref);
2065			if (va->va_parentid == FILE_root)
2066				va->va_parentid = 2;
2067			va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
2068					VNODE_ATTR_BIT(va_name);
2069		}
2070	}
2071	/*
2072	 * Unlock the attribute inode as we do not need it any more and so we
2073	 * cannot deadlock with converting the lock on the base inode to
2074	 * exclusive and with the call to ntfs_inode_afpinfo_read() below.
2075	 */
2076	if (ni != base_ni)
2077		lck_rw_unlock_shared(&ni->lock);
2078	if (VATTR_IS_ACTIVE(va, va_backup_time)) {
2079		if (!NInoValidBackupTime(base_ni)) {
2080			if (!lck_rw_lock_shared_to_exclusive(&base_ni->lock)) {
2081				lck_rw_lock_exclusive(&base_ni->lock);
2082				if (NInoDeleted(base_ni)) {
2083					cache_purge(base_ni->vn);
2084					lck_rw_unlock_exclusive(&base_ni->lock);
2085					return ENOENT;
2086				}
2087			}
2088			lock = LCK_RW_TYPE_EXCLUSIVE;
2089			/*
2090			 * Load the AFP_AfpInfo stream and initialize the
2091			 * backup time and Finder Info (if they are not already
2092			 * valid).
2093			 */
2094			err = ntfs_inode_afpinfo_read(base_ni);
2095			if (err) {
2096				ntfs_error(base_ni->vol->mp, "Failed to "
2097						"read AFP_AfpInfo attribute "
2098						"from inode 0x%llx (error "
2099						"%d).", (unsigned long long)
2100						base_ni->mft_no, err);
2101				lck_rw_unlock_exclusive(&base_ni->lock);
2102				return err;
2103			}
2104			if (!NInoValidBackupTime(base_ni))
2105				panic("%s(): !NInoValidBackupTime(base_ni)\n",
2106						__FUNCTION__);
2107		}
2108		VATTR_RETURN(va, va_backup_time, base_ni->backup_time);
2109	}
2110	if (lock == LCK_RW_TYPE_SHARED)
2111		lck_rw_unlock_shared(&base_ni->lock);
2112	else
2113		lck_rw_unlock_exclusive(&base_ni->lock);
2114	ntfs_debug("Done.");
2115	return 0;
2116err:
2117	lck_rw_unlock_shared(&ni->lock);
2118	if (ni != base_ni)
2119		lck_rw_unlock_shared(&base_ni->lock);
2120	return err;
2121}
2122
2123/**
2124 * ntfs_vnop_setattr - set attributes of a vnode or of the mounted volume
2125 * @a:		arguments to setattr function
2126 *
2127 * @a contains:
2128 *	vnode_t a_vp;			vnode of which to set attributes
2129 *	struct vnode_attr *a_vap;	attributes to set and source
2130 *	vfs_context_t a_context;
2131 *
2132 * Set the attributes described by @a_vap in the vnode @a_vp.  Some attributes
2133 * are intercepted by the VFS in setattrlist() and setvolattrlist() so we do
2134 * not bother with them.
2135 *
2136 * At present we do not support all attributes.  We declare what we support to
2137 * the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so
2138 * do not forget to update that when support for further attributes is added
2139 * here.
2140 *
2141 * Return 0 on success and errno on error.
2142 *
2143 * TODO: Implement more attributes.
2144 */
2145static int ntfs_vnop_setattr(struct vnop_setattr_args *a)
2146{
2147	ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
2148	ntfs_volume *vol;
2149	struct vnode_attr *va = a->a_vap;
2150	errno_t err = 0;
2151	BOOL dirty_times = FALSE;
2152
2153	if (!ni) {
2154		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
2155		return EINVAL;
2156	}
2157	vol = ni->vol;
2158	ntfs_debug("Entering for mft_no 0x%llx.",
2159			(unsigned long long)ni->mft_no);
2160	base_ni = ni;
2161	if (NInoAttr(ni)) {
2162		base_ni = ni->base_ni;
2163		lck_rw_lock_exclusive(&base_ni->lock);
2164	}
2165	lck_rw_lock_exclusive(&ni->lock);
2166	/* Do not allow messing with the inode once it has been deleted. */
2167	if (NInoDeleted(ni)) {
2168		/* Remove the inode from the name cache. */
2169		cache_purge(ni->vn);
2170		err = ENOENT;
2171		goto unl_err;
2172	}
2173	if (VATTR_IS_ACTIVE(va, va_data_size)) {
2174		ntfs_debug("Changing size for mft_no 0x%llx to 0x%llx.",
2175				(unsigned long long)ni->mft_no,
2176				(unsigned long long)va->va_data_size);
2177#if 1		// TODO: Remove this when sparse support is done...
2178		if (NInoSparse(ni)) {
2179			err = ENOTSUP;
2180			goto unl_err;
2181		}
2182#endif
2183		/*
2184		 * Do not allow calling for $MFT/$DATA as it would destroy the
2185		 * volume.
2186		 *
2187		 * Also only allow setting the size of VREG vnodes as that
2188		 * covers both regular files and named streams whilst excluding
2189		 * symbolic links for example.
2190		 */
2191		if (vnode_vtype(ni->vn) != VREG ||
2192				(!ni->mft_no && !NInoAttr(ni)))
2193			err = EPERM;
2194		else
2195			err = ntfs_attr_resize(ni, va->va_data_size,
2196					va->va_vaflags & 0xffff, NULL);
2197		if (err) {
2198			ntfs_error(vol->mp, "Failed to set inode size (error "
2199					"%d).", err);
2200			goto unl_err;
2201		}
2202		VATTR_SET_SUPPORTED(va, va_data_size);
2203	}
2204	/*
2205	 * Unlock the attribute inode as we do not need it any more and so we
2206	 * cannot deadlock with the call to ntfs_inode_afpinfo_write() below.
2207	 */
2208	if (ni != base_ni)
2209		lck_rw_unlock_exclusive(&ni->lock);
2210	if (VATTR_IS_ACTIVE(va, va_flags)) {
2211		u32 flags = va->va_flags;
2212		BOOL dirty_flags = FALSE;
2213
2214		/*
2215		 * Only allow changing of supported flags.  There are two
2216		 * exceptions and those are the archived flag and read-only bit
2217		 * on directories which are not supported on NTFS but we have
2218		 * to ignore them or too many things break such as "cp -pr"
2219		 * from a more sensible file system.
2220		 */
2221		if (flags & ~(SF_ARCHIVED | SF_IMMUTABLE | UF_IMMUTABLE |
2222				UF_HIDDEN /* | SF_COMPRESSED */)) {
2223			ntfs_error(vol->mp, "Cannot set unsupported flags "
2224					"0x%x.",
2225					(unsigned)(flags & ~(SF_ARCHIVED |
2226					SF_IMMUTABLE | UF_IMMUTABLE |
2227					UF_HIDDEN)));
2228			err = EINVAL;
2229			goto err;
2230		}
2231		/*
2232		 * We do not allow modification for any of the core NTFS
2233		 * system files which we want to remain as they are except that
2234		 * we silently ignore changes to the root directory.
2235		 */
2236		if (base_ni->mft_no < FILE_first_user &&
2237				base_ni != vol->root_ni) {
2238			ntfs_error(vol->mp, "Refusing to change flags on core "
2239					"NTFS system file (mft_no 0x%llx).",
2240					(unsigned long long)base_ni->mft_no);
2241			err = EPERM;
2242			goto err;
2243		}
2244		/*
2245		 * We currently do not support changing the compression state
2246		 * of a vnode.
2247		 *
2248		 * Further, only the base inode may be compressed.
2249		 */
2250/*
2251 *		if (((flags & SF_COMPRESSED) && !NInoCompressed(ni)) ||
2252 *				(!(flags & SF_COMPRESSED) &&
2253 *				NInoCompressed(ni))) {
2254 *			if (ni != base_ni) {
2255 *				ntfs_error(vol->mp, "Only regular files and "
2256 *						"directories may be "
2257 *						"compressed, aborting.");
2258 *				err = EINVAL;
2259 *				goto err;
2260 *			}
2261 *			ntfs_warning(vol->mp, "Changing the compression state "
2262 *					"is not supported at present, "
2263 *					"returning ENOTSUP.");
2264 *			err = ENOTSUP;
2265 *			goto err;
2266 *		}
2267 */
2268		/*
2269		 * The root directory of a volume always has the hidden bit set
2270		 * but we pretend that it is not hidden to OS X and we do not
2271		 * allow this bit to be modified for the root directory.
2272		 */
2273		if (base_ni != vol->root_ni) {
2274			/*
2275			 * If the Finder info is valid need to update it as
2276			 * well.  Note setting or clearing the hidden flag in
2277			 * the Finder info does not cause the Finder info to
2278			 * become dirty as the hidden bit is not stored on disk
2279			 * in the Finder info.
2280			 */
2281			if (flags & UF_HIDDEN) {
2282				base_ni->file_attributes |= FILE_ATTR_HIDDEN;
2283				if (NInoValidFinderInfo(base_ni))
2284					base_ni->finder_info.attrs |=
2285							FINDER_ATTR_IS_HIDDEN;
2286			} else {
2287				base_ni->file_attributes &= ~FILE_ATTR_HIDDEN;
2288				if (NInoValidFinderInfo(base_ni))
2289					base_ni->finder_info.attrs &=
2290							~FINDER_ATTR_IS_HIDDEN;
2291			}
2292			dirty_flags = TRUE;
2293		}
2294		/*
2295		 * Windows does not allow users to set/clear the read-only bit
2296		 * on directories.  In fact Windows uses the read-only bit on a
2297		 * directory to signify that a customized or specialized folder
2298		 * view is in effect thus we do not allow setting/clearing the
2299		 * read-only bit on directories from OS X.
2300		 *
2301		 * Windows does not set the "needs archiving" bit on
2302		 * directories.
2303		 *
2304		 * The only exception are encrypted directories which do have
2305		 * the "needs archiving" bit set but we do not want to allow
2306		 * this bit to be cleared so ignore them, too.
2307		 */
2308		if (!S_ISDIR(base_ni->mode)) {
2309			if (flags & (SF_IMMUTABLE | UF_IMMUTABLE))
2310				base_ni->file_attributes |= FILE_ATTR_READONLY;
2311			else
2312				base_ni->file_attributes &= ~FILE_ATTR_READONLY;
2313			if (flags & SF_ARCHIVED)
2314				base_ni->file_attributes &= ~FILE_ATTR_ARCHIVE;
2315			else
2316				base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
2317			dirty_flags = TRUE;
2318		}
2319		if (dirty_flags)
2320			NInoSetDirtyFileAttributes(base_ni);
2321		VATTR_SET_SUPPORTED(va, va_flags);
2322	}
2323	if (VATTR_IS_ACTIVE(va, va_create_time)) {
2324		base_ni->creation_time = va->va_create_time;
2325		VATTR_SET_SUPPORTED(va, va_create_time);
2326		dirty_times = TRUE;
2327	}
2328	if (VATTR_IS_ACTIVE(va, va_modify_time)) {
2329		base_ni->last_data_change_time = va->va_modify_time;
2330		VATTR_SET_SUPPORTED(va, va_modify_time);
2331		dirty_times = TRUE;
2332		/*
2333		 * The following comment came from the HFS code:
2334		 *
2335		 * <quote>The utimes system call can reset the modification
2336		 * time but it doesn't know about HFS create times.  So we need
2337		 * to ensure that the creation time is always at least as old
2338		 * as the modification time.</quote>
2339		 *
2340		 * SMB also follows this behaviour and it also adds the
2341		 * following comment:
2342		 *
2343		 * <quote>The HFS code also checks to make sure it was not the
2344		 * root vnode. Don Brady said that the SMB code should not use
2345		 * that part of the check.</quote>
2346		 *
2347		 * I assume the root vnode check is there in HFS as it does not
2348		 * support times on the root vnode at all so the check is
2349		 * needed for HFS only.
2350		 *
2351		 * The same applies for NTFS so follow the HFS/SMB behaviour.
2352		 *
2353		 * One salient point is that we only do the above if the
2354		 * creation time is not being explicitly set already.
2355		 */
2356		if (!VATTR_IS_ACTIVE(va, va_create_time) &&
2357				(va->va_modify_time.tv_sec <
2358				base_ni->creation_time.tv_sec ||
2359				(va->va_modify_time.tv_sec ==
2360				base_ni->creation_time.tv_sec &&
2361				va->va_modify_time.tv_nsec <
2362				base_ni->creation_time.tv_nsec)))
2363			base_ni->creation_time = va->va_modify_time;
2364	}
2365	if (VATTR_IS_ACTIVE(va, va_change_time)) {
2366		base_ni->last_mft_change_time = va->va_change_time;
2367		VATTR_SET_SUPPORTED(va, va_change_time);
2368		dirty_times = TRUE;
2369	}
2370	if (VATTR_IS_ACTIVE(va, va_access_time)) {
2371		base_ni->last_access_time = va->va_access_time;
2372		VATTR_SET_SUPPORTED(va, va_access_time);
2373		dirty_times = TRUE;
2374	}
2375	if (dirty_times)
2376		NInoSetDirtyTimes(base_ni);
2377	if (VATTR_IS_ACTIVE(va, va_backup_time)) {
2378		base_ni->backup_time = va->va_backup_time;
2379		NInoSetValidBackupTime(base_ni);
2380		NInoSetDirtyBackupTime(base_ni);
2381		/*
2382		 * Now write (if needed creating) the AFP_AfpInfo attribute
2383		 * with the specified backup time.
2384		 */
2385		err = ntfs_inode_afpinfo_write(base_ni);
2386		if (err) {
2387			ntfs_error(vol->mp, "Failed to write/create "
2388					"AFP_AfpInfo attribute in inode "
2389					"0x%llx (error %d).",
2390					(unsigned long long)base_ni->mft_no,
2391					err);
2392			goto err;
2393		}
2394		VATTR_SET_SUPPORTED(va, va_backup_time);
2395	}
2396	ntfs_debug("Done.");
2397err:
2398	lck_rw_unlock_exclusive(&base_ni->lock);
2399	return err;
2400unl_err:
2401	if (ni != base_ni)
2402		lck_rw_unlock_exclusive(&ni->lock);
2403	goto err;
2404}
2405
/*
 * Upper bound (256MiB) for the size of a single internal i/o request.  We
 * limit it so that the size can always be represented in a 32-bit int.
 */
#define NTFS_MAX_IO_REQUEST_SIZE	(256 * 1024 * 1024)
2408
2409/**
2410 * ntfs_vnop_read_compressed - read from a compressed attribute
2411 * @ni:		ntfs inode describing the compressed attribute to read
2412 * @uio:	destination in which to return the read data
2413 * @data_size:	data size of the compressed attribute
2414 * @ioflags:	flags further describing the read request (see ntfs_vnop_read())
2415 *
2416 * This is a helper function for ntfs_vnop_read() (see below).  It is called
2417 * when a read request for a compressed attribute is received by
2418 * ntfs_vnop_read().
2419 *
2420 * This function is somewhat similar to cluster_read() or to be more precise to
2421 * cluster_read_copy() in that it breaks up large i/os into smaller manageable
2422 * chunks, and for each chunk tries to get the data from the vm page cache and
2423 * return it in the destination buffer described by @uio and failing that, it
2424 * creates and maps a upl and causes it to be filled with data by calling
2425 * ntfs_read_compressed() which reads the compressed data via the raw inode and
2426 * decompresses it into our mapped upl and once that is done we now have the
2427 * data in the vm page cache and copy it into the destination buffer described
2428 * by @uio.
2429 *
2430 * Return 0 on success and errno on error.
2431 */
2432static inline int ntfs_vnop_read_compressed(ntfs_inode *ni, uio_t uio,
2433		const s64 data_size, int ioflags)
2434{
2435	s64 size;
2436	user_ssize_t start_count;
2437	off_t ofs;
2438	vnode_t vn = ni->vn;
2439	ntfs_inode *raw_ni;
2440	upl_t upl;
2441	upl_page_info_t *pl;
2442	kern_return_t kerr;
2443	int count, err, align_mask, cur_pg, last_pg;
2444	int max_upl_size = ubc_upl_maxbufsize();
2445
2446	ofs = uio_offset(uio);
2447	start_count = uio_resid(uio);
2448	ntfs_debug("Entering for compressed file inode 0x%llx, offset 0x%llx, "
2449			"count 0x%llx, ioflags 0x%x.",
2450			(unsigned long long)ni->mft_no,
2451			(unsigned long long)ofs,
2452			(unsigned long long)start_count, ioflags);
2453	/*
2454	 * We can only read from regular files and named streams that are
2455	 * compressed and non-resident.  We should never be called for anything
2456	 * else.
2457	 */
2458	if (ni->type != AT_DATA || !NInoCompressed(ni) ||
2459			!NInoNonResident(ni) || NInoEncrypted(ni) ||
2460			NInoRaw(ni))
2461		panic("%s(): Called for inappropriate inode.\n", __FUNCTION__);
2462	/*
2463	 * Get the raw inode.  We take the inode lock shared to protect against
2464	 * concurrent writers as the compressed data is invalid whilst a write
2465	 * is in progress.
2466	 */
2467	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
2468	if (err) {
2469		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
2470				err);
2471		return err;
2472	}
2473	if (!NInoRaw(raw_ni))
2474		panic("%s(): Requested raw inode but got non-raw one.\n",
2475				__FUNCTION__);
2476	lck_spin_lock(&raw_ni->size_lock);
2477	size = ubc_getsize(raw_ni->vn);
2478	if (size != raw_ni->data_size)
2479		panic("%s(): size != raw_ni->data_size\n", __FUNCTION__);
2480	lck_spin_unlock(&raw_ni->size_lock);
2481	/*
2482	 * If nothing was requested or the request starts at or beyond the end
2483	 * of the attribute, we do not need to do anything.
2484	 */
2485	if (!start_count || ofs >= data_size) {
2486		err = 0;
2487		goto err;
2488	}
2489	/* Cannot read from a negative offset. */
2490	if (ofs < 0) {
2491		err = EINVAL;
2492		goto err;
2493	}
2494	if (vnode_isnocache(vn) || vnode_isnocache(raw_ni->vn))
2495		ioflags |= IO_NOCACHE;
2496	if (vnode_isnoreadahead(vn) || vnode_isnoreadahead(raw_ni->vn))
2497		ioflags |= IO_RAOFF;
2498	align_mask = ni->compression_block_size - 1;
2499	if (align_mask < PAGE_MASK)
2500		align_mask = PAGE_MASK;
2501	/*
2502	 * Loop until we have finished the whole request or reached the end of
2503	 * the attribute.
2504	 *
2505	 * FIXME: We do not bother with read-ahead on the uncompressed vnode
2506	 * for now except to the extent that we always decompress full
2507	 * compression blocks which may be larger than the current i/o request
2508	 * so the next i/o request will find the whole compression block
2509	 * decompressed in the vm page cache thus small reads will in effect
2510	 * experience a certain amount of read-ahead in this way.
2511	 */
2512	do {
2513		u8 *kaddr;
2514		int delta, next_pg, orig_count;
2515
2516		size = data_size - ofs;
2517		if (size > start_count)
2518			size = start_count;
2519		count = size;
2520		/*
2521		 * Break up the i/o in chunks that fit into a 32-bit int so
2522		 * we can call cluster_copy_ubc_data(), etc.
2523		 */
2524		if (size > NTFS_MAX_IO_REQUEST_SIZE)
2525			count = NTFS_MAX_IO_REQUEST_SIZE;
2526		/*
2527		 * First of all, try to copy the data from the vm page cache.
2528		 * This will work on the second and all later reads so this is
2529		 * the hot path.  If the attribute has not been accessed at all
2530		 * before or its cached pages were dropped due to vm pressure
2531		 * this will fail to copy any data due to the lack of a valid
2532		 * page and we will drop into the slow path.
2533		 */
2534		if (!(ioflags & IO_NOCACHE)) {
2535			err = cluster_copy_ubc_data(vn, uio, &count, 0);
2536			if (err) {
2537				/*
2538				 * The copying (uiomove()) failed with an
2539				 * error, abort.
2540				 */
2541				ntfs_error(ni->vol->mp,
2542						"cluster_copy_ubc_data() "
2543						"failed (error %d).", err);
2544				goto err;
2545			}
2546			/*
2547			 * @count is now set to the number of bytes remaining
2548			 * to be transferred.  If it is zero, it means all the
2549			 * pages were in the vm page cache so we can skip onto
2550			 * the next part of the i/o.
2551			 */
2552			if (!count)
2553				continue;
2554			ofs = uio_offset(uio);
2555		}
2556		/*
2557		 * Only some or none of the pages were in the vm page cache or
2558		 * this is not a cached i/o.  First align this i/o request to
2559		 * compression block boundaries and to PAGE_SIZE boundaries and
2560		 * truncate it to the maximum upl size then create and map a
2561		 * page list so we can fill it with the data.
2562		 */
2563		delta = ofs & align_mask;
2564		ofs -= delta;
2565		orig_count = count;
2566		count += delta;
2567		count = (count + align_mask) & ~(off_t)align_mask;
2568		if (count > max_upl_size)
2569			count = max_upl_size;
2570		/*
2571		 * Do not exceed the attribute size except for a final partial
2572		 * page.
2573		 */
2574		size = (data_size - ofs + PAGE_MASK) & ~PAGE_MASK_64;
2575		if (count > size)
2576			count = size;
2577		start_count = count;
2578		kerr = ubc_create_upl(vn, ofs, count, &upl, &pl, UPL_SET_LITE);
2579		if (kerr != KERN_SUCCESS)
2580			panic("%s(): Failed to get page list (error %d).\n",
2581					__FUNCTION__, (int)kerr);
2582		kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
2583		if (kerr != KERN_SUCCESS) {
2584			ntfs_error(ni->vol->mp, "Failed to map page list "
2585					"(error %d).", (int)kerr);
2586			err = EIO;
2587			goto abort_err;
2588		}
2589		/*
2590		 * We know @ofs starts on both a compression block and a page
2591		 * boundary.  We read from the compressed raw vnode
2592		 * decompressing the data into our mapped page list.  Any
2593		 * already valid pages are automatically skipped.
2594		 */
2595		err = ntfs_read_compressed(ni, raw_ni, ofs, count, kaddr, pl,
2596				ioflags);
2597		if (err) {
2598			ntfs_error(ni->vol->mp, "Failed to decompress data "
2599					"(error %d).", err);
2600			goto unm_err;
2601		}
2602		/*
2603		 * We now have the entire page list filled with valid pages,
2604		 * thus we can now copy from the mapped page list into the
2605		 * destination buffer using uiomove().  We just need to make
2606		 * sure not to copy past the end of the attribute.
2607		 */
2608		ofs += delta;
2609		count -= delta;
2610		if (count > orig_count)
2611			count = orig_count;
2612		if (ofs + count > data_size)
2613			count = data_size - ofs;
2614		err = uiomove((caddr_t)(kaddr + delta), count, uio);
2615		if (err) {
2616			ntfs_error(ni->vol->mp, "uiomove() failed (error %d).",
2617					err);
2618			goto unm_err;
2619		}
2620		kerr = ubc_upl_unmap(upl);
2621		if (kerr != KERN_SUCCESS) {
2622			ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed "
2623					"(error %d).", (int)kerr);
2624			err = EIO;
2625			goto abort_err;
2626		}
2627		/*
2628		 * We are done with the page list, commit and/or abort the
2629		 * pages.
2630		 */
2631		next_pg = 0;
2632		last_pg = start_count >> PAGE_SHIFT;
2633		do {
2634			int commit_flags;
2635			BOOL was_valid, was_dirty;
2636
2637			cur_pg = next_pg;
2638			/* Determine the state of the current first page. */
2639			was_valid = upl_valid_page(pl, cur_pg);
2640			was_dirty = (was_valid && upl_dirty_page(pl, cur_pg));
2641			/* Find sequential pages of the same state. */
2642			for (next_pg = cur_pg + 1; next_pg < last_pg;
2643					next_pg++) {
2644				if (was_valid != upl_valid_page(pl, next_pg))
2645					break;
2646				if (was_valid) {
2647					if (was_dirty != upl_dirty_page(pl,
2648							next_pg))
2649						break;
2650				}
2651			}
2652			count = (next_pg - cur_pg) << PAGE_SHIFT;
2653			/*
2654			 * For a set of pages that were invalid and hence we
2655			 * just filled them with data we commit and clean them
2656			 * unless no caching is requested in which case we dump
2657			 * them.
2658			 *
2659			 * For a set of pages that were already valid and hence
2660			 * we did not touch we commit them taking care to
2661			 * preserve any dirty state unless the pages were clean
2662			 * and no caching is requested in which case we dump
2663			 * them.
2664			 */
2665			if (ioflags & IO_NOCACHE && !was_dirty) {
2666				ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT,
2667						count, UPL_ABORT_DUMP_PAGES |
2668						UPL_ABORT_FREE_ON_EMPTY);
2669				continue;
2670			}
2671			commit_flags = UPL_COMMIT_FREE_ON_EMPTY |
2672					UPL_COMMIT_INACTIVATE;
2673			if (!was_valid)
2674				commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
2675			else if (was_dirty)
2676				commit_flags |= UPL_COMMIT_SET_DIRTY;
2677			ubc_upl_commit_range(upl, cur_pg << PAGE_SHIFT, count,
2678					commit_flags);
2679		} while (next_pg < last_pg);
2680	} while ((start_count = uio_resid(uio)) &&
2681			(ofs = uio_offset(uio)) < data_size);
2682	ntfs_debug("Done.");
2683err:
2684	lck_rw_unlock_shared(&raw_ni->lock);
2685	(void)vnode_put(raw_ni->vn);
2686	return err;
2687unm_err:
2688	kerr = ubc_upl_unmap(upl);
2689	if (kerr != KERN_SUCCESS)
2690		ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
2691				(int)kerr);
2692abort_err:
2693	/*
2694	 * We handle each page independently for simplicity.  We do not care
2695	 * for performance given this is an error code path.
2696	 *
2697	 * For a page that was not valid, we dump it as it still does not
2698	 * contain valid data.  For a page that was valid, we release it
2699	 * without modification as we have not touched it unless no caching is
2700	 * requested and the page was clean in which case we dump it.
2701	 */
2702	last_pg = start_count >> PAGE_SHIFT;
2703	for (cur_pg = 0; cur_pg < last_pg; cur_pg++) {
2704		int abort_flags;
2705
2706		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
2707		if (!upl_valid_page(pl, cur_pg) || (ioflags & IO_NOCACHE &&
2708				!upl_dirty_page(pl, cur_pg)))
2709			abort_flags |= UPL_ABORT_DUMP_PAGES;
2710		ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT, PAGE_SIZE,
2711				abort_flags);
2712	}
2713	goto err;
2714}
2715
2716// TODO: Rename to ntfs_inode_read and move to ntfs_inode.[hc]?
2717/**
2718 * ntfs_read - read a number of bytes from an inode into memory
2719 * @ni:		ntfs inode whose data to read into memory
2720 * @uio:	destination in which to return the read data
2721 * @ioflags:	flags further describing the read request
2722 * @locked:	if true the ntfs inode lock is already taken for reading
2723 *
2724 * Read uio_resid(@uio) bytes from the ntfs inode @ni, starting at byte offset
2725 * uio_offset(@uio) into the inode into the destination buffer pointed to by
2726 * @uio.
2727 *
2728 * The flags in @ioflags further describe the read request.  The following
2729 * ioflags are currently defined in OS X kernel (a lot of them are not
2730 * applicable to VNOP_READ() however):
2731 *	IO_UNIT		- Do i/o as atomic unit.
2732 *	IO_APPEND	- Append write to end.
2733 *	IO_SYNC		- Do i/o synchronously.
2734 *	IO_NODELOCKED	- Underlying node already locked.
2735 *	IO_NDELAY	- FNDELAY flag set in file table.
2736 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
2737 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
2738 *	IO_HEADZEROFILL	- Zero fills at the head of write.
2739 *	IO_NOZEROVALID	- Do not zero fill if valid page.
2740 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
2741 *	IO_CLOSE	- The i/o was issued from close path.
2742 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
2743 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
2744 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
2745 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
2746 *
2747 * For encrypted attributes we abort for now as we do not support them yet.
2748 *
2749 * For non-resident attributes we use cluster_read_ext() which deals with both
2750 * normal and multi sector transfer protected attributes and
2751 * ntfs_vnop_read_compressed() which deals with compressed attributes.
2752 *
2753 * For resident attributes we read the data from the vm page cache and if it is
2754 * not there we cause the vm page cache to be populated by reading the buffer
2755 * at offset 0 in the attribute.
2756 *
2757 * Return 0 on success and errno on error.
2758 *
2759 * Note it is up to the caller to verify that reading from the inode @ni makes
2760 * sense.  We cannot do the verification inside ntfs_read() as it is called
2761 * from various VNOPs which all have different requirements.  For example
 * VNOP_READLINK(), i.e. ntfs_vnop_readlink(), needs to only allow S_ISLNK()
 * inodes whilst VNOP_READ(), i.e. ntfs_vnop_read(), needs to not allow
 * S_ISLNK() but needs to allow S_ISREG() instead but only if it is not a
 * system file.
2766 */
2767static errno_t ntfs_read(ntfs_inode *ni, uio_t uio, const int ioflags,
2768		const BOOL locked)
2769{
2770	s64 size;
2771	user_ssize_t start_count;
2772	off_t ofs;
2773	vnode_t vn = ni->vn;
2774	ntfs_inode *base_ni;
2775	upl_t upl;
2776	upl_page_info_array_t pl;
2777	u8 *kaddr;
2778	int err, count;
2779
2780	ofs = uio_offset(uio);
2781	start_count = uio_resid(uio);
2782	base_ni = ni;
2783	if (NInoAttr(ni))
2784		base_ni = ni->base_ni;
2785	ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count "
2786			"0x%llx, ioflags 0x%x, locked is %s.",
2787			(unsigned long long)ni->mft_no,
2788			(unsigned long long)ofs,
2789			(unsigned long long)start_count, ioflags,
2790			locked ? "true" : "false");
2791	/*
2792	 * Protect against changes in initialized_size and thus against
2793	 * truncation also.
2794	 */
2795	if (!locked)
2796		lck_rw_lock_shared(&ni->lock);
2797	/* Do not allow messing with the inode once it has been deleted. */
2798	if (NInoDeleted(ni)) {
2799		if (!locked)
2800			lck_rw_unlock_shared(&ni->lock);
2801		/* Remove the inode from the name cache. */
2802		cache_purge(ni->vn);
2803		return ENOENT;
2804	}
2805	/*
2806	 * TODO: This check may no longer be necessary now that we lock against
2807	 * changes in initialized size and thus truncation...  Revisit this
2808	 * issue when the write code has been written and remove the check if
2809	 * appropriate simply using ubc_getsize(vn); without the size_lock.
2810	 */
2811	lck_spin_lock(&ni->size_lock);
2812	size = ubc_getsize(vn);
2813	if (size > ni->data_size)
2814		size = ni->data_size;
2815	lck_spin_unlock(&ni->size_lock);
2816	/*
2817	 * If nothing was requested or the request starts at or beyond the end
2818	 * of the attribute, we do not need to do anything.
2819	 */
2820	if (!start_count || ofs >= size) {
2821		err = 0;
2822		goto err;
2823	}
2824	/* Cannot read from a negative offset. */
2825	if (ofs < 0) {
2826		err = EINVAL;
2827		goto err;
2828	}
2829	/* TODO: Deny access to encrypted attributes, just like NT4. */
2830	if (NInoEncrypted(ni)) {
2831		ntfs_warning(ni->vol->mp, "Denying access to encrypted "
2832				"attribute (EACCES).");
2833		err = EACCES;
2834		goto err;
2835	}
2836	if (NInoNonResident(ni)) {
2837		int (*callback)(buf_t, void *);
2838
2839		if (NInoCompressed(ni) && !NInoRaw(ni)) {
2840			err = ntfs_vnop_read_compressed(ni, uio, size, ioflags);
2841			if (!err)
2842				ntfs_debug("Done (ntfs_vnop_read_compressed()"
2843						").");
2844			else
2845				ntfs_error(ni->vol->mp, "Failed ("
2846						"ntfs_vnop_read_compressed(), "
2847						"error %d).", err);
2848			goto err;
2849		}
2850		callback = NULL;
2851		if (NInoMstProtected(ni) || NInoEncrypted(ni))
2852			callback = ntfs_cluster_iodone;
2853		err = cluster_read_ext(vn, uio, size, ioflags, callback, NULL);
2854		if (!err)
2855			ntfs_debug("Done (cluster_read_ext()).");
2856		else
2857			ntfs_error(ni->vol->mp, "Failed for file inode "
2858					"0x%llx, start offset 0x%llx, start "
2859					"count 0x%llx, now offset 0x%llx, "
2860					"now count 0x%llx, ioflags 0x%x "
2861					"(cluster_read_ext(), error %d).",
2862					(unsigned long long)ni->mft_no,
2863					(unsigned long long)ofs,
2864					(unsigned long long)start_count,
2865					(unsigned long long)uio_offset(uio),
2866					(unsigned long long)uio_resid(uio),
2867					ioflags, err);
2868		goto err;
2869	} /* else if (!NInoNonResident(ni)) */
2870	/*
2871	 * That attribute is resident thus we have to deal with it by
2872	 * ourselves.  First of all, try to copy the data from the vm page
2873	 * cache.  This will work on the second and all later reads so this is
2874	 * the hot path.  If the attribute has not been accessed at all before
2875	 * or its cached pages were dropped due to vm pressure this will fail
2876	 * to copy any data due to the lack of a valid page and we will drop
2877	 * into the slow path.
2878	 */
2879	size -= ofs;
2880	if (size > start_count)
2881		size = start_count;
2882	if (size > PAGE_SIZE) {
2883		ntfs_warning(ni->vol->mp, "Unexpected count 0x%llx > PAGE_SIZE "
2884				"0x%x, overriding it to PAGE_SIZE.",
2885				(unsigned long long)size, PAGE_SIZE);
2886		size = PAGE_SIZE;
2887	}
2888	count = size;
2889	err = cluster_copy_ubc_data(vn, uio, &count, 0);
2890	if (err) {
2891		/* The copying (uiomove()) failed with an error, abort. */
2892		ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed "
2893				"(error %d).", err);
2894		goto err;
2895	}
2896	/*
2897	 * @count is now set to the number of bytes remaining to be
2898	 * transferred.  If it is zero, it means we are done.  Note it is
2899	 * possible that there is more data requested, i.e. uio_resid(uio) > 0,
2900	 * but that just means the request goes beyond the end of the
2901	 * attribute.
2902	 */
2903	if (!count) {
2904		ntfs_debug("Done (resident, cached, returned 0x%llx bytes).",
2905				(unsigned long long)size);
2906		goto err;
2907	}
2908	/*
2909	 * We failed to transfer everything.  That really means we failed to
2910	 * transfer anything at all as we are guaranteed that a resident
2911	 * attribute is smaller than a page thus either the page is there and
2912	 * valid and we transfer everything or it is not and we transfer
2913	 * nothing.
2914	 */
2915	if (count != size) {
2916		ntfs_warning(ni->vol->mp, "Unexpected partial transfer from "
2917				"cached page (size 0x%llx, count 0x%x).",
2918				(unsigned long long)size, count);
2919		ofs = uio_offset(uio);
2920	}
2921	/*
2922	 * The page is not in cache or is not valid.  We need to bring it into
2923	 * cache and make it valid so we can then copy the data out.  The
2924	 * easiest way to do this is to just map the page which will take care
2925	 * of everything for us.  We can than uiomove() straight out of the
2926	 * page into the @uio and then unmap the page again.
2927	 *
2928	 * Note this will take the inode lock again but this is ok as in both
2929	 * cases the lock is taken shared.
2930	 */
2931	err = ntfs_page_map(ni, 0, &upl, &pl, &kaddr, FALSE);
2932	if (err) {
2933		ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err);
2934		goto err;
2935	}
2936	err = uiomove((caddr_t)(kaddr + ofs), count, uio);
2937	ntfs_page_unmap(ni, upl, pl, FALSE);
2938	if (!err)
2939		ntfs_debug("Done (resident, not cached, returned 0x%llx "
2940				"bytes).", (unsigned long long)size -
2941				uio_resid(uio));
2942	else
2943		ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err);
2944err:
2945	/*
2946	 * Update the last_access_time (atime) if something was read and this
2947	 * is the base ntfs inode or it is a named stream (this is what HFS+
2948	 * does, too).
2949	 *
2950	 * Skip the update if atime updates are disabled via the noatime mount
2951	 * option or the volume is read only or this is a symbolic link.
2952	 *
2953	 * Also, skip the core system files except for the root directory.
2954	 */
2955	if (uio_resid(uio) < start_count && !NVolReadOnly(ni->vol) &&
2956			!(vfs_flags(ni->vol->mp) & MNT_NOATIME) &&
2957			!S_ISLNK(base_ni->mode) &&
2958			(ni == base_ni || ni->type == AT_DATA)) {
2959		BOOL need_update_time;
2960
2961		need_update_time = TRUE;
2962		if (ni->vol->major_ver > 1) {
2963			if (base_ni->mft_no <= FILE_Extend &&
2964					base_ni != ni->vol->root_ni)
2965				need_update_time = FALSE;
2966		} else {
2967			if (base_ni->mft_no <= FILE_UpCase &&
2968					base_ni != ni->vol->root_ni)
2969				need_update_time = FALSE;
2970		}
2971		if (need_update_time) {
2972			base_ni->last_access_time = ntfs_utc_current_time();
2973			NInoSetDirtyTimes(base_ni);
2974		}
2975	}
2976	if (!locked)
2977		lck_rw_unlock_shared(&ni->lock);
2978	return err;
2979}
2980
2981/**
2982 * ntfs_vnop_read - read a number of bytes from a file into memory
2983 * @a:		arguments to read function
2984 *
2985 * @a contains:
2986 *	vnode_t a_vp;		vnode of file whose data to read into memory
2987 *	uio_t a_uio;		destination in which to return the read data
2988 *	int a_ioflag;		flags further describing the read request
2989 *	vfs_context_t a_context;
2990 *
 * Read uio_resid(@a->a_uio) bytes from the vnode @a->a_vp, starting at byte
 * offset uio_offset(@a->a_uio) into the vnode into the destination buffer
 * pointed to by @a->a_uio.
2994 *
2995 * The flags in @a->a_ioflag further describe the read request.  The following
2996 * ioflags are currently defined in OS X kernel (a lot of them are not
2997 * applicable to VNOP_READ() however):
2998 *	IO_UNIT		- Do i/o as atomic unit.
2999 *	IO_APPEND	- Append write to end.
3000 *	IO_SYNC		- Do i/o synchronously.
3001 *	IO_NODELOCKED	- Underlying node already locked.
3002 *	IO_NDELAY	- FNDELAY flag set in file table.
3003 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
3004 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
3005 *	IO_HEADZEROFILL	- Zero fills at the head of write.
3006 *	IO_NOZEROVALID	- Do not zero fill if valid page.
3007 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
3008 *	IO_CLOSE	- The i/o was issued from close path.
3009 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
3010 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
3011 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
3012 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
3013 *
3014 * For encrypted attributes we abort for now as we do not support them yet.
3015 *
3016 * For non-resident attributes we use cluster_read_ext() which deals with both
3017 * normal and multi sector transfer protected attributes and
3018 * ntfs_vnop_read_compressed() which deals with compressed attributes.
3019 *
3020 * For resident attributes we read the data from the vm page cache and if it is
3021 * not there we cause the vm page cache to be populated by reading the buffer
3022 * at offset 0 in the attribute.
3023 *
3024 * Return 0 on success and errno on error.
3025 */
3026static int ntfs_vnop_read(struct vnop_read_args *a)
3027{
3028	vnode_t vn = a->a_vp;
3029	ntfs_inode *ni = NTFS_I(vn);
3030
3031	if (!ni) {
3032		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
3033		return EINVAL;
3034	}
3035	/*
3036	 * We can only read from regular files and named streams.
3037	 *
3038	 * Also, do not allow reading from system files or mst protected
3039	 * attributes.
3040	 */
3041	if (vnode_issystem(vn) || NInoMstProtected(ni) ||
3042			(!S_ISREG(ni->mode) && !(NInoAttr(ni) &&
3043			ni->type == AT_DATA))) {
3044		if (S_ISDIR(ni->mode))
3045			return EISDIR;
3046		return EPERM;
3047	}
3048	return (int)ntfs_read(ni, a->a_uio, a->a_ioflag, FALSE);
3049}
3050
3051// TODO: Rename to ntfs_inode_write and move to ntfs_inode.[hc]?
3052/**
3053 * ntfs_write - write a number of bytes from a memory buffer into a file
3054 * @ni:			ntfs inode to write to
3055 * @uio:		source containing the data to write
3056 * @ioflags:		flags further describing the write request
3057 * @write_locked:	if true the ntfs inode lock is already taken for writing
3058 *
3059 * Write uio_resid(@uio) bytes from the source buffer specified by @uio to the
3060 * ntfs inode @ni, starting at byte offset uio_offset(@uio) into the inode.
3061 *
3062 * The flags in @ioflags further describe the write request.  The following
3063 * ioflags are currently defined in OS X kernel (not all of them are applicable
3064 * to VNOP_WRITE() however):
3065 *	IO_UNIT		- Do i/o as atomic unit.
3066 *	IO_APPEND	- Append write to end.
3067 *	IO_SYNC		- Do i/o synchronously.
3068 *	IO_NODELOCKED	- Underlying node already locked.
3069 *	IO_NDELAY	- FNDELAY flag set in file table.
3070 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
3071 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
3072 *	IO_HEADZEROFILL	- Zero fills at the head of write.
3073 *	IO_NOZEROVALID	- Do not zero fill if valid page.
3074 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
3075 *	IO_CLOSE	- The i/o was issued from close path.
3076 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
3077 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
3078 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
3079 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
3080 *
3081 * For compressed and encrypted attributes we abort for now as we do not
3082 * support them yet.
3083 *
3084 * For non-resident attributes we use cluster_write_ext() which deals with
3085 * normal attributes.
3086 *
3087 * Return 0 on success and errno on error.
3088 *
3089 * Note it is up to the caller to verify that writing to the inode @ni makes
3090 * sense.  We cannot do the verification inside ntfs_write() as it is called
3091 * from various VNOPs which all have different requirements.  For example
 * VNOP_SYMLINK(), i.e. ntfs_vnop_symlink(), needs to write to S_ISLNK() inodes
 * whilst VNOP_WRITE(), i.e. ntfs_vnop_write(), needs to not allow S_ISLNK()
 * but needs to allow S_ISREG() instead but only if it is not a system file.
3095 */
3096static errno_t ntfs_write(ntfs_inode *ni, uio_t uio, int ioflags,
3097		BOOL write_locked)
3098{
3099	s64 old_size, size, end, nr_truncated;
3100	user_ssize_t old_count, count;
3101	off_t old_ofs, ofs;
3102	vnode_t vn = ni->vn;
3103	ntfs_inode *base_ni;
3104	upl_t upl;
3105	upl_page_info_array_t pl;
3106	u8 *kaddr;
3107	int cnt;
3108	errno_t err;
3109	BOOL was_locked, need_uptodate;
3110
3111	/* Do not allow writing if mounted read-only. */
3112	if (NVolReadOnly(ni->vol))
3113		return EROFS;
3114	nr_truncated = 0;
3115	ofs = old_ofs = uio_offset(uio);
3116	count = old_count = uio_resid(uio);
3117	ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count "
3118			"0x%llx, ioflags 0x%x, write_locked is %s.",
3119			(unsigned long long)ni->mft_no,
3120			(unsigned long long)ofs,
3121			(unsigned long long)count, ioflags,
3122			write_locked ? "true" : "false");
3123	/* If nothing to do return success. */
3124	if (!count)
3125		return 0;
3126	/* Cannot write to a negative offset. */
3127	if (ofs < 0)
3128		return EINVAL;
3129	/* TODO: Deny access to encrypted attributes, just like NT4. */
3130	if (NInoEncrypted(ni)) {
3131		ntfs_warning(ni->vol->mp, "Denying write to encrypted "
3132				"attribute (EACCES).");
3133		return EACCES;
3134	}
3135	/* TODO: We do not support writing to compressed files. */
3136	if (NInoCompressed(ni)) {
3137		ntfs_error(ni->vol->mp, "Writing to compressed files is not "
3138				"implemented yet.  Sorry.");
3139		return ENOTSUP;
3140	}
3141#if 1	// TODO: Remove this when sparse support is done...
3142	if (NInoSparse(ni))
3143		return ENOTSUP;
3144#endif
3145	base_ni = ni;
3146	if (NInoAttr(ni))
3147		base_ni = ni->base_ni;
3148	/* The first byte after the write. */
3149	end = ofs + count;
3150	/*
3151	 * If we are going to extend the initialized size take the inode lock
3152	 * for writing and take it for reading otherwise.
3153	 *
3154	 * Appending will always cause the initialized size to be extended thus
3155	 * always take the lock for writing.
3156	 *
3157	 * Writing into holes requires us to take the lock for writing thus if
3158	 * this is a sparse file take the lock for writing just in case.
3159	 */
3160	was_locked = write_locked;
3161	if (ioflags & IO_APPEND) {
3162		if (!was_locked) {
3163			lck_rw_lock_exclusive(&ni->lock);
3164			write_locked = TRUE;
3165		}
3166		/*
3167		 * Do not allow messing with the inode once it has been
3168		 * deleted.
3169		 */
3170		if (NInoDeleted(ni)) {
3171			if (!was_locked)
3172				lck_rw_unlock_exclusive(&ni->lock);
3173			/* Remove the inode from the name cache. */
3174			cache_purge(ni->vn);
3175			return ENOENT;
3176		}
3177		lck_spin_lock(&ni->size_lock);
3178		ofs = ni->data_size;
3179		lck_spin_unlock(&ni->size_lock);
3180		uio_setoffset(uio, ofs);
3181		ntfs_debug("Write to mft_no 0x%llx, IO_APPEND flag is set, "
3182				"setting uio_offset() to file size 0x%llx.",
3183				(unsigned long long)ni->mft_no,
3184				(unsigned long long)ofs);
3185		/* Update the first byte after the write with the new offset. */
3186		end = ofs + count;
3187	} else {
3188		if (!was_locked) {
3189			if (NInoSparse(ni)) {
3190				lck_rw_lock_exclusive(&ni->lock);
3191				write_locked = TRUE;
3192			} else {
3193				lck_rw_lock_shared(&ni->lock);
3194				write_locked = FALSE;
3195			}
3196		}
3197recheck_deleted:
3198		/*
3199		 * Do not allow messing with the inode once it has been
3200		 * deleted.
3201		 */
3202		if (NInoDeleted(ni)) {
3203			if (!was_locked) {
3204				if (write_locked)
3205					lck_rw_unlock_exclusive(&ni->lock);
3206				else
3207					lck_rw_unlock_shared(&ni->lock);
3208			}
3209			/* Remove the inode from the name cache. */
3210			cache_purge(ni->vn);
3211			return ENOENT;
3212		}
3213		lck_spin_lock(&ni->size_lock);
3214		size = ni->initialized_size;
3215		lck_spin_unlock(&ni->size_lock);
3216		if (!write_locked && end > size) {
3217			/* If we fail to convert the lock, take it. */
3218			if (!lck_rw_lock_shared_to_exclusive(&ni->lock))
3219				lck_rw_lock_exclusive(&ni->lock);
3220			write_locked = TRUE;
3221			goto recheck_deleted;
3222		}
3223		ntfs_debug("Mft_no 0x%llx, inode lock taken for %s.",
3224				(unsigned long long)ni->mft_no,
3225				write_locked ? "writing" : "reading");
3226	}
3227	/*
3228	 * We do not want any form of zero filling to happen at the starting
3229	 * offset of the write as we sort this out ourselves.
3230	 *
3231	 * Further, we never want to zero fill at the end of the write as this
3232	 * is pointless.  We automatically get zero filling at the end of the
3233	 * page when a page is read in and when the initialized size is
3234	 * extended.
3235	 */
3236	ioflags &= ~(IO_HEADZEROFILL | IO_TAILZEROFILL);
3237	/*
3238	 * We do not want to zero any valid/dirty pages as they could already
3239	 * have new data written via mmap() for example and we do not want to
3240	 * lose that.
3241	 */
3242	ioflags |= IO_NOZEROVALID | IO_NOZERODIRTY;
3243	lck_spin_lock(&ni->size_lock);
3244	old_size = ni->data_size;
3245	size = ni->allocated_size;
3246	lck_spin_unlock(&ni->size_lock);
3247	/*
3248	 * If this is a sparse attribute and the write overlaps the existing
3249	 * allocated size we need to fill any holes overlapping the write.  We
3250	 * can skip resident attributes as they cannot have sparse regions.
3251	 *
3252	 * As allocated size goes in units of clusters we need to round down
3253	 * the start offset to the nearest cluster boundary and we need to
3254	 * round up the end offset to the next cluster boundary.
3255	 */
3256	if (NInoSparse(ni) && NInoNonResident(ni) &&
3257			(ofs & ~ni->vol->cluster_size_mask) < size) {
3258		s64 aligned_end, new_end;
3259
3260		if (!write_locked)
3261			panic("%s(): !write_locked\n", __FUNCTION__);
3262		aligned_end = (end + ni->vol->cluster_size_mask) &
3263				~ni->vol->cluster_size_mask;
3264		/*
3265		 * Only need to instantiate holes up to the allocated size
3266		 * itself.  Everything else is an extension and will be dealt
3267		 * with by ntfs_attr_extend_allocation() below.
3268		 */
3269		if (aligned_end > size)
3270			aligned_end = size;
3271		err = ntfs_attr_instantiate_holes(ni,
3272				ofs & ~ni->vol->cluster_size_mask, aligned_end,
3273				&new_end, ioflags & IO_UNIT);
3274		if (err) {
3275			ntfs_error(ni->vol->mp, "Cannot perform write to "
3276					"mft_no 0x%llx because instantiation "
3277					"of sparse regions failed (error %d).",
3278					(unsigned long long)ni->mft_no, err);
3279			uio_setoffset(uio, old_ofs);
3280			uio_setresid(uio, old_count);
3281			if (!was_locked)
3282				lck_rw_unlock_exclusive(&ni->lock);
3283			return err;
3284		}
3285		/* If the instantiation was partial, truncate the write. */
3286		if (new_end < aligned_end) {
3287			s64 new_count;
3288
3289			if (ioflags & IO_UNIT)
3290				panic("%s(): new_end < aligned_end && "
3291						"ioflags & IO_UNIT\n",
3292						__FUNCTION__);
3293			ntfs_debug("Truncating write to mft_no 0x%llx because "
3294					"instantiation of sparse regions was "
3295					"only partially completed.",
3296					(unsigned long long)ni->mft_no);
3297			if (new_end > end)
3298				panic("%s(): new_end > end\n", __FUNCTION__);
3299			end = new_end;
3300			new_count = new_end - ofs;
3301			if (new_count >= count)
3302				panic("%s(): new_count >= count\n",
3303						__FUNCTION__);
3304			nr_truncated += count - new_count;
3305			count = new_count;
3306			uio_setresid(uio, new_count);
3307		}
3308	}
3309	/*
3310	 * If the write goes beyond the allocated size, extend the allocation
3311	 * to cover the whole of the write, rounded up to the nearest cluster.
3312	 */
3313	if (end > size) {
3314		if (!write_locked)
3315			panic("%s(): !write_locked\n", __FUNCTION__);
3316		/* Extend the allocation without changing the data size. */
3317		err = ntfs_attr_extend_allocation(ni, end, -1, ofs, NULL,
3318				&size, ioflags & IO_UNIT);
3319		if (!err) {
3320			if (ofs >= size)
3321				panic("%s(): ofs >= size\n", __FUNCTION__);
3322			/* If the extension was partial truncate the write. */
3323			if (end > size) {
3324				s64 new_count;
3325
3326				if (ioflags & IO_UNIT)
3327					panic("%s(): end > size && "
3328							"ioflags & IO_UNIT\n",
3329							__FUNCTION__);
3330				ntfs_debug("Truncating write to mft_no 0x%llx "
3331						"because the allocation was "
3332						"only partially extended.",
3333						(unsigned long long)ni->mft_no);
3334				end = size;
3335				new_count = size - ofs;
3336				if (new_count >= count)
3337					panic("%s(): new_count >= count\n",
3338							__FUNCTION__);
3339				nr_truncated += count - new_count;
3340				count = new_count;
3341				uio_setresid(uio, new_count);
3342			}
3343		} else /* if (err) */ {
3344			lck_spin_lock(&ni->size_lock);
3345			size = ni->allocated_size;
3346			lck_spin_unlock(&ni->size_lock);
3347			/* Perform a partial write if possible or fail. */
3348			if (ofs < size && !(ioflags & IO_UNIT)) {
3349				s64 new_count;
3350
3351				ntfs_debug("Truncating write to mft_no 0x%llx "
3352						"because extending the "
3353						"allocation failed (error %d).",
3354						(unsigned long long)ni->mft_no,
3355						err);
3356				end = size;
3357				new_count = size - ofs;
3358				if (new_count >= count)
3359					panic("%s(): new_count >= count\n",
3360							__FUNCTION__);
3361				nr_truncated += count - new_count;
3362				count = new_count;
3363				uio_setresid(uio, new_count);
3364			} else {
3365				ntfs_error(ni->vol->mp, "Cannot perform write "
3366						"to mft_no 0x%llx because "
3367						"extending the allocation "
3368						"failed (error %d).",
3369						(unsigned long long)ni->mft_no,
3370						err);
3371				goto abort;
3372			}
3373		}
3374	}
3375	/*
3376	 * If the write starts beyond the initialized size, extend it up to the
3377	 * beginning of the write and initialize all non-sparse space between
3378	 * the old initialized size and the new one.  This automatically also
3379	 * increments the data size as well as the ubc size to keep it above or
3380	 * equal to the initialized size.
3381	 */
3382	lck_spin_lock(&ni->size_lock);
3383	size = ni->initialized_size;
3384	lck_spin_unlock(&ni->size_lock);
3385	if (ofs > size) {
3386		if (!write_locked)
3387			panic("%s(): !write_locked 2\n", __FUNCTION__);
3388		err = ntfs_attr_extend_initialized(ni, ofs);
3389		if (err) {
3390			ntfs_error(ni->vol->mp, "Cannot perform write to "
3391					"mft_no 0x%llx because extending the "
3392					"initialized size failed (error %d).",
3393					(unsigned long long)ni->mft_no, err);
3394			goto abort;
3395		}
3396		size = ofs;
3397	}
3398	if (NInoNonResident(ni)) {
3399		int (*callback)(buf_t, void *);
3400
3401		if (NInoCompressed(ni) && !NInoRaw(ni)) {
3402#if 0
3403			err = ntfs_vnop_write_compressed(ni, uio, size,
3404					ioflags);
3405			if (!err)
3406				ntfs_debug("Done (ntfs_vnop_write_compressed()"
3407						").");
3408			else
3409				ntfs_error(ni->vol->mp, "Failed ("
3410						"ntfs_vnop_write_compressed(), "
3411						"error %d).", err);
3412#endif
3413			/*
3414			 * TODO: At present we should never get here for
3415			 * compressed files as this case is aborted at the
3416			 * start of the function.
3417			 */
3418			panic("%s(): NInoCompressed(ni) && !NInoRaw(ni)\n",
3419					__FUNCTION__);
3420		}
3421		callback = NULL;
3422		if (NInoEncrypted(ni)) {
3423			callback = ntfs_cluster_iodone;
3424			/*
3425			 * TODO: At present we should never get here for
3426			 * encrypted files as this case is aborted at the start
3427			 * of the function.
3428			 */
3429			panic("%s(): NInoEncrypted(ni)\n", __FUNCTION__);
3430		}
3431		/* Determine the new file size. */
3432		size = ubc_getsize(vn);
3433		if (end > size)
3434			size = end;
3435		/*
3436		 * Note the first size is the original file size and the second
3437		 * file size is the new file size when the write is complete.
3438		 */
3439		err = cluster_write_ext(vn, uio, ubc_getsize(vn), size, 0, 0,
3440				ioflags, callback, NULL);
3441		if (err) {
3442			/*
3443			 * There was an error.  We do not know where.  Ensure
3444			 * everything is set up as if the write never happened.
3445			 */
3446			ntfs_error(ni->vol->mp, "Failed (cluster_write_ext(), "
3447					"error %d).", err);
3448			goto abort;
3449		}
3450		goto done;
3451	}
3452	/*
3453	 * The attribute is resident thus we have to deal with it by ourselves.
3454	 * First of all, try to copy the data to the vm page cache.  This will
3455	 * work on the second and all later writes so this is the hot path.  If
3456	 * the attribute has not been accessed at all before or its cached
3457	 * pages were dropped due to vm pressure this will fail to copy any
3458	 * data due to the lack of a valid page and we will drop into the slow
3459	 * path.
3460	 */
3461	if (ofs > PAGE_SIZE)
3462		panic("%s(): ofs > PAGE_SIZE\n", __FUNCTION__);
3463	cnt = (int)count;
3464	if (count > PAGE_SIZE - ofs) {
3465		cnt = PAGE_SIZE - ofs;
3466		ntfs_warning(ni->vol->mp, "Unexpected count (0x%llx) > "
3467				"PAGE_SIZE - ofs (0x%x), overriding it to "
3468				"PAGE_SIZE - ofs.", (unsigned long long)count,
3469				cnt);
3470	}
3471	/*
3472	 * Note we pass mark_dirty = 1 (the last parameter) which means the
3473	 * pages that are written to will be marked dirty.
3474	 */
3475	err = cluster_copy_ubc_data(vn, uio, &cnt, 1);
3476	if (err) {
3477		/*
3478		 * The copying (uiomove()) failed with an error.  Ensure
3479		 * everything is set up as if the write never happened.
3480		 */
3481		ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed "
3482				"(error %d).", err);
3483		goto abort;
3484	}
3485	/*
3486	 * @cnt is now set to the number of bytes remaining to be transferred.
3487	 * If it is zero, it means we are done.
3488	 */
3489	if (!cnt)
3490		goto done;
3491	/*
3492	 * We failed to transfer everything.  That really means we failed to
3493	 * transfer anything at all as we are guaranteed that a resident
3494	 * attribute is smaller than a page thus either the page is there and
3495	 * valid and we transfer everything or it is not and we transfer
3496	 * nothing.
3497	 */
3498	if (cnt != count) {
3499		ntfs_warning(ni->vol->mp, "Unexpected partial transfer to "
3500				"cached page (count 0x%llx, cnt 0x%x).",
3501				(unsigned long long)count, cnt);
3502		/* Ensure everything is as it was before. */
3503		uio_setoffset(uio, old_ofs);
3504		uio_setresid(uio, old_count - nr_truncated);
3505	}
3506	/*
3507	 * The page is not in cache or is not valid.  We need to bring it into
3508	 * cache and make it valid so we can then copy the data in.  The
3509	 * easiest way to do this is to just map the page which will take care
3510	 * of everything for us.  We can then uiomove() straight into the page
3511	 * from the @uio and then mark the page dirty and unmap it again.
3512	 *
3513	 * As an optimization, if the write covers the whole existing attribute
3514	 * we grab the page without bringing it uptodate if it is not valid
3515	 * already thus saving a pagein from disk.
3516	 */
3517	need_uptodate = (ofs || end < size);
3518	err = ntfs_page_map_ext(ni, 0, &upl, &pl, &kaddr, need_uptodate, TRUE);
3519	if (err) {
3520		ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err);
3521		goto abort;
3522	}
3523	err = uiomove((caddr_t)(kaddr + ofs), cnt, uio);
3524	if (err) {
3525		/*
3526		 * If we just caused the page to exist and did not bring it
3527		 * up-to-date or caching is disabled on the vnode or for this
3528		 * i/o, dump the page.  Otherwise release it back to the VM.
3529		 */
3530		if (upl_valid_page(pl, 0) || (need_uptodate &&
3531				!vnode_isnocache(vn) &&
3532				!(ioflags & IO_NOCACHE)))
3533			ntfs_page_unmap(ni, upl, pl, FALSE);
3534		else
3535			ntfs_page_dump(ni, upl, pl);
3536		/*
3537		 * The copying (uiomove()) failed with an error.  Ensure
3538		 * everything is set up as if the write never happened.
3539		 */
3540		ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err);
3541		goto abort;
3542	}
3543	/*
3544	 * If the page is not uptodate and we did not bring it up-to-date when
3545	 * mapping it, zero the remainder of the page now thus bringing it
3546	 * up-to-date.
3547	 */
3548	if (!need_uptodate && !upl_valid_page(pl, 0)) {
3549		const off_t cur_ofs = uio_offset(uio);
3550		if (cur_ofs > PAGE_SIZE)
3551			panic("%s(): cur_ofs > PAGE_SIZE\n", __FUNCTION__);
3552		bzero(kaddr + cur_ofs, PAGE_SIZE - cur_ofs);
3553	}
3554	/*
3555	 * Unmap the page marking it dirty.
3556	 *
3557	 * Note we leave the page cached even if no caching is requested for
3558	 * simplicity.  That way we do not need to touch the mft record at all
3559	 * and can instead rely on the next sync to propagate the dirty data
3560	 * from the page into the mft record and then to disk.  In the sync i/o
3561	 * case we will call ntfs_inode_sync() at the end of this function.
3562	 */
3563	ntfs_page_unmap(ni, upl, pl, TRUE);
3564done:
3565	/*
3566	 * If the write went past the end of the initialized size update it
3567	 * both in the ntfs inode and in the base attribute record.
3568	 *
3569	 * Also update the data size and the ubc size if the write went past
3570	 * the end of the data size.  Note this is automatically done by
3571	 * ntfs_attr_set_initialized_size() so we do not need to do it here.
3572	 */
3573	size = uio_offset(uio);
3574	lck_spin_lock(&ni->size_lock);
3575	if (size > ni->initialized_size) {
3576		lck_spin_unlock(&ni->size_lock);
3577		if (!write_locked)
3578			panic("%s(): !write_locked 3\n", __FUNCTION__);
3579		err = ntfs_attr_set_initialized_size(ni, size);
3580		if (err) {
3581			ntfs_error(ni->vol->mp, "Failed to update the "
3582					"initialized size of mft_no 0x%llx "
3583					"(error %d).",
3584					(unsigned long long)ni->mft_no, err);
3585			/*
3586			 * If the write was meant to be atomic, the write
3587			 * started beyond the end of the initialized size, or
3588			 * nothing was written ensure everything is set up as
3589			 * if the write never happened.
3590			 */
3591			lck_spin_lock(&ni->size_lock);
3592			size = ni->initialized_size;
3593			lck_spin_unlock(&ni->size_lock);
3594			if (ioflags & IO_UNIT || old_ofs >= size ||
3595					uio_resid(uio) >= old_count)
3596				goto abort;
3597			/*
3598			 * Something was written before the initialized size
3599			 * thus turn the error into a partial, successful write
3600			 * up to the initialized size.
3601			 */
3602			uio_setoffset(uio, size);
3603			uio_setresid(uio, size - old_ofs);
3604			err = 0;
3605		}
3606	} else
3607		lck_spin_unlock(&ni->size_lock);
3608	// TODO: If we wrote anything at all we have to clear the S_ISUID and
3609	// S_ISGID bits in the file mode as a precaution against tampering
3610	// (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_write()).
3611	/*
3612	 * Update the last_data_change_time (mtime) and last_mft_change_time
3613	 * (ctime) on the base ntfs inode @base_ni unless this is an attribute
3614	 * inode update in which case only update the ctime as named stream/
3615	 * extended attribute semantics expect on OS X.
3616	 */
3617	base_ni->last_mft_change_time = ntfs_utc_current_time();
3618	if (ni == base_ni)
3619		base_ni->last_data_change_time = base_ni->last_mft_change_time;
3620	NInoSetDirtyTimes(base_ni);
3621	/*
3622	 * If this is not a directory or it is an encrypted directory, set the
3623	 * needs archiving bit except for the core system files.
3624	 */
3625	if (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni)) {
3626		BOOL need_set_archive_bit = TRUE;
3627		if (ni->vol->major_ver >= 2) {
3628			if (ni->mft_no <= FILE_Extend)
3629				need_set_archive_bit = FALSE;
3630		} else {
3631			if (ni->mft_no <= FILE_UpCase)
3632				need_set_archive_bit = FALSE;
3633		}
3634		if (need_set_archive_bit) {
3635			base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
3636			NInoSetDirtyFileAttributes(base_ni);
3637		}
3638	}
3639	/*
3640	 * If we truncated the write add back the number of truncated bytes to
3641	 * the number of bytes remaining.
3642	 */
3643	if (nr_truncated > 0) {
3644		if (ioflags & IO_UNIT)
3645			panic("%s(): ioflags & IO_UNIT\n", __FUNCTION__);
3646		uio_setresid(uio, uio_resid(uio) + nr_truncated);
3647	}
3648	/*
3649	 * If the write was partial we need to trim off any extra allocated
3650	 * space by truncating the attribute to its old size.  We can only have
3651	 * extended the allocation if we hold the inode lock for writing so do
3652	 * not bother going through this code if we only hold the lock for
3653	 * reading.
3654	 *
3655	 * There is one exception and that is that if the write was meant to be
3656	 * atomic a partial write is not acceptable thus we need to abort the
3657	 * write completely in this case.
3658	 */
3659	size = uio_resid(uio);
3660	if (write_locked && size > nr_truncated) {
3661		s64 truncate_size;
3662		errno_t err2;
3663		int rflags;
3664
3665		/*
3666		 * If the write was meant to be atomic or nothing was written
3667		 * reset everything as if the write never happened thus
3668		 * releasing any extra space we may have allocated.
3669		 */
3670		if (ioflags & IO_UNIT || size >= old_count) {
3671			if (size > old_count)
3672				panic("%s(): size > old_count\n", __FUNCTION__);
3673abort:
3674			uio_setoffset(uio, old_ofs);
3675			uio_setresid(uio, old_count);
3676			if (!write_locked) {
3677				if (!err)
3678					panic("%s(): !err\n", __FUNCTION__);
3679				goto skip_truncate;
3680			}
3681			truncate_size = old_size;
3682		} else /* if (uio_resid(uio) < old_count) */ {
3683			/*
3684			 * At least something was written.  Truncate the
3685			 * attribute to the successfully written size thus
3686			 * releasing any extra space we allocated but ensure we
3687			 * do not truncate to less than the old size.
3688			 */
3689			truncate_size = uio_offset(uio);
3690			if (truncate_size < old_size)
3691				truncate_size = old_size;
3692		}
3693		/*
3694		 * Truncate the attribute to @truncate_size.
3695		 *
3696		 * The truncate must be complete or no need to bother at all so
3697		 * set the IO_UNIT flag.  Also remove unwanted flags.
3698		 */
3699		rflags = (ioflags | IO_UNIT) & ~(IO_APPEND | IO_SYNC |
3700				IO_NOZEROFILL);
3701		err2 = ntfs_attr_resize(ni, truncate_size, rflags, NULL);
3702		if (err2) {
3703			BOOL is_dirty;
3704
3705			/*
3706			 * If no other error has occured failing the truncate
3707			 * will at worst mean that we have too much allocated
3708			 * space which is not a disaster so carry on in this
3709			 * case.
3710			 *
3711			 * If another error has occured any of a number of
3712			 * things can now be wrong and in particular if the
3713			 * data size is not equal to @truncate_size this is
3714			 * very bad news so mark the volume dirty and warn the
3715			 * user about it.
3716			 */
3717			is_dirty = (err);
3718			if (is_dirty) {
3719				lck_spin_lock(&ni->size_lock);
3720				if (truncate_size == ni->data_size)
3721					is_dirty = FALSE;
3722				lck_spin_unlock(&ni->size_lock);
3723			}
3724			ntfs_error(ni->vol->mp, "Truncate failed (error %d).%s",
3725					err2, is_dirty ? "  Leaving "
3726					"inconsistent data on disk.  Unmount "
3727					"and run chkdsk." : "");
3728			if (is_dirty)
3729				NVolSetErrors(ni->vol);
3730		}
3731	}
3732skip_truncate:
3733	if (!was_locked) {
3734		if (!write_locked)
3735			lck_rw_unlock_shared(&ni->lock);
3736		else
3737			lck_rw_unlock_exclusive(&ni->lock);
3738		/*
3739		 * If the write was successful and synchronous i/o was
3740		 * requested, sync all changes to the backing store.  We
3741		 * dropped the inode lock already to be able to call
3742		 * ntfs_inode_sync() thus if it fails we cannot do anything
3743		 * about it so we just return the error even though the
3744		 * operation has otherwise been performed.
3745		 *
3746		 * Note we cannot do this if the inode was already locked or
3747		 * the call to ntfs_inode_sync() would cause a deadlock.
3748		 */
3749		if (!err && ioflags & IO_SYNC) {
3750			/* Mask out undersired @ioflags. */
3751			ioflags &= ~(IO_UNIT | IO_APPEND | IO_DEFWRITE);
3752			err = ntfs_inode_sync(ni, ioflags, FALSE);
3753		}
3754	}
3755	return err;
3756}
3757
3758/**
3759 * ntfs_vnop_write - write a number of bytes from a memory buffer into a file
3760 * @a:		arguments to write function
3761 *
3762 * @a contains:
3763 *	vnode_t a_vp;		vnode of file to write to
3764 *	uio_t a_uio;		source containing the data to write
3765 *	int a_ioflag;		flags further describing the write request
3766 *	vfs_context_t a_context;
3767 *
3768 * Write uio_resid(@a->a_uio) bytes from the source buffer specified by
 * @a->a_uio to the vnode @a->a_vp, starting at byte offset
3770 * uio_offset(@a->a_uio) into the vnode.
3771 *
3772 * The flags in @a->a_ioflag further describe the write request.  The following
3773 * ioflags are currently defined in OS X kernel (not all of them are applicable
3774 * to VNOP_WRITE() however):
3775 *	IO_UNIT		- Do i/o as atomic unit.
3776 *	IO_APPEND	- Append write to end.
3777 *	IO_SYNC		- Do i/o synchronously.
3778 *	IO_NODELOCKED	- Underlying node already locked.
3779 *	IO_NDELAY	- FNDELAY flag set in file table.
3780 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
3781 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
3782 *	IO_HEADZEROFILL	- Zero fills at the head of write.
3783 *	IO_NOZEROVALID	- Do not zero fill if valid page.
3784 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
3785 *	IO_CLOSE	- The i/o was issued from close path.
3786 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
3787 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
3788 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
3789 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
3790 *
3791 * For compressed and encrypted attributes we abort for now as we do not
3792 * support them yet.
3793 *
3794 * For non-resident attributes we use cluster_write_ext() which deals with
3795 * normal attributes.
3796 *
3797 * Return 0 on success and errno on error.
3798 */
3799static int ntfs_vnop_write(struct vnop_write_args *a)
3800{
3801	vnode_t vn = a->a_vp;
3802	ntfs_inode *ni = NTFS_I(vn);
3803
3804	if (!ni) {
3805		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
3806		return EINVAL;
3807	}
3808	/*
3809	 * We can only write to regular files and named streams.
3810	 *
3811	 * Also, do not allow writing to system files and mst protected
3812	 * attributes.
3813	 */
3814	if (vnode_issystem(vn) || NInoMstProtected(ni) ||
3815			(!S_ISREG(ni->mode) && !(NInoAttr(ni) &&
3816			ni->type == AT_DATA))) {
3817		if (S_ISDIR(ni->mode))
3818			return EISDIR;
3819		return EPERM;
3820	}
3821	return (int)ntfs_write(ni, a->a_uio, a->a_ioflag, FALSE);
3822}
3823
/**
 * ntfs_vnop_ioctl - ioctl vnode operation (not implemented)
 * @a:		arguments to ioctl function (currently not looked at)
 *
 * This operation is not implemented yet thus every request is failed.
 *
 * Always return ENOTSUP.
 */
static int ntfs_vnop_ioctl(struct vnop_ioctl_args *a)
{
	ntfs_debug("Entering.");
	// TODO: Implement this operation.
	ntfs_debug("Done (error %d).", (int)ENOTSUP);
	return ENOTSUP;
}
3838
/**
 * ntfs_vnop_select - select vnode operation (not implemented)
 * @a:		arguments to select function (currently not looked at)
 *
 * This operation is not implemented yet thus every request is failed.
 *
 * Always return ENOTSUP.
 */
static int ntfs_vnop_select(struct vnop_select_args *a)
{
	ntfs_debug("Entering.");
	// TODO: Implement this operation.
	ntfs_debug("Done (error %d).", (int)ENOTSUP);
	return ENOTSUP;
}
3853
/**
 * ntfs_vnop_exchange - exchange vnode operation (not implemented)
 * @a:		arguments to exchange function (currently not looked at)
 *
 * This operation is not implemented yet thus every request is failed.
 *
 * Always return ENOTSUP.
 */
static int ntfs_vnop_exchange(struct vnop_exchange_args *a)
{
	ntfs_debug("Entering.");
	// TODO: Implement this operation.
	ntfs_debug("Done (error %d).", (int)ENOTSUP);
	return ENOTSUP;
}
3868
3869/**
3870 * ntfs_vnop_mmap - map a file (vnode) into memory
3871 * @a:		arguments to mmap function
3872 *
3873 * @a contains:
3874 *	vnode_t a_vp;			file vnode which to map into memory
3875 *	int a_fflags;			mapping flags for the vnode
3876 *	vfs_context_t a_context;
3877 *
3878 * Map the file vnode @a->a_vp into memory applying the mapping flags
3879 * @a->a_fflags which are a combination of one or more of PROT_READ,
3880 * PROT_WRITE, and PROT_EXEC.
3881 *
3882 * VNOP_MMAP() and hence ntfs_vnop_mmap() gets called from ubc_map() which in
3883 * turn gets called from the mmap() system call when a file is being mapped
3884 * into memory.
3885 *
3886 * The mmap() system call does the necessary permission checking and in fact
3887 * ignores the return value from ubc_map() and relies on things not working
3888 * later on for error handling.
3889 *
3890 * ubc_map() on the other hand does look at the return value of VNOP_MMAP() but
3891 * it only cares for one error code and that is EPERM.  All other errors are
3892 * ignored and not passed to its caller.  Thus for any return value not equal
3893 * to EPERM, ubc_map() takes an extra reference on the vnode and sets the flags
3894 * UI_ISMAPPED and UI_WASMAPPED in the ubc info of the vnode and for EPERM it
3895 * does not do anything and just returns EPERM to the caller.
3896 *
3897 * In effect neither class of return value (EPERM or not EPERM) actually has
3898 * any effect at all so we do not bother doing any checking here and defer all
3899 * checks to VNOP_PAGEIN() and hence ntfs_vnop_pagein().
3900 *
3901 * FIXME: This is a huge problem because it means that anyone can use mmap() on
3902 * a system file and then write rubbish into the mapped memory and then trash
3903 * the metadata in the mapped memory by calling msync() to write the rubbish
3904 * out into the system file on disk!  This will need to be fixed in the kernel
3905 * I think, i.e. the mmap() system call must fail if VNOP_MMAP() fails.  This
3906 * is because we have no way to tell who is causing a page{in,out} at
3907 * ntfs_vnop_page{in,out}() time and for what reason so we have to always
3908 * permit page{in,out} to be called.
3909 *
 * Always return 0 as we do not do any checking here.
3911 */
static int ntfs_vnop_mmap(struct vnop_mmap_args *a)
{
#ifdef DEBUG
	ntfs_inode *ni;

	/* Debug builds only: log which inode is being mapped and how. */
	ni = NTFS_I(a->a_vp);
	if (ni) {
		ntfs_debug("Mapping mft_no 0x%llx, type 0x%x, name_len 0x%x, "
				"mapping flags 0x%x.",
				(unsigned long long)ni->mft_no,
				le32_to_cpu(ni->type), (unsigned)ni->name_len,
				a->a_fflags);
	}
#endif
	/* No checks are done and no state is changed so we cannot fail. */
	return 0;
}
3927
3928/**
3929 * ntfs_vnop_mnomap - unmap a file (vnode) from memory
3930 * @a:		arguments to mnomap function
3931 *
3932 * @a contains:
3933 *	vnode_t a_vp;			file vnode which to unmap from memory
3934 *	vfs_context_t a_context;
3935 *
3936 * Remove the memory mapping of the file vnode @a->a_vp that was previously
3937 * established via ntfs_vnop_mmap().
3938 *
3939 * VNOP_MNOMAP() and hence ntfs_vnop_mnomap() gets called from ubc_unmap() when
3940 * a file is being unmapped from memory via the munmap() system call.
3941 *
3942 * ubc_unmap() only calls VNOP_MNOMAP() if the previous VNOP_MMAP() call did
3943 * not return EPERM.
3944 *
3945 * ubc_unmap() completely ignores the return value from VNOP_MNOMAP().
3946 *
3947 * Always return 0 as the return value is always ignored.
3948 */
static int ntfs_vnop_mnomap(struct vnop_mnomap_args *a)
{
#ifdef DEBUG
	ntfs_inode *ni;

	/* Debug builds only: log which inode is being unmapped. */
	ni = NTFS_I(a->a_vp);
	if (ni) {
		ntfs_debug("Unmapping mft_no 0x%llx, type 0x%x, name_len "
				"0x%x.", (unsigned long long)ni->mft_no,
				le32_to_cpu(ni->type), (unsigned)ni->name_len);
	}
#endif
	/* There is nothing to undo so the result is a constant 0. */
	return 0;
}
3962
3963/**
3964 * ntfs_vnop_fsync - synchronize a vnode's in-core state with that on disk
3965 * @a:		arguments to fsync function
3966 *
3967 * @a contains:
3968 *	vnode_t a_vp;			vnode which to sync
3969 *	int a_waitfor;			if MNT_WAIT wait for i/o to complete
3970 *	vfs_context_t a_context;
3971 *
3972 * Write all dirty cached data belonging/related to the vnode @a->a_vp to disk.
3973 *
3974 * If @a->a_waitfor is MNT_WAIT, wait for all i/o to complete before returning.
3975 *
3976 * Note: When called from reclaim, the vnode has a zero v_iocount and
3977 *	 v_usecount and vnode_isrecycled() is true.
3978 *
3979 * Return 0 on success and the error code on error.
3980 */
3981static int ntfs_vnop_fsync(struct vnop_fsync_args *a)
3982{
3983	vnode_t vn = a->a_vp;
3984	ntfs_inode *ni = NTFS_I(vn);
3985	int sync, err;
3986
3987	if (!ni) {
3988		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
3989		return 0;
3990	}
3991	/* If we are mounted read-only, we do not need to sync anything. */
3992	if (NVolReadOnly(ni->vol))
3993		return 0;
3994	sync = (a->a_waitfor == MNT_WAIT) ? IO_SYNC : 0;
3995	ntfs_debug("Entering for inode 0x%llx, waitfor 0x%x, %ssync i/o.",
3996			(unsigned long long)ni->mft_no, a->a_waitfor,
3997			(sync == IO_SYNC) ? "a" : "");
3998	/*
3999	 * We need to allow ENOENT errors since the unlink system call can call
4000	 * VNOP_FSYNC() during vclean().
4001	 */
4002	err = ntfs_inode_sync(ni, sync, FALSE);
4003	if (err == ENOENT)
4004		err = 0;
4005	ntfs_debug("Done (error %d).", err);
4006	return err;
4007}
4008
4009/**
 * ntfs_unlink_internal - unlink an ntfs inode from its parent directory
4011 * @dir_ni:	directory ntfs inode from which to unlink the ntfs inode
4012 * @ni:		base ntfs inode to unlink
4013 * @name:	Unicode name of the inode to unlink
4014 * @name_len:	length of the name in Unicode characters
4015 * @name_type:	Namespace the name is in (i.e. FILENAME_{DOS,WIN32,POSIX,etc})
4016 * @is_rename:	if true ntfs_unlink_internal() is called for a rename operation
4017 *
4018 * Unlink an inode with the ntfs inode @ni and name @name with length @name_len
4019 * Unicode characters and of namespace @name_type from the directory with ntfs
4020 * inode @dir_ni.
4021 *
4022 * If @is_rename is true the caller was ntfs_vnop_rename() in which case the
4023 * link count of the inode to unlink @ni will be one higher than the link count
4024 * in the mft record.
4025 *
4026 * Return 0 on success and the error code on error.
4027 *
4028 * Note that if the name of the inode to be removed is in the WIN32 or DOS
4029 * namespaces, both the WIN32 and the corresponding DOS names are removed.
4030 *
4031 * Note that for a hard link this function simply removes the name and its
4032 * directory entry and decrements the hard link count whilst for the last name,
4033 * i.e. the last link to an inode, it only removes the directory entry, i.e. it
4034 * does not remove the name, however it does decrement the hard link count to
4035 * zero.  This is so that the inode can be undeleted and its original name
4036 * restored.  In any case, we do not actually delete the inode here as it may
4037 * still be open and UNIX semantics require an unlinked inode to be still
4038 * accessible through already opened file descriptors.  When the last file
 * descriptor is closed, we cause the inode to be deleted when the VFS
4040 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
4041 * ntfs_vnop_inactive().
4042 */
4043static errno_t ntfs_unlink_internal(ntfs_inode *dir_ni, ntfs_inode *ni,
4044		ntfschar *name, signed name_len, FILENAME_TYPE_FLAGS name_type,
4045		const BOOL is_rename)
4046{
4047	ntfs_volume *vol;
4048	ntfs_inode *objid_o_ni;
4049	ntfschar *ntfs_name;
4050	MFT_RECORD *m;
4051	ntfs_attr_search_ctx *actx;
4052	ATTR_RECORD *a;
4053	ntfs_index_context *ictx;
4054	FILENAME_ATTR *fn, *tfn;
4055	signed ntfs_name_len;
4056	unsigned fn_count, tfn_alloc;
4057	errno_t err;
4058	BOOL seen_dos;
4059	FILENAME_TYPE_FLAGS seek_type, fn_type;
4060
4061	vol = ni->vol;
4062	objid_o_ni = vol->objid_o_ni;
4063	ntfs_debug("Unlinking mft_no 0x%llx from directory mft_no 0x%llx, "
4064			"name type 0x%x.", (unsigned long long)ni->mft_no,
4065			(unsigned long long)dir_ni->mft_no,
4066			(unsigned)name_type);
4067	if (NInoAttr(ni))
4068		panic("%s(): Target inode is an attribute inode.\n",
4069				__FUNCTION__);
4070	/* Start the unlink by evicting the target from the name cache. */
4071	cache_purge(ni->vn);
4072	/*
4073	 * We now need to look up the target name in the target mft record.
4074	 *
4075	 * If @name_type is FILENAME_POSIX then @name and @name_len contain the
4076	 * correctly cased name and length in Unicode characters, respectively
4077	 * so we simply set @ntfs_name and @ntfs_name_len to @name and
4078	 * @name_len, respectively.
4079	 *
4080	 * If @name_type is anything else, i.e. FILENAME_WIN32, FILENAME_DOS,
4081	 * or FILENAME_WIN32_AND_DOS we simply need to look for that type of
4082	 * name in the target mft record as there can only be one filename
4083	 * attribute of this type thus the name is uniquely identified by type
4084	 * so the lookup can be optimized that way.
4085	 */
4086	seek_type = 0;
4087	if (name_type == FILENAME_POSIX) {
4088		ntfs_name = name;
4089		ntfs_name_len = name_len;
4090	} else {
4091		/*
4092		 * Set @ntfs_name to NULL so we know to do the look up based on
4093		 * the filename namespace @seek_type instead.
4094		 */
4095		ntfs_name = NULL;
4096		ntfs_name_len = 0;
4097		seek_type = name_type;
4098		/*
4099		 * If the target name is the WIN32 name we first need to delete
4100		 * the DOS name thus re-set @seek_type accordingly (see below
4101		 * for details).
4102		 */
4103		if (seek_type == FILENAME_WIN32)
4104			seek_type = FILENAME_DOS;
4105	}
4106	/*
4107	 * We know this is the base inode since we bailed out for attribute
4108	 * inodes above.
4109	 */
4110	err = ntfs_mft_record_map(ni, &m);
4111	if (err) {
4112		ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error "
4113				"%d).", (unsigned long long)ni->mft_no, err);
4114		goto err;
4115	}
4116	/*
4117	 * Sanity check that the inode link count is in step with the mft
4118	 * record link count.
4119	 */
4120	if ((!is_rename && ni->link_count != le16_to_cpu(m->link_count)) ||
4121			(is_rename && ni->link_count !=
4122			(unsigned)le16_to_cpu(m->link_count) + 1))
4123		panic("%s(): ni->link_count != le16_to_cpu(m->link_count)\n",
4124				__FUNCTION__);
4125	actx = ntfs_attr_search_ctx_get(ni, m);
4126	if (!actx) {
4127		err = ENOMEM;
4128		goto unm_err;
4129	}
4130	/*
4131	 * Find the name in the target mft record.
4132	 *
4133	 * If it is a name in the WIN32 or DOS namespace (but not both), we
4134	 * remove the DOS name from both the directory index it is in and from
4135	 * the mft record and we decrement the link count both in the base mft
4136	 * record and in the ntfs inode.  In the case of a WIN32 name, we find
4137	 * the corresponding DOS name first and proceed as described.
4138	 *
4139	 * If the removal of the DOS name from the directory index is
4140	 * successful, we change the namespace of the remaining WIN32 name to
4141	 * the POSIX namespace, thus if we fail to remove the remaining name
4142	 * after successfully removing the DOS name, we still have a consistent
4143	 * file system.  This also has the side effect of allowing undelete to
4144	 * work properly as otherwise the undelete would restore a WIN32 name
4145	 * without a corresponding DOS name which would result in an illegal
4146	 * inode.
4147	 *
4148	 * We thus reduce the problem to a normal single name unlink and we can
4149	 * now determine whether this unlink is just a hard link removal or the
4150	 * final name removal, i.e. the inode is being deleted.
4151	 */
4152	seen_dos = FALSE;
4153restart_name:
4154	/*
4155	 * Before looking for the last name and removing it from its directory
4156	 * index entry, i.e. before unlinking the inode and targeting it for
4157	 * deletion, we need to check if the inode has an object id and if so
4158	 * we need to remove it from the object id index on the volume (present
4159	 * in $O index of $Extend/$ObjId system file), so that the inode cannot
4160	 * be found via its object id any more either.  Also, when the deleted
4161	 * inode gets reused for different purposes, we do not want the old
4162	 * object id to still point at it.
4163	 *
4164	 * If the volume is pre-NTFS 3.0, i.e. it does not support object ids,
4165	 * @vol->objid_o_ni will be NULL.  It will also be NULL if the volume
4166	 * is NTFS 3.0+ but no object ids are present on the volume, thus we
4167	 * can make the check conditional on @objid_o_ni not being NULL.
4168	 *
4169	 * We do this before deleting the last directory entry so that we can
4170	 * abort the unlink if we fail to remove the object id from the index
4171	 * to ensure the volume does not become inconsistent.
4172	 */
4173	if (objid_o_ni && ni->link_count <= 1) {
4174		err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0, NULL, 0,
4175				actx);
4176		if (err) {
4177			if (err != ENOENT) {
4178				ntfs_error(vol->mp, "Failed to look up object "
4179						"id in mft_no 0x%llx (error "
4180						"%d).",
4181						(unsigned long long)ni->mft_no,
4182						err);
4183				goto put_err;
4184			}
4185			/*
4186			 * The object id was not found which is fine.  The
4187			 * inode simply does not have an object id assigned to
4188			 * it so there is nothing for us to do.
4189			 */
4190			ntfs_debug("Target mft_no 0x%llx does not have an "
4191					"object id assigned to it.",
4192					(unsigned long long)ni->mft_no);
4193		} else /* if (!err) */ {
4194			INDEX_ENTRY *ie;
4195			GUID object_id;
4196
4197			/* The inode has an object id assigned to it. */
4198			ntfs_debug("Deleting object id from target mft_no "
4199					"0x%llx.",
4200					(unsigned long long)ni->mft_no);
4201			a = actx->a;
4202			/*
4203			 * We need to make a copy of the object id and release
4204			 * the mft record before looking up the object id in
4205			 * the $ObjID/$O index otherwise we could deadlock if
4206			 * the currently mapped mft record is in the same page
4207			 * as one of the mft records of $ObjId.
4208			 */
4209			memcpy(&object_id, &((OBJECT_ID_ATTR*)((u8*)a +
4210					le16_to_cpu(a->value_offset)))->
4211					object_id, sizeof(object_id));
4212			ntfs_attr_search_ctx_put(actx);
4213			ntfs_mft_record_unmap(ni);
4214			err = vnode_get(objid_o_ni->vn);
4215			if (err) {
4216				ntfs_error(vol->mp, "Failed to get index "
4217						"vnode for $ObjId/$O.");
4218				goto err;
4219			}
4220			lck_rw_lock_exclusive(&objid_o_ni->lock);
4221			ictx = ntfs_index_ctx_get(objid_o_ni);
4222			if (!ictx) {
4223				ntfs_error(vol->mp, "Failed to get index "
4224						"context.");
4225				err = ENOMEM;
4226				goto iput_err;
4227			}
4228restart_ictx:
4229			/* Get the index entry matching the object id. */
4230			err = ntfs_index_lookup(&object_id, sizeof(object_id),
4231					&ictx);
4232			if (err) {
4233				if (err == ENOENT) {
4234					ntfs_error(vol->mp, "Failed to delete "
4235							"object id of target "
4236							"inode 0x%llx from "
4237							"object id index "
4238							"because the object "
4239							"id was not found in "
4240							"the object id "
4241							"index.  Volume is "
4242							"corrupt.  Run "
4243							"chkdsk.",
4244							(unsigned long long)
4245							ni->mft_no);
4246					NVolSetErrors(vol);
4247					err = EIO;
4248				} else
4249					ntfs_error(vol->mp, "Failed to delete "
4250							"object id of target "
4251							"inode 0x%llx from "
4252							"object id index "
4253							"because looking up "
4254							"the object id in the "
4255							"object id index "
4256							"failed (error %d)." ,
4257							(unsigned long long)
4258							ni->mft_no, err);
4259				goto iput_err;
4260			}
4261			ie = ictx->entry;
4262			/* We now have the index entry, delete it. */
4263			err = ntfs_index_entry_delete(ictx);
4264			if (err) {
4265				if (err == -EAGAIN) {
4266					ntfs_debug("Restarting object id "
4267							"delete as tree was "
4268							"rearranged.");
4269					ntfs_index_ctx_reinit(ictx, objid_o_ni);
4270					goto restart_ictx;
4271				}
4272				ntfs_error(vol->mp, "Failed to delete object "
4273						"id of target inode 0x%llx "
4274						"from object id index (error "
4275						"%d).",
4276						(unsigned long long)ni->mft_no,
4277						err);
4278				goto iput_err;
4279			}
4280			ntfs_index_ctx_put(ictx);
4281			lck_rw_unlock_exclusive(&objid_o_ni->lock);
4282			(void)vnode_put(objid_o_ni->vn);
4283			/*
4284			 * Now get back the mft record so we can re-look up the
4285			 * object id attribute so we can delete it.
4286			 *
4287			 * This means we do not need to worry about
4288			 * inconsistencies to do with the object id in our
4289			 * error handling code paths later on.
4290			 */
4291			err = ntfs_mft_record_map(ni, &m);
4292			if (err) {
4293				ntfs_error(vol->mp, "Failed to re-map mft "
4294						"record 0x%llx (error %d).  "
4295						"Leaving inconstent "
4296						"metadata.  Run chkdsk.",
4297						(unsigned long long)ni->mft_no,
4298						err);
4299				NVolSetErrors(vol);
4300				goto err;
4301			}
4302			actx = ntfs_attr_search_ctx_get(ni, m);
4303			if (!actx) {
4304				ntfs_error(vol->mp, "Failed to re-get "
4305						"attribute search context for "
4306						"mft record 0x%llx (error "
4307						"%d).  Leaving inconstent "
4308						"metadata.  Run chkdsk.",
4309						(unsigned long long)ni->mft_no,
4310						err);
4311				NVolSetErrors(vol);
4312				err = ENOMEM;
4313				goto unm_err;
4314			}
4315			err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0,
4316					NULL, 0, actx);
4317			if (err) {
4318				ntfs_error(vol->mp, "Failed to re-look up "
4319						"object id in mft_no 0x%llx "
4320						"(error %d).  Leaving "
4321						"inconsistent metadata.  Run "
4322						"chkdsk.",
4323						(unsigned long long)ni->mft_no,
4324						err);
4325				NVolSetErrors(ni->vol);
4326				err = EIO;
4327				goto put_err;
4328			}
4329			/*
4330			 * Remove the object id attribute from the mft record
4331			 * and mark the mft record dirty.
4332			 */
4333			err = ntfs_attr_record_delete(ni, actx);
4334			if (err) {
4335				ntfs_error(vol->mp, "Failed to delete object "
4336						"id in mft_no 0x%llx (error "
4337						"%d).  Leaving inconsistent "
4338						"metadata.  Run chkdsk.",
4339						(unsigned long long)ni->mft_no,
4340						err);
4341				goto put_err;
4342			}
4343		}
4344		/* Reinit the search context for the AT_FILENAME lookup. */
4345		ntfs_attr_search_ctx_reinit(actx);
4346	}
4347	/* Use label and goto instead of a loop to reduce indentation. */
4348	fn_count = 0;
4349next_name:
4350	/* Increment the filename attribute counter. */
4351	fn_count++;
4352	err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, actx);
4353	if (err) {
4354		if (err == ENOENT) {
4355			/*
4356			 * If the name we are looking for is not found there is
4357			 * either some corruption or a bug given that a call to
4358			 * ntfs_lookup_inode_by_name() just found the name in
4359			 * the directory index.
4360			 */
4361			ntfs_error(vol->mp, "The target filename was not "
4362					"found in the mft record 0x%llx.  "
4363					"This is not possible.  This is "
4364					"either due to corruption or due to a "
4365					"driver bug.  Run chkdsk.",
4366					(unsigned long long)ni->mft_no);
4367			NVolSetErrors(vol);
4368			err = EIO;
4369		} else
4370			ntfs_error(vol->mp, "Failed to look up target "
4371					"filename in the mft record 0x%llx "
4372					"(error %d).",
4373					(unsigned long long)ni->mft_no, err);
4374		goto put_err;
4375	}
4376	a = actx->a;
4377	fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
4378	fn_type = fn->filename_type;
4379	/*
4380	 * If this is a specific DOS or WIN32 or combined name lookup, no need
4381	 * to compare the actual name as there can only be one DOS and one
4382	 * WIN32 name or only one combined name in an inode.
4383	 */
4384	if (seek_type && seek_type != FILENAME_POSIX) {
4385		/*
4386		 * If this filename attribute does not match the target name
4387		 * try the next one.
4388		 */
4389		if (seek_type != fn_type)
4390			goto next_name;
4391		/* We found the filename attribute matching the target name. */
4392		if (fn_type == FILENAME_WIN32) {
4393			/*
4394			 * We were looking for the WIN32 name so we can remove
4395			 * it after having removed the DOS name.  We now found
4396			 * it, so switch it to the POSIX namespace as described
4397			 * above and then go ahead and delete it.
4398			 */
4399			ntfs_debug("Switching namespace of filename attribute "
4400					"from WIN32 to POSIX.");
4401			fn_type = fn->filename_type = FILENAME_POSIX;
4402			NInoSetMrecNeedsDirtying(actx->ni);
4403		}
4404		goto found_name;
4405	}
4406	/* If this is the DOS name, note that we have seen it. */
4407	if (fn_type == FILENAME_DOS)
4408		seen_dos = TRUE;
4409	/* If the names do not match, continue searching. */
4410	if (fn->filename_length != ntfs_name_len)
4411		goto next_name;
4412	if (MREF_LE(fn->parent_directory) != dir_ni->mft_no)
4413		goto next_name;
4414	if (bcmp(fn->filename, ntfs_name, ntfs_name_len * sizeof(ntfschar)))
4415		goto next_name;
4416	/* Found the matching name. */
4417	if (fn_type == FILENAME_WIN32) {
4418		/*
4419		 * Pure WIN32 name.  Repeat the lookup but for the DOS name
4420		 * this time so we can remove that first.
4421		 */
4422		seek_type = FILENAME_DOS;
4423		/*
4424		 * If @seen_dos is true, then restart the lookup from the
4425		 * beginning and if not then continue the lookup where we left
4426		 * off.
4427		 */
4428		if (seen_dos) {
4429			ntfs_attr_search_ctx_reinit(actx);
4430			fn_count = 0;
4431		}
4432		goto next_name;
4433	}
4434	if (fn_type == FILENAME_DOS) {
4435		/*
4436		 * This cannot happen as ntfs_lookup_inode_by_name() always
4437		 * returns @name for pure DOS names and hence we would have
4438		 * @seek_type == FILENAME_DOS and thus would have picked this
4439		 * filename attribute up above without ever doing a name based
4440		 * match.
4441		 */
4442		ntfs_error(vol->mp, "Filename is in DOS namespace.  This is "
4443				"not possible.  This is either due to "
4444				"corruption or due to a driver bug.  Run "
4445				"chkdsk.");
4446		NVolSetErrors(vol);
4447		err = EIO;
4448		goto put_err;
4449	}
4450found_name:
4451	/*
4452	 * We found the target filename attribute and can now remove it from
4453	 * the directory index.  But before we can do that we need to make a
4454	 * copy of the filename attribute value so we can release the mft
4455	 * record before we delete the directory index entry.  This is needed
4456	 * because when we hold the target mft record and we call
4457	 * ntfs_dir_entry_delete() this would cause the mft record for the
4458	 * directory to be mapped which could result in a deadlock in the event
4459	 * that both mft records are in the same page.
4460	 */
4461	tfn_alloc = le32_to_cpu(a->value_length);
4462	tfn = OSMalloc(tfn_alloc, ntfs_malloc_tag);
4463	if (!tfn) {
4464		/*
4465		 * TODO: If @seek_type == FILENAME_WIN32 &&
4466		 * @fn->filename_type == FILENAME_POSIX we need to update the
4467		 * directory entry filename_type to FILENAME_POSIX.  See below
4468		 * for how this is done for the error case in
4469		 * ntfs_dir_entry_delete().  Given a memory allocation just
4470		 * failed it is highly unlikely we would succeed in trying to
4471		 * look up the directory entry so that we could change the
4472		 * filename_type in it so at least for now just set the volume
4473		 * has errors flag instead.
4474		 */
4475		ntfs_error(vol->mp, "Failed to allocate memory for temporary "
4476				"filename attribute.  Leaving inconsistent "
4477				"metadata.  Run chkdsk.");
4478		NVolSetErrors(vol);
4479		err = EIO;
4480		goto put_err;
4481	}
4482	memcpy(tfn, fn, tfn_alloc);
4483	ntfs_attr_search_ctx_put(actx);
4484	ntfs_mft_record_unmap(ni);
4485	/*
4486	 * We copied the name and can now remove it from the directory index.
4487	 * If the name is in the POSIX namespace, we may have converted it from
4488	 * a pure WIN32 name after removing the corresponding DOS name, in
4489	 * which case we need to update the index entry to reflect the
4490	 * conversion should we fail to remove it from the directory index.
4491	 * ntfs_dir_entry_delete() takes care of this for us.
4492	 */
4493	err = ntfs_dir_entry_delete(dir_ni, ni, tfn, tfn_alloc);
4494	if (err) {
4495		ntfs_error(vol->mp, "Failed to delete directory index entry "
4496				"(error %d).", err);
4497		goto err;
4498	}
4499	/*
4500	 * Now get back the mft record.
4501	 *
4502	 * If getting back the mft record fails there is nothing we can do to
4503	 * recover and must bail out completely leaving inconsistent metadata.
4504	 *
4505	 * TODO: We could try to add the dir entry back again in an attempt to
4506	 * recover but as above we likely fail a memory allocation it is highly
4507	 * unlikely we would succeed in trying to do the lookup and addition of
4508	 * the directory entry.
4509	 */
4510	err = ntfs_mft_record_map(ni, &m);
4511	if (err) {
4512		ntfs_error(vol->mp, "Failed to re-map mft record 0x%llx "
4513				"(error %d).  Leaving inconsistent metadata.  "
4514				"Run chkdsk.", (unsigned long long)ni->mft_no,
4515				err);
4516		NVolSetErrors(vol);
4517		goto err;
4518	}
4519	actx = ntfs_attr_search_ctx_get(ni, m);
4520	if (!actx) {
4521		ntfs_error(vol->mp, "Failed to re-get attribute search "
4522				"context for mft record 0x%llx (error %d).  "
4523				"Leaving inconsitent metadata.  Run chkdsk.",
4524				(unsigned long long)ni->mft_no, err);
4525		NVolSetErrors(vol);
4526		err = EIO;
4527		goto unm_err;
4528	}
4529	/*
4530	 * If the name is in the DOS namespace or this is not the last name we
4531	 * also need to remove the name from the mft record it is in and
4532	 * decrement the link count in the base mft record.
4533	 */
4534	if (fn_type == FILENAME_DOS || ni->link_count > 1) {
4535		/* Now need to re-lookup the target filename attribute. */
4536		while (fn_count > 0) {
4537			fn_count--;
4538			err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0,
4539					NULL, 0, actx);
4540			if (!err)
4541				continue;
4542			ntfs_error(vol->mp, "Failed to re-look up target "
4543					"filename in mft_no 0x%llx (error %d).",
4544					(unsigned long long)ni->mft_no, err);
4545			NVolSetErrors(vol);
4546			err = EIO;
4547			goto put_err;
4548		}
4549		a = actx->a;
4550		if (a->type != AT_FILENAME)
4551			panic("%s(): a->type (0x%x) != AT_FILENAME (0x30)\n",
4552					__FUNCTION__, le32_to_cpu(a->type));
4553		fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
4554		if (fn_type != fn->filename_type)
4555			panic("%s(): fn_type != fn->filename_type\n",
4556					__FUNCTION__);
4557		/* Remove the filename from the mft record, too. */
4558		err = ntfs_attr_record_delete(ni, actx);
4559		if (err) {
4560			ntfs_error(vol->mp, "Failed to delete filename "
4561					"attribute from mft_no 0x%llx (error "
4562					"%d).", (unsigned long long)ni->mft_no,
4563					err);
4564			NVolSetErrors(vol);
4565			err = EIO;
4566			goto put_err;
4567		}
4568		/*
4569		 * Update the hard link count in the base mft record.  Note we
4570		 * subtract one from the inode link count if this is a rename
4571		 * as the link count has been elevated by one by the caller.
4572		 */
4573		m->link_count = cpu_to_le16(ni->link_count - 1 -
4574				(is_rename ? 1 : 0));
4575	} else /* if (fn_type != FILENAME_DOS && ni->link_count <= 1) */ {
4576		/*
4577		 * This is the last name, so we need to mark the mft record as
4578		 * unused in the mft record flags so no-one can open it by
4579		 * accident and so that, in case of a crash between now and the
4580		 * deletion of the inode, ntfsck will know that we meant to
4581		 * delete the inode rather than that we were in the process of
4582		 * allocating or renaming it so it will do the Right Thing(TM)
4583		 * and complete the deletion process.
4584		 */
4585		m->flags &= ~MFT_RECORD_IN_USE;
4586		/* Ensure the base mft record gets written out. */
4587		NInoSetMrecNeedsDirtying(ni);
4588	}
4589	/*
4590	 * We have either deleted the filename completely or we only removed
4591	 * the directory index entry if this is the last name.
4592	 *
4593	 * In either case, we need to update the hard link count and the ctime
4594	 * in the ntfs inode (the ctime is the last_mft_change_time on NTFS).
4595	 */
4596	ni->link_count--;
4597	ni->last_mft_change_time = dir_ni->last_mft_change_time;
4598	NInoSetDirtyTimes(ni);
4599	/*
4600	 * If this is the DOS name, we now need to find the WIN32 name, so it
4601	 * can be deleted, too.  Otherwise we are done.
4602	 */
4603	if (fn_type == FILENAME_DOS) {
4604		seek_type = FILENAME_WIN32;
4605		/*
4606		 * We looked up the DOS name above thus we need to reinitialize
4607		 * the search context for the WIN32 name lookup.
4608		 */
4609		ntfs_attr_search_ctx_reinit(actx);
4610		fn_count = 0;
4611		goto restart_name;
4612	}
4613	/*
4614	 * If we removed a hard link but the inode is not deleted yet we need
4615	 * to remove the parent vnode from the vnode as this association may no
4616	 * longer exist.
4617	 *
4618	 * The same is true for the vnode name as we have just unlinked it.
4619	 *
4620	 * Note we skip this for the rename case because the subsequent call to
4621	 * ntfs_link_internal() is going to update the vnode identity with the
4622	 * new name and parent so there is no need to wipe them here.
4623	 */
4624	if (ni->link_count > 0 && !is_rename)
4625		vnode_update_identity(ni->vn, NULL, NULL, 0, 0,
4626				VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
4627	ntfs_debug("Done.");
4628put_err:
4629	ntfs_attr_search_ctx_put(actx);
4630unm_err:
4631	ntfs_mft_record_unmap(ni);
4632err:
4633	return err;
4634iput_err:
4635	if (ictx)
4636		ntfs_index_ctx_put(ictx);
4637	lck_rw_unlock_exclusive(&objid_o_ni->lock);
4638	(void)vnode_put(objid_o_ni->vn);
4639	return err;
4640}
4641
4642/**
4643 * ntfs_unlink - unlink and ntfs inode from its parent directory
4644 * @dir_ni:	directory ntfs inode from which to unlink the ntfs inode
4645 * @ni:		base ntfs inode to unlink
4646 * @cn:		name of the inode to unlink
4647 * @flags:	flags describing the unlink request
4648 * @is_rmdir:	true if called from VNOP_RMDIR() and hence ntfs_vnop_rmdir()
4649 *
4650 * Unlink an inode with the ntfs inode @ni and name as specified in @cn from
4651 * the directory with ntfs inode @dir_ni.
4652 *
4653 * The flags in @flags further describe the unlink request.  The following
4654 * flags are currently defined in OS X kernel:
4655 *	VNODE_REMOVE_NODELETEBUSY	- Do not delete busy files, i.e. use
4656 *					  Carbon delete semantics).
4657 *
4658 * If @is_rmdir is true the caller is VNOP_RMDIR() and hence ntfs_vnop_rmdir()
4659 * and if @is_rmdir is false the caller is VNOP_REMOVE() and hence
4660 * ntfs_vnop_remove().  Note @flags is always zero if @is_rmdir is true.
4661 *
4662 * Return 0 on success and the error code on error.
4663 *
4664 * Note that if the name of the inode to be removed is in the WIN32 or DOS
4665 * namespaces, both the WIN32 and the corresponding DOS names are removed.
4666 *
4667 * Note that for a hard link this function simply removes the name and its
4668 * directory entry and decrements the hard link count whilst for the last name,
4669 * i.e. the last link to an inode, it only removes the directory entry, i.e. it
4670 * does not remove the name, however it does decrement the hard link count to
4671 * zero.  This is so that the inode can be undeleted and its original name
4672 * restored.  In any case, we do not actually delete the inode here as it may
4673 * still be open and UNIX semantics require an unlinked inode to be still
4674 * accessible through already opened file descriptors.  When the last file
4675 * descriptor is closed, we causes the inode to be deleted when the VFS
4676 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
4677 * ntfs_vnop_inactive().
4678 */
4679static errno_t ntfs_unlink(ntfs_inode *dir_ni, ntfs_inode *ni,
4680		struct componentname *cn, const int flags, const BOOL is_rmdir)
4681{
4682	MFT_REF mref;
4683	ntfs_volume *vol;
4684	ntfs_inode *objid_o_ni;
4685	ntfschar *ntfs_name;
4686	ntfs_dir_lookup_name *name = NULL;
4687	size_t ntfs_name_size;
4688	signed ntfs_name_len;
4689	errno_t err;
4690	FILENAME_TYPE_FLAGS ntfs_name_type;
4691	ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN];
4692
4693	vol = ni->vol;
4694	objid_o_ni = vol->objid_o_ni;
4695	ntfs_debug("Unlinking %s%.*s with mft_no 0x%llx from directory "
4696			"mft_no 0x%llx, flags 0x%x.",
4697			is_rmdir ? "directory " : "", (int)cn->cn_namelen,
4698			cn->cn_nameptr, (unsigned long long)ni->mft_no,
4699			(unsigned long long)dir_ni->mft_no, flags);
4700	/*
4701	 * Do not allow attribute inodes or raw inodes to be deleted.  Note
4702	 * raw inodes are always attribute inodes, too.
4703	 */
4704	if (NInoAttr(ni)) {
4705		ntfs_debug("Target %.*s, mft_no 0x%llx is a%s inode, "
4706				"returning EPERM.", (int)cn->cn_namelen,
4707				cn->cn_nameptr, (unsigned long long)ni->mft_no,
4708				NInoAttr(ni) ? "n attribute" : " raw");
4709		return EPERM;
4710	}
4711	/* The parent inode must be a directory. */
4712	if (!S_ISDIR(dir_ni->mode)) {
4713		ntfs_debug("Parent mft_no 0x%llx is not a directory, "
4714				"returning ENOTDIR.",
4715				(unsigned long long)dir_ni->mft_no);
4716		return ENOTDIR;
4717	}
4718	/* Check for "." removal. */
4719	if (ni == dir_ni) {
4720		ntfs_debug("Target %.*s, mft_no 0x%llx is the same as its "
4721				"parent directory, returning EINVAL.",
4722				(int)cn->cn_namelen, cn->cn_nameptr,
4723				(unsigned long long)ni->mft_no);
4724		return EINVAL;
4725	}
4726	/* Lock both the parent directory and the target inode for writing. */
4727	lck_rw_lock_exclusive(&dir_ni->lock);
4728	lck_rw_lock_exclusive(&ni->lock);
4729	/* Ensure the parent directory has not been deleted. */
4730	if (!dir_ni->link_count) {
4731		ntfs_debug("Parent directory mft_no 0x%llx has been deleted, "
4732				"returning ENOENT.",
4733				(unsigned long long)dir_ni->mft_no);
4734		/*
4735		 * If the directory is somehow still in the name cache remove
4736		 * it now.
4737		 */
4738		cache_purge(dir_ni->vn);
4739		err = ENOENT;
4740		goto err;
4741	}
4742	/* Ensure tha target has not been deleted by someone else already. */
4743	if (!ni->link_count) {
4744		ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, "
4745				"returning ENOENT.", (int)cn->cn_namelen,
4746				cn->cn_nameptr, (unsigned long long)ni->mft_no);
4747		/*
4748		 * If the target is somehow still in the name cache remove it
4749		 * now.
4750		 */
4751		cache_purge(ni->vn);
4752		err = ENOENT;
4753		goto err;
4754	}
4755	/*
4756	 * If this is a directory removal, i.e. rmdir, need to check that the
4757	 * directory is empty.
4758	 *
4759	 * Note we already checked for "." removal and we do not need to check
4760	 * for ".." removal because that would fail the directory is empty
4761	 * check as the parent directory would at least have one entry and that
4762	 * is the current directory.
4763	 */
4764	if (is_rmdir) {
4765		err = ntfs_dir_is_empty(ni);
4766		if (err) {
4767			if (err == ENOTEMPTY)
4768				ntfs_debug("Target directory %.*s, mft_no "
4769						"0x%llx is not empty, "
4770						"returning ENOTEMPTY.",
4771						(int)cn->cn_namelen,
4772						cn->cn_nameptr,
4773						(unsigned long long)ni->mft_no);
4774			else
4775				ntfs_error(vol->mp, "Failed to determine if "
4776						"target directory %.*s, "
4777						"mft_no 0x%llx is empty "
4778						"(error %d).",
4779						(int)cn->cn_namelen,
4780						cn->cn_nameptr,
4781						(unsigned long long)ni->mft_no,
4782						err);
4783			goto err;
4784		}
4785	} else {
4786		/* Do not allow directories to be unlinked. */
4787		if (S_ISDIR(ni->mode)) {
4788			ntfs_debug("Target %.*s, mft_no 0x%llx is a "
4789					"directory, returning EPERM.",
4790					(int)cn->cn_namelen, cn->cn_nameptr,
4791					(unsigned long long)ni->mft_no);
4792			err = EPERM;
4793			goto err;
4794		}
4795	}
4796	/*
4797	 * Do not allow any of the system files to be deleted.
4798	 *
4799	 * For NTFS 3.0+ volumes do not allow any of the extended system files
4800	 * to be deleted, either.
4801	 *
4802	 * Note we specifically blacklist all system files that we make use of
4803	 * except for the transaction log $UsnJrnl as that is allowed to be
4804	 * deleted and its deletion means that transaction logging is disabled.
4805	 *
4806	 * Note that if the transaction log is present it will be held busy by
4807	 * the NTFS driver thus unlinking the $UsnJrnl will not actually delete
4808	 * it until the driver is unmounted.  FIXME: Should we leave it like
4809	 * this or should we detach the $UsnJrnl vnodes from the volume and
4810	 * release them so they can be deleted immediately?
4811	 *
4812	 * TODO: What about all the new metadata files introduced with Windows
4813	 * Vista?  We are currently ignoring them and allowing them to be
4814	 * deleted...
4815	 */
4816	if (ni->file_attributes & FILE_ATTR_SYSTEM) {
4817		BOOL is_system = FALSE;
4818		if (vol->major_ver <= 1) {
4819			if (ni->mft_no < FILE_Extend)
4820				is_system = TRUE;
4821		} else {
4822			if (ni->mft_no <= FILE_Extend)
4823				is_system = TRUE;
4824			if (dir_ni == vol->extend_ni) {
4825				if (ni == vol->objid_ni ||
4826						ni == vol->quota_ni)
4827					is_system = TRUE;
4828			}
4829		}
4830		if (is_system) {
4831			ntfs_debug("Target %.*s, mft_no 0x%llx is a%s system "
4832					"file, returning EPERM.",
4833					(int)cn->cn_namelen, cn->cn_nameptr,
4834					(unsigned long long)ni->mft_no,
4835					(dir_ni == vol->extend_ni) ?
4836					"n extended" : "");
4837			err = EPERM;
4838			goto err;
4839		}
4840	}
4841	/*
4842	 * Ensure the file is not read-only (the read-only bit is ignored for
4843	 * directories.
4844	 */
4845	if (!S_ISDIR(ni->mode) && ni->file_attributes & FILE_ATTR_READONLY) {
4846		ntfs_debug("Target %.*s, mft_no 0x%llx is marked read-only, "
4847				"returning EPERM.", (int)cn->cn_namelen,
4848				cn->cn_nameptr,
4849				(unsigned long long)ni->mft_no);
4850		err = EPERM;
4851		goto err;
4852	}
4853	/*
4854	 * If the inode is a reparse point or if the inode is offline we cannot
4855	 * remove a name from it yet.  TODO: Implement this.
4856	 */
4857	if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
4858			FILE_ATTR_OFFLINE)) {
4859		ntfs_error(vol->mp, "Target %.*s, mft_no 0x%llx is %s.  "
4860				"Deleting names from such inodes is not "
4861				"supported yet, returning ENOTSUP.",
4862				(int)cn->cn_namelen, cn->cn_nameptr,
4863				(unsigned long long)ni->mft_no,
4864				ni->file_attributes & FILE_ATTR_REPARSE_POINT ?
4865				"a reparse point" : "offline");
4866		err = ENOTSUP;
4867		goto err;
4868	}
4869	/*
4870	 * If Carbon delete semantics are requested, do not allow busy files to
4871	 * be unlinked.  Note we do not use vnode_isinuse() as that accounts
4872	 * for open named streams/extended attributes as well which we do not
4873	 * care about.  We only care for actually opened files thus we keep
4874	 * track of them ourselves.
4875	 */
4876	if (flags & VNODE_REMOVE_NODELETEBUSY && ni->nr_opens) {
4877		ntfs_debug("Target %.*s, mft_no 0x%llx is busy (nr_opens "
4878				"0x%x) and Carbon delete semantics were "
4879				"requested, returning EBUSY.",
4880				(int)cn->cn_namelen, cn->cn_nameptr,
4881				(unsigned long long)ni->mft_no,
4882				(unsigned)ni->nr_opens);
4883		err = EBUSY;
4884		goto err;
4885	}
4886	/*
4887	 * We need to make sure the target still has the name specified in @cn
4888	 * that is being unlinked.  It could have been unlinked or renamed
4889	 * before we took the locks on the parent directory and the target.
4890	 *
4891	 * To do this, first convert the name of the target from utf8 to
4892	 * Unicode then look up the converted name in the directory index.
4893	 */
4894	ntfs_name = ntfs_name_buf;
4895	ntfs_name_size = sizeof(ntfs_name_buf);
4896	ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
4897			&ntfs_name, &ntfs_name_size);
4898	if (ntfs_name_len < 0) {
4899		err = -ntfs_name_len;
4900		if (err == ENAMETOOLONG)
4901			ntfs_debug("Failed (name is too long).");
4902		else
4903			ntfs_error(vol->mp, "Failed to convert name to "
4904					"Unicode (error %d).", err);
4905		goto err;
4906	}
4907	err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len,
4908			&mref, &name);
4909	if (err) {
4910		if (err != ENOENT) {
4911			ntfs_error(vol->mp, "Failed to find name in directory "
4912					"(error %d).", err);
4913			goto err;
4914		}
4915enoent:
4916		/*
4917		 * The name does not exist in the directory @dir_ni.
4918		 *
4919		 * This means someone renamed or deleted the name from the
4920		 * directory before we managed to take the locks.
4921		 */
4922		ntfs_debug("Target %.*s, mft_no 0x%llx has been renamed or "
4923				"deleted already, returning ENOENT.",
4924				(int)cn->cn_namelen, cn->cn_nameptr,
4925				(unsigned long long)ni->mft_no);
4926		/*
4927		 * If the target is somehow still in the name cache remove it
4928		 * now.
4929		 */
4930		cache_purge(ni->vn);
4931		err = ENOENT;
4932		goto err;
4933	}
4934	/*
4935	 * We found the target name in the directory index but does it still
4936	 * point to the same mft record?  The sequence number check ensures the
4937	 * inode was not deleted and recreated with the same name and the same
4938	 * mft record number.
4939	 */
4940	if (mref != MK_MREF(ni->mft_no, ni->seq_no))
4941		goto enoent;
4942	/*
4943	 * We are going to go ahead with unlinking the target.
4944	 *
4945	 * There are several different types of outcome from the above lookup
4946	 * that need to be handled.
4947	 *
4948	 * If @name is NULL @ntfs_name contains the correctly cased name thus
4949	 * we can simply look for that.  In this case we set the name type to 0
4950	 * as we do not know which namespace the name is in.
4951	 *
4952	 * If @name is not NULL the correctly cased name is in @name->name thus
4953	 * we look for that.  In this case we do know which namespace the name
4954	 * is in as it is @name->type.
4955	 */
4956	ntfs_name_type = 0;
4957	if (name) {
4958		ntfs_name = name->name;
4959		ntfs_name_len = name->len;
4960		ntfs_name_type = name->type;
4961	}
4962	/* Now we can perform the actual unlink. */
4963	err = ntfs_unlink_internal(dir_ni, ni, ntfs_name, ntfs_name_len,
4964			ntfs_name_type, FALSE);
4965	if (err)
4966		ntfs_error(vol->mp, "Failed to unlink %.*s with mft_no 0x%llx "
4967				"from directory mft_no 0x%llx (error %d).",
4968				(int)cn->cn_namelen, cn->cn_nameptr,
4969				(unsigned long long)ni->mft_no,
4970				(unsigned long long)dir_ni->mft_no, err);
4971	else
4972		ntfs_debug("Done.");
4973err:
4974	if (name)
4975		OSFree(name, sizeof(*name), ntfs_malloc_tag);
4976	lck_rw_unlock_exclusive(&ni->lock);
4977	lck_rw_unlock_exclusive(&dir_ni->lock);
4978	return err;
4979}
4980
4981/**
4982 * ntfs_vnop_remove - unlink a file
4983 * @a:		arguments to remove function
4984 *
4985 * @a contains:
4986 *	vnode_t a_dvp;			directory from which to unlink the file
4987 *	vnode_t a_vp;			file to unlink
4988 *	struct componentname *a_cnp;	name of the file to unlink
4989 *	int a_flags;			flags describing the unlink request
4990 *	vfs_context_t a_context;
4991 *
4992 * Unlink a file with vnode @a->a_vp and name as specified in @a->a_cnp form
4993 * the directory with vnode @a->a_dvp.
4994 *
4995 * The flags in @a->a_flags further describe the unlink request.  The following
4996 * flags are currently defined in OS X kernel:
4997 *	VNODE_REMOVE_NODELETEBUSY	- Do not delete busy files, i.e. use
4998 *					  Carbon delete semantics).
4999 *
5000 * Return 0 on success and errno on error.
5001 *
5002 * Note that if the name of the inode to be removed is in the WIN32 or DOS
5003 * namespaces, both the WIN32 and the corresponding DOS names are removed.
5004 *
5005 * Note that for a hard link this function simply removes the name and its
5006 * directory entry and decrements the hard link count whilst for the last name,
5007 * i.e. the last link to an inode, it only removes the directory entry, i.e. it
5008 * does not remove the name, however it does decrement the hard link count to
5009 * zero.  This is so that the inode can be undeleted and its original name
5010 * restored.  In any case, we do not actually delete the inode here as it may
5011 * still be open and UNIX semantics require an unlinked inode to be still
5012 * accessible through already opened file descriptors.  When the last file
5013 * descriptor is closed, we causes the inode to be deleted when the VFS
5014 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
5015 * ntfs_vnop_inactive().
5016 */
5017static int ntfs_vnop_remove(struct vnop_remove_args *a)
5018{
5019	ntfs_inode *dir_ni = NTFS_I(a->a_dvp);
5020	ntfs_inode *ni = NTFS_I(a->a_vp);
5021	errno_t err;
5022
5023	if (!dir_ni || !ni) {
5024		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
5025		return EINVAL;
5026	}
5027	ntfs_debug("Entering.");
5028	err = ntfs_unlink(NTFS_I(a->a_dvp), NTFS_I(a->a_vp), a->a_cnp,
5029			a->a_flags, FALSE);
5030	ntfs_debug("Done (error %d).", (int)err);
5031	return err;
5032}
5033
5034/**
5035 * ntfs_link_internal - create a hard link to an inode
5036 * @ni:		base ntfs inode to create hard link to
5037 * @dir_ni:	directory ntfs inode in which to create the hard link
5038 * @cn:		componentname specifying name of the hard link to create
5039 * @is_rename:	if true ntfs_link_internal() is called for a rename
5040 * @name:	Unicode name of the hard link to create
5041 * @name_len:	length of the name in Unicode characters
5042 *
5043 * Create a hard link to the ntfs inode @ni with name as specified in @cn in
5044 * the directory ntfs inode @dir_ni.
5045 *
5046 * If @is_rename is true the caller was ntfs_vnop_rename() in which case the
5047 * link count of the inode to link to will be one higher than the link count in
5048 * the mft record and @name and @name_len specify the Unicode name and length
5049 * in Unicode characters corresponding to @cn, respectively so we do not have
5050 * to convert @cn to Unicode in this case.
5051 *
5052 * If @is_rename is false then @name and @name_len are undefined.
5053 *
5054 * Return 0 on success and errno on error.
5055 *
5056 * Note we always create filenames in the POSIX namespace.
5057 */
5058static errno_t ntfs_link_internal(ntfs_inode *ni, ntfs_inode *dir_ni,
5059		struct componentname *cn, const BOOL is_rename,
5060		const ntfschar *name, const signed name_len)
5061{
5062	ntfs_volume *vol;
5063	FILENAME_ATTR *fn;
5064	ntfschar *ntfs_name;
5065	MFT_RECORD *m;
5066	ntfs_attr_search_ctx *ctx;
5067	size_t ntfs_name_size;
5068	signed ntfs_name_len;
5069	unsigned fn_alloc, fn_size;
5070	errno_t err, err2;
5071	BOOL is_dir;
5072
5073	vol = ni->vol;
5074	ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in "
5075			"directory mft_no 0x%llx.",
5076			(unsigned long long)ni->mft_no, (int)cn->cn_namelen,
5077			cn->cn_nameptr, (unsigned long long)dir_ni->mft_no);
5078	if (NInoAttr(ni))
5079		panic("%s(): Inode to link to is an attribute/raw inode.\n",
5080				__FUNCTION__);
5081	is_dir = S_ISDIR(ni->mode);
5082	/*
5083	 * Create a temporary filename attribute so we can find the correct
5084	 * place to insert it into.  We also need a temporary copy so we can
5085	 * release the mft record before we add the directory entry.  This is
5086	 * needed because when we hold the mft record for the inode and we call
5087	 * ntfs_dir_entry_add() this would cause the mft record for the
5088	 * directory to be mapped which would result in a deadlock in the event
5089	 * that both mft records are in the same page.
5090	 */
5091	fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar);
5092	fn = OSMalloc(fn_alloc, ntfs_malloc_tag);
5093	if (!fn) {
5094		ntfs_error(vol->mp, "Failed to allocate memory for temporary "
5095				"filename attribute.");
5096		err = ENOMEM;
5097		goto err;
5098	}
5099	bzero(fn, fn_alloc);
5100	/* Begin setting up the temporary filename attribute. */
5101	fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no);
5102	/* FILENAME_POSIX is zero and the attribute is already zeroed. */
5103	/* fn->filename_type = FILENAME_POSIX; */
5104	/*
5105	 * If this is not a rename then convert the name from utf8 to Unicode.
5106	 * If this is a rename on the other hand then we have the name in
5107	 * Unicode already so just copy that over.
5108	 */
5109	ntfs_name = fn->filename;
5110	ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar);
5111	if (!is_rename) {
5112		ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr,
5113				cn->cn_namelen, &ntfs_name, &ntfs_name_size);
5114		if (ntfs_name_len < 0) {
5115			err = -ntfs_name_len;
5116			if (err == ENAMETOOLONG)
5117				ntfs_debug("Failed (name is too long).");
5118			else
5119				ntfs_error(vol->mp, "Failed to convert name to "
5120						"Unicode (error %d).", err);
5121			goto err;
5122		}
5123	} else {
5124		memcpy(ntfs_name, name, name_len * sizeof(ntfschar));
5125		ntfs_name_len = name_len;
5126	}
5127	/* Set the filename length in the temporary filename attribute. */
5128	fn->filename_length = ntfs_name_len;
5129	fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar);
5130	/*
5131	 * Copy the times from the standard information attribute which we have
5132	 * cached in the ntfs inode.
5133	 */
5134	fn->creation_time = utc2ntfs(ni->creation_time);
5135	fn->last_data_change_time = utc2ntfs(ni->last_data_change_time);
5136	fn->last_mft_change_time = utc2ntfs(ni->last_mft_change_time);
5137	fn->last_access_time = utc2ntfs(ni->last_access_time);
5138	if (!is_dir) {
5139		lck_spin_lock(&ni->size_lock);
5140		fn->allocated_size = cpu_to_sle64(NInoNonResident(ni) &&
5141				(NInoSparse(ni) || NInoCompressed(ni)) ?
5142				ni->compressed_size : ni->allocated_size);
5143		fn->data_size = cpu_to_sle64(ni->data_size);
5144		lck_spin_unlock(&ni->size_lock);
5145	} else {
5146		/*
5147		 * Directories use 0 for the sizes in the filename attribute
5148		 * and the attribute is already zeroed.
5149		 */
5150		/* fn->data_size = fn->allocated_size = 0; */
5151	}
5152	/*
5153	 * If this is not a directory or it is an encrypted directory, set the
5154	 * needs archiving bit except for the core system files.
5155	 */
5156	fn->file_attributes = ni->file_attributes;
5157	if (!is_dir || NInoEncrypted(ni)) {
5158		BOOL need_set_archive_bit = TRUE;
5159		if (vol->major_ver >= 2) {
5160			if (ni->mft_no <= FILE_Extend)
5161				need_set_archive_bit = FALSE;
5162		} else {
5163			if (ni->mft_no <= FILE_UpCase)
5164				need_set_archive_bit = FALSE;
5165		}
5166		if (need_set_archive_bit) {
5167			ni->file_attributes |= FILE_ATTR_ARCHIVE;
5168			fn->file_attributes = ni->file_attributes;
5169			NInoSetDirtyFileAttributes(ni);
5170		}
5171	}
5172	/*
5173	 * Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set
5174	 * in their filename attributes both in their mft records and in the
5175	 * index entries pointing to them but not in the standard information
5176	 * attribute which is why it is not set in @ni->file_attributes.
5177	 */
5178	if (is_dir)
5179		fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT;
5180	/*
5181	 * TODO: We need to find out whether it is true that ea_length takes
5182	 * precedence over reparse_tag, i.e. we need to check that if both EAs
5183	 * are present and this is a reparse point, we need to set the
5184	 * ea_length rather than the reparse_tag.  So far I have not been able
5185	 * to create EAs on a reparse point and vice versa so perhaps the two
5186	 * are mutually exclusive in which case we are fine...
5187	 *
5188	 * The attribute is already zeroed so no need to set anything to zero.
5189	 */
5190#if 0
5191	if (ni->ea_length) {
5192		fn->ea_length = cpu_to_le16(ni->ea_length);
5193		/* fn->reserved = 0; */
5194	} else if (ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
5195		// TODO: Instead of zero use actual value if/when we enable
5196		// creating hard links to reparse points...
5197		/* fn->reparse_tag = 0; */
5198	} else {
5199		/*
5200		 * We need to initialize the unused field to zero but as we
5201		 * have already zeroed the attribute we do not need to do
5202		 * anything now.
5203		 */
5204		/* fn->reparse_tag = 0; */
5205	}
5206#endif
5207	/*
5208	 * Add the created filename attribute to the parent directory index.
5209	 *
5210	 * We know @ni is the base inode since we bailed out for attribute
5211	 * inodes above so we can use it to generate the mft reference.
5212	 */
5213	err = ntfs_dir_entry_add(dir_ni, fn, fn_size,
5214			MK_LE_MREF(ni->mft_no, ni->seq_no));
5215	if (err)
5216		goto err;
5217	/*
5218	 * The ea_length and reparse_tag are only set in the directory index
5219	 * entries and not in filename attributes in the mft record so zero
5220	 * them here, before adding the filename attribute to the mft record.
5221	 */
5222	fn->reparse_tag = 0;
5223	/*
5224	 * Add the created filename attribute to the mft record as well.
5225	 *
5226	 * Again, we know @ni is the base inode.
5227	 */
5228	err = ntfs_mft_record_map(ni, &m);
5229	if (err) {
5230		ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error "
5231				"%d).", (unsigned long long)ni->mft_no, err);
5232		goto rm_err;
5233	}
5234	ctx = ntfs_attr_search_ctx_get(ni, m);
5235	if (!ctx) {
5236		err = ENOMEM;
5237		goto unm_err;
5238	}
5239	err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, fn, fn_size, ctx);
5240	if (err != ENOENT) {
5241		if (!err) {
5242			ntfs_debug("Failed (filename already present in "
5243					"inode.");
5244			err = EEXIST;
5245		} else
5246			ntfs_error(vol->mp, "Failed to add filename to mft_no "
5247					"0x%llx because looking up the "
5248					"filename in the mft record failed "
5249					"(error %d).",
5250					(unsigned long long)ni->mft_no, err);
5251		goto put_err;
5252	}
5253	/*
5254	 * The current implementation of ntfs_attr_lookup() will always return
5255	 * pointing into the base mft record when an attribute was not found.
5256	 */
5257	if (ni != ctx->ni)
5258		panic("%s(): ni != ctx->ni\n", __FUNCTION__);
5259	if (m != ctx->m)
5260		panic("%s(): m != ctx->m\n", __FUNCTION__);
5261	/*
5262	 * @ctx->a now points to the location in the mft record at which we
5263	 * need to insert the filename attribute, so insert it now.
5264	 *
5265	 * Note we ignore the case where @ctx->is_error is true because we do
5266	 * not need the attribute any more for anything after it has been
5267	 * inserted so we do not care that we failed to map its mft record.
5268	 */
5269	err = ntfs_resident_attr_record_insert(ni, ctx, AT_FILENAME, NULL, 0,
5270			fn, fn_size);
5271	if (err) {
5272		ntfs_error(vol->mp, "Failed to add filename to mft_no 0x%llx "
5273				"because inserting the filename attribute "
5274				"failed (error %d).",
5275				(unsigned long long)ni->mft_no, err);
5276		goto put_err;
5277	}
5278	/*
5279	 * Update the hard link count in the mft record.  Note we subtract one
5280	 * from the inode link count if this is a rename as the link count has
5281	 * been elevated by one by the caller.
5282	 */
5283	ni->link_count++;
5284	m->link_count = cpu_to_le16(ni->link_count - (is_rename ? 1 : 0));
5285	/*
5286	 * Update the ctime in the inode by copying it from the target
5287	 * directory inode where it will have been updated by the above call to
5288	 * ntfs_dir_entry_add().
5289	 */
5290	ni->last_mft_change_time = dir_ni->last_mft_change_time;
5291	NInoSetDirtyTimes(ni);
5292	/*
5293	 * Invalidate negative cache entries in the directory.  We need to do
5294	 * this because there may be negative cache entries which would match
5295	 * the name of the just created inode but in a different case.  Such
5296	 * negative cache entries would now be incorrect thus we need to throw
5297	 * away all negative cache entries to ensure there cannot be any
5298	 * incorrectly negative entries in the name cache.
5299	 */
5300	cache_purge_negatives(dir_ni->vn);
5301	/*
5302	 * We should add the new hard link to the name cache.  Problem is that
5303	 * this is likely not to be a useful thing to do as the original name
5304	 * is likely in the name cache already and the OS X name cache only
5305	 * allows one name per vnode and cache_enter() simply returns without
5306	 * doing anything if a name is already present in the name cache for
5307	 * the vnode.  Thus we could use vnode_update_identity() instead to
5308	 * switch the cached name from the original name to the new hard link.
5309	 *
5310	 * FIXME: The question is whether this is a useful thing to do.  On the
5311	 * one hand people creating a hard link are likely to want to then
5312	 * access the inode via the new name but on the other hand hard links
5313	 * are often used in applications for locking purposes and in this case
5314	 * after the hard link is created the application is likely to unlink
5315	 * the original name thus it would be beneficial if that remains in the
5316	 * cache until this happens which will automatically remove the name
5317	 * from the name cache and the next lookup of the new name will insert
5318	 * the new one.  Thus it is best if we do nothing at all now.  If OS X
5319	 * ever allows multiple name links per vnode we can uncomment the below
5320	 * cache_enter() call.
5321	 *
5322	 * For the rename case we have just removed the original name, thus it
5323	 * makes sense to add the new name now and whilst at it also update the
5324	 * vnode identity with the new name and parent as the old ones are no
5325	 * longer valid.
5326	 */
5327	if (is_rename) {
5328		vnode_update_identity(ni->vn, dir_ni->vn, cn->cn_nameptr,
5329				cn->cn_namelen, cn->cn_hash,
5330				VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
5331		cache_enter(dir_ni->vn, ni->vn, cn);
5332		cn->cn_flags &= ~MAKEENTRY;
5333	}
5334	/*
5335	 * Ensure the base mft record is written to disk.
5336	 *
5337	 * Note we do not set any of the NInoDirty*() flags because we have
5338	 * just created the inode thus all the fields are in sync between the
5339	 * ntfs_inode @ni and its mft record @m.
5340	 *
5341	 * Also note we defer the unmapping of the mft record to here so that
5342	 * we do not get racing time updates, etc during concurrent runs of
5343	 * link(2) and rename(2) where the source inode for the rename is the
5344	 * inode that has a new hardlink created to it at the same time.  This
5345	 * case can happen because we do not lock the source inode in
5346	 * ntfs_vnop_rename().
5347	 */
5348	NInoSetMrecNeedsDirtying(ni);
5349	/* We are done with the mft record. */
5350	ntfs_attr_search_ctx_put(ctx);
5351	ntfs_mft_record_unmap(ni);
5352	/* Free the temporary filename attribute. */
5353	OSFree(fn, fn_alloc, ntfs_malloc_tag);
5354	ntfs_debug("Done.");
5355	return 0;
5356put_err:
5357	ntfs_attr_search_ctx_put(ctx);
5358unm_err:
5359	ntfs_mft_record_unmap(ni);
5360rm_err:
5361#if 0
5362	if (ni->ea_length) {
5363		fn->ea_length = cpu_to_le16(ni->ea_length);
5364		/* fn->reserved = 0; */
5365	} else if (ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
5366		// TODO: Instead of zero use actual value if/when we enable
5367		// creating hard links to reparse points...
5368		/* fn->reparse_tag = 0; */
5369	} else {
5370		/*
5371		 * We need to initialize the unused field to zero but as we
5372		 * have already zeroed the attribute we do not need to do
5373		 * anything now.
5374		 */
5375		/* fn->reparse_tag = 0; */
5376	}
5377#endif
5378	err2 = ntfs_dir_entry_delete(dir_ni, ni, fn, fn_size);
5379	if (err2) {
5380		ntfs_error(vol->mp, "Failed to rollback index entry creation "
5381				"in error handling code path (error %d).  "
5382				"Leaving inconsistent metadata.  Run chkdsk.",
5383				err2);
5384		NVolSetErrors(vol);
5385	}
5386err:
5387	if (fn)
5388		OSFree(fn, fn_alloc, ntfs_malloc_tag);
5389	if (err != EEXIST)
5390		ntfs_error(vol->mp, "Failed (error %d).", err);
5391	else
5392		ntfs_debug("Failed (error EEXIST).");
5393	return err;
5394}
5395
5396/**
5397 * ntfs_vnop_link - create a hard link to an inode
5398 * @a:		arguments to link function
5399 *
5400 * @a contains:
5401 *	vnode_t a_vp;			vnode to create hard link to
5402 *	vnode_t a_tdvp;			destination directory for the hard link
5403 *	struct componentname *a_cnp;	name of the hard link to create
5404 *	vfs_context_t a_context;
5405 *
5406 * Create a hard link to the inode specified by the vnode @a->a_vp with name as
5407 * specified in @a->a_cnp in the directory specified by the vnode @a->a_tdvp.
5408 *
5409 * Return 0 on success and errno on error.
5410 *
5411 * Note we always create filenames in the POSIX namespace.
5412 */
5413static int ntfs_vnop_link(struct vnop_link_args *a)
5414{
5415	ntfs_inode *ni, *dir_ni;
5416	ntfs_volume *vol;
5417	struct componentname *cn;
5418	errno_t err;
5419
5420	ni = NTFS_I(a->a_vp);
5421	vol = ni->vol;
5422	dir_ni = NTFS_I(a->a_tdvp);
5423	if (!dir_ni || !ni) {
5424		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
5425		return EINVAL;
5426	}
5427	cn = a->a_cnp;
5428	ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in "
5429			"directory mft_no 0x%llx.",
5430			(unsigned long long)ni->mft_no, (int)cn->cn_namelen,
5431			cn->cn_nameptr, (unsigned long long)dir_ni->mft_no);
5432	/* Do not allow attribute/raw inodes to be linked to. */
5433	if (NInoAttr(ni)) {
5434		ntfs_debug("Mft_no 0x%llx is a%s inode, returning EPERM.",
5435				(unsigned long long)ni->mft_no,
5436				NInoRaw(ni) ? " raw" : "n attribute");
5437		return EPERM;
5438	}
5439	/* The target inode must be a directory. */
5440	if (!S_ISDIR(dir_ni->mode)) {
5441		ntfs_debug("Target mft_no 0x%llx is not a directory, "
5442				"returning ENOTDIR.",
5443				(unsigned long long)dir_ni->mft_no);
5444		return ENOTDIR;
5445	}
5446	/* Lock the target directory inode for writing. */
5447	lck_rw_lock_exclusive(&dir_ni->lock);
5448	/* The inode being linked to must not be a directory. */
5449	if (S_ISDIR(ni->mode)) {
5450		lck_rw_unlock_exclusive(&dir_ni->lock);
5451		ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot "
5452				"create hard link %.*s to it, returning "
5453				"EPERM.", (unsigned long long)ni->mft_no,
5454				(int)cn->cn_namelen, cn->cn_nameptr);
5455		return EPERM;
5456	}
5457	/* Lock the inode to link to for writing. */
5458	lck_rw_lock_exclusive(&ni->lock);
5459	/* Ensure the target directory has not been deleted. */
5460	if (!dir_ni->link_count) {
5461		ntfs_debug("Target directory mft_no 0x%llx has been deleted, "
5462				"returning ENOENT.",
5463				(unsigned long long)dir_ni->mft_no);
5464		/*
5465		 * If the directory is somehow still in the name cache remove
5466		 * it now.
5467		 */
5468		cache_purge(dir_ni->vn);
5469		err = ENOENT;
5470		goto err;
5471	}
5472	/*
5473	 * Ensure the inode has not been deleted.  Note we really should be
5474	 * checking that the source of the hard link has not been unlinked yet
5475	 * but we do not know what the source name was as the caller does not
5476	 * provide it to us and we do not know which name we were called for
5477	 * from just looking at the source vnode/inode.
5478	 */
5479	if (!ni->link_count) {
5480		ntfs_debug("Inode %.*s, mft_no 0x%llx has been deleted, "
5481				"returning ENOENT.", (int)cn->cn_namelen,
5482				cn->cn_nameptr, (unsigned long long)ni->mft_no);
5483		/*
5484		 * If the target is somehow still in the name cache remove it
5485		 * now.
5486		 */
5487		cache_purge(ni->vn);
5488		err = ENOENT;
5489		goto err;
5490	}
5491	/*
5492	 * The inode being linked to must not be a directory or device special
5493	 * file.  TODO: Extend the checks when we support device special files.
5494	 */
5495	if (S_ISDIR(ni->mode)) {
5496		ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot "
5497				"create hard link %.*s to it, returning "
5498				"EPERM.", (unsigned long long)ni->mft_no,
5499				(int)cn->cn_namelen, cn->cn_nameptr);
5500		err = EPERM;
5501		goto err;
5502	}
5503	/*
5504	 * Do not allow any of the system files to be linked to.
5505	 *
5506	 * For NTFS 3.0+ volumes do not allow any of the extended system files
5507	 * to be linked to, either.
5508	 *
5509	 * Note we specifically blacklist all system files that we make use of.
5510	 *
5511	 * TODO: What about all the new metadata files introduced with Windows
5512	 * Vista?  We are currently ignoring them and allowing them to be
5513	 * linked to...
5514	 */
5515	if (ni->file_attributes & FILE_ATTR_SYSTEM) {
5516		BOOL is_system = FALSE;
5517		if (vol->major_ver <= 1) {
5518			if (ni->mft_no < FILE_Extend)
5519				is_system = TRUE;
5520		} else {
5521			if (ni->mft_no <= FILE_Extend)
5522				is_system = TRUE;
5523			if (ni == vol->objid_ni || ni == vol->quota_ni ||
5524					ni == vol->usnjrnl_ni)
5525				is_system = TRUE;
5526		}
5527		if (is_system) {
5528			ntfs_debug("Mft_no 0x%llx is a%s system file, "
5529					"returning EPERM.",
5530					(unsigned long long)ni->mft_no,
5531					(ni->mft_no > FILE_Extend) ?
5532					"n extended" : "");
5533			err = EPERM;
5534			goto err;
5535		}
5536	}
5537	/*
5538	 * Ensure the inode to link to is not read-only (we already checked
5539	 * that @ni is not a directory).
5540	 */
5541	if (ni->file_attributes & FILE_ATTR_READONLY) {
5542		ntfs_debug("Mft_no 0x%llx is marked read-only, returning "
5543				"EPERM.", (unsigned long long)ni->mft_no);
5544		err = EPERM;
5545		goto err;
5546	}
5547	/*
5548	 * TODO: Test if Windows is happy with a reparse point having a hard
5549	 * link and if so remove this check and copy in the reparse point tag
5550	 * into the filename attribute below.  For mount point reparse points
5551	 * the reparse point is a directory so the link attempt would already
5552	 * have been aborted.
5553	 *
5554	 * TODO: Test if Windows is happy with an offline inode having a hard
5555	 * link and if so remove this check.
5556	 */
5557	if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
5558			FILE_ATTR_OFFLINE)) {
5559		ntfs_debug("Mft_no 0x%llx is %s.  Creating hard links to such "
5560				"inodes is not allowed, returning EPERM.",
5561				(unsigned long long)ni->mft_no,
5562				(ni->file_attributes &
5563				FILE_ATTR_REPARSE_POINT) ?
5564				"a reparse point" : "offline");
5565		err = EPERM;
5566		goto err;
5567	}
5568	/* Check if the maximum link count is already reached. */
5569	if (ni->link_count >= NTFS_MAX_HARD_LINKS) {
5570		ntfs_debug("Cannot create hard link to mft_no 0x%llx because "
5571				"it already has too many hard links.",
5572				(unsigned long long)ni->mft_no);
5573		err = EMLINK;
5574		goto err;
5575	}
5576	/* Go ahead and create the hard link. */
5577	err = ntfs_link_internal(ni, dir_ni, cn, FALSE, NULL, 0);
5578	if (err) {
5579		if (err != EEXIST)
5580			ntfs_error(vol->mp, "Failed to create hard link to "
5581					"mft_no 0x%llx, named %.*s, in "
5582					"directory mft_no 0x%llx (error %d).",
5583					(unsigned long long)ni->mft_no,
5584					(int)cn->cn_namelen, cn->cn_nameptr,
5585					(unsigned long long)dir_ni->mft_no,
5586					err);
5587		else
5588			ntfs_debug("Failed to create hard link to mft_no "
5589					"0x%llx, named %.*s, in directory "
5590					"mft_no 0x%llx (error EEXIST).",
5591					(unsigned long long)ni->mft_no,
5592					(int)cn->cn_namelen, cn->cn_nameptr,
5593					(unsigned long long)dir_ni->mft_no);
5594	} else
5595		ntfs_debug("Done.");
5596err:
5597	/* We are done, unlock the inode and the target directory. */
5598	lck_rw_unlock_exclusive(&ni->lock);
5599	lck_rw_unlock_exclusive(&dir_ni->lock);
5600	return err;
5601}
5602
5603/**
5604 * ntfs_vnop_rename - rename an inode (file/directory/symbolic link/etc)
5605 * @a:		arguments to rename function
5606 *
5607 * @a contains:
5608 *	vnode_t a_fdvp;			directory containing source inode
5609 *	vnode_t a_fvp;			source inode to be renamed
5610 *	struct componentname *a_fcnp;	name of the inode to rename
5611 *	vnode_t a_tdvp;			target directory to move the source to
5612 *	vnode_t a_tvp;			target inode to be deleted
5613 *	struct componentname *a_tcnp;	name of the inode to delete
5614 *	vfs_context_t a_context;
5615 *
 * Rename the inode @a->a_fvp with name as specified in @a->a_fcnp located in
 * the directory @a->a_fdvp to the new name specified in @a->a_tcnp placing it
 * in the target directory @a->a_tdvp.
5619 *
5620 * If @a->a_tvp is not NULL it means that the rename target already exists
5621 * which means we have to delete the rename target before we can perform the
5622 * rename.  In this case @a->a_tvp is the existing target inode and its name is
5623 * the rename target name specified in @a->a_tcnp and it is located in the
5624 * target directory @a->a_tdvp.
5625 *
5626 * Return 0 on success and errno on error.
5627 *
5628 * Note we always create the target name @a->a_tcnp in the POSIX namespace.
5629 *
5630 * Rename is a complicated operation because there are several special cases
5631 * that need consideration:
5632 *
5633 * First of all unchecked renaming can create directory loops which are not
5634 * attached to the file system root, e.g. take the directory tree /a/b/c and
5635 * perform a rename of /a/b to /a/b/c/ which if allowed to proceed would create
5636 * /a and b/c/b where the latter is a loop in that b points back to c which
5637 * points back to b.  Also this loop no longer is attached to the file system
5638 * directory tree and there is no way to access it any more as there is no link
5639 * from /a to b or c any more.  Thus we have to check for this case and return
5640 * EINVAL error instead of doing the rename.  Also a concurrent rename could
5641 * reshape the tree after our check so that our case would result in a loop
5642 * after all thus all tree reshaping renames must be done under a rename lock.
5643 * Note the VFS already holds the mnt_renamelock mutex for some renames but it
5644 * does not hold it in all cases we need it to be held so we still need our own
5645 * NTFS rename lock.
5646 *
5647 * Further VNOP_RENAME() must observe the following rules:
5648 *
5649 * - Source and destination must either both be directories, or both not be
5650 *   directories.  If this is not the case return ENOTDIR if the target is not
5651 *   a directory and EISDIR if the target is a directory.
5652 *
5653 * - If the target is a directory, it must be empty.  Return ENOTEMPTY if not.
5654 *
5655 * - It is not allowed to rename "/", ".", or "..".  Return EINVAL if this is
5656 *   attempted.
5657 *
5658 * - If the source inode and the target inode are the same and the mount is
5659 *   case sensitive or the parent directories are also the same and the names
5660 *   are the same do not do anything at all and return success, i.e. 0.  Note
5661 *   this is a violation of POSIX but it is needed to allow renaming of files
5662 *   from one case to another, i.e. when a mount is not case sensitive but case
5663 *   preserving (this is the default for NTFS) and the source and target inodes
5664 *   and their parent directories match but the names do not match we want to
5665 *   perform the rename rather than just return success.  If we still find that
5666 *   the target exists as a hard link rather than this being a case changing
5667 *   rename we still need to abort and return success to comply with POSIX.
5668 *
5669 *   FIXME: There is a bug in the VFS in that it never calls VNOP_RENAME() at
5670 *   all when it is called with source and target strings being the same.  This
5671 *   is wrong when the string matches the name but does not have the same case,
5672 *   i.e. the rename would normally succeed switching the case to the new case.
5673 *   The VFS is currently forbidding this to happen.  <rdar://problem/5485782>
5674 */
5675static int ntfs_vnop_rename(struct vnop_rename_args *a)
5676{
5677	MFT_REF src_mref, dst_mref;
5678	ntfs_inode *src_dir_ni, *src_ni, *dst_dir_ni, *dst_ni;
5679	struct componentname *src_cn, *dst_cn;
5680	ntfs_volume *vol;
5681	ntfschar *ntfs_name_buf, *orig_ntfs_name, *dst_ntfs_name;
5682	ntfschar *src_ntfs_name, *target_ntfs_name;
5683	ntfs_dir_lookup_name *src_name, *dst_name;
5684	size_t orig_ntfs_name_size, dst_ntfs_name_size;
5685	signed orig_ntfs_name_len, dst_ntfs_name_len, src_ntfs_name_len;
5686	signed target_ntfs_name_len;
5687	errno_t err, err2;
5688	FILENAME_TYPE_FLAGS src_ntfs_name_type, target_ntfs_name_type;
5689	BOOL have_unlinked = FALSE;
5690
5691	dst_name = src_name = NULL;
5692	src_dir_ni = NTFS_I(a->a_fdvp);
5693	src_ni = NTFS_I(a->a_fvp);
5694	src_cn = a->a_fcnp;
5695	dst_dir_ni = NTFS_I(a->a_tdvp);
5696	if (!src_dir_ni || !src_ni || !dst_dir_ni) {
5697		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
5698		return EINVAL;
5699	}
5700	vol = src_dir_ni->vol;
5701	dst_cn = a->a_tcnp;
5702	if (a->a_tvp) {
5703		dst_ni = NTFS_I(a->a_tvp);
5704		if (!dst_ni) {
5705			ntfs_debug("Entered with NULL ntfs_inode, aborting.");
5706			return EINVAL;
5707		}
5708		ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, "
5709				"parent directory mft_no 0x%llx and "
5710				"destination mft_no 0x%llx, name %.*s, parent "
5711				"directory mft_no 0x%llx.",
5712				(unsigned long long)src_ni->mft_no,
5713				(int)src_cn->cn_namelen, src_cn->cn_nameptr,
5714				(unsigned long long)src_dir_ni->mft_no,
5715				(unsigned long long)dst_ni->mft_no,
5716				(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
5717				(unsigned long long)dst_dir_ni->mft_no);
5718		if (src_ni == dst_ni && NVolCaseSensitive(vol)) {
5719			ntfs_debug("Source and destination inodes are the "
5720					"same and the volume is case "
5721					"sensitive.  Returning success "
5722					"without doing anything as required "
5723					"by POSIX.");
5724			return 0;
5725		}
5726	} else {
5727		dst_ni = NULL;
5728		ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, "
5729				"parent directory mft_no 0x%llx and no "
5730				"destination mft_no, destination name %.*s, "
5731				"parent directory mft_no 0x%llx.",
5732				(unsigned long long)src_ni->mft_no,
5733				(int)src_cn->cn_namelen, src_cn->cn_nameptr,
5734				(unsigned long long)src_dir_ni->mft_no,
5735				(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
5736				(unsigned long long)dst_dir_ni->mft_no);
5737	}
5738	/*
5739	 * The source and target parent inodes must be directories which
5740	 * implies they are base inodes.
5741	 */
5742	if (!S_ISDIR(src_dir_ni->mode) || !S_ISDIR(dst_dir_ni->mode)) {
5743		ntfs_debug("%s parent inode 0x%llx is not a directory, "
5744				"returning ENOTDIR.",
5745				!S_ISDIR(src_dir_ni->mode) ?
5746				"Source" : "Destination", (unsigned long long)
5747				(!S_ISDIR(src_dir_ni->mode) ?
5748				src_dir_ni->mft_no : dst_dir_ni->mft_no));
5749		return ENOTDIR;
5750	}
5751	/*
5752	 * All inodes must be locked in parent -> child order so we need to
5753	 * check whether the source and target parent inodes have a
5754	 * parent/child relationship with each other.
5755	 *
5756	 * If both are the same we have the easiest case and we just lock the
5757	 * single directory inode.
5758	 *
5759	 * If the two are not the same we need to exclude all other tree
5760	 * reshaping renames from happening as they could change the
5761	 * relationship between the parent directory inodes under our feet.  To
5762	 * do this we use a per ntfs volume lock so we can then go on to
5763	 * determine their parent/child relationship.
5764	 *
5765	 * Once we have established if there is a parent/child relationship we
5766	 * lock the parent followed by the child and if the two are completely
5767	 * unrelated the order of locking does not matter so we just lock the
5768	 * destination followed by the source.
5769	 *
5770	 * Note that we take this opportunity of walking the directory tree up
5771	 * to the root starting from @dst_dir_ni to also check whether @src_ni
5772	 * is either equal to or a parent of @dst_dir_ni in which case a
5773	 * directory loop would be caused by the rename so we have to abort it
5774	 * with EINVAL error.
5775	 */
5776	if (src_dir_ni == dst_dir_ni)
5777		lck_rw_lock_exclusive(&src_dir_ni->lock);
5778	else {
5779		BOOL is_parent;
5780
5781		lck_mtx_lock(&vol->rename_lock);
5782		err = ntfs_inode_is_parent(src_dir_ni, dst_dir_ni, &is_parent,
5783				src_ni);
5784		if (err) {
5785			lck_mtx_unlock(&vol->rename_lock);
5786			/*
5787			 * @err == EINVAL means @src_ni matches or is a parent
5788			 * of @dst_dir_ni.  This would create a directory
5789			 * loop so abort the rename but do not emit an error
5790			 * message as there is no error as such.
5791			 */
5792			if (err != EINVAL)
5793				ntfs_error(vol->mp, "Failed to determine "
5794						"whether source directory "
5795						"mft_no 0x%llx is a parent of "
5796						"destination directory mft_no "
5797						"0x%llx (error %d).",
5798						(unsigned long long)
5799						src_dir_ni->mft_no,
5800						(unsigned long long)
5801						dst_dir_ni->mft_no, err);
5802			return err;
5803		}
5804		/*
5805		 * If @src_dir_ni is a parent of @dst_dir_ni, lock @src_dir_ni
5806		 * followed by @dst_dir_ni.
5807		 *
5808		 * Otherwise either @dst_dir_ni is a parent of @src_dir_ni, in
5809		 * which case we have to lock @dst_dir_ni followed by
5810		 * @src_dir_ni, or they are unrelated in which case lock
5811		 * ordering does not matter thus we do not need to distinguish
5812		 * those two cases and can simply lock @dst_dir_ni followed by
5813		 * @src_dir_ni.
5814		 */
5815		if (is_parent) {
5816			lck_rw_lock_exclusive(&src_dir_ni->lock);
5817			lck_rw_lock_exclusive(&dst_dir_ni->lock);
5818		} else {
5819			lck_rw_lock_exclusive(&dst_dir_ni->lock);
5820			lck_rw_lock_exclusive(&src_dir_ni->lock);
5821		}
5822	}
5823	/*
5824	 * The source cannot be the source directory and the destination cannot
5825	 * be the destination directory.  Also as we are about to lock the
5826	 * target ensure it does not equal the source directory either.  We
5827	 * have already checked for the source being equal to the target
5828	 * directory above so no need to check again.
5829	 */
5830	if (dst_ni && dst_ni == src_dir_ni) {
5831		ntfs_debug("The source parent directory equals the target, "
5832				"returning ENOTEMPTY.");
5833		err = ENOTEMPTY;
5834		/* Set @dst_ni to NULL so we do not try to unlock it. */
5835		dst_ni = NULL;
5836		goto err;
5837	}
5838	if (src_ni == src_dir_ni || (dst_ni && dst_ni == dst_dir_ni)) {
5839		ntfs_debug("The source and/or the target is/are equal to "
5840				"their parent directories, returning EINVAL.");
5841		err = EINVAL;
5842		/* Set @dst_ni to NULL so we do not try to unlock it. */
5843		dst_ni = NULL;
5844		goto err;
5845	}
5846	/*
5847	 * If the destination inode exists lock it so it can be unlinked
5848	 * safely.  For example if it is a directory we need to ensure that it
5849	 * is empty and that no-one creates an entry in it whilst the delete is
5850	 * in progress which requires us to hold an exclusive lock on it.
5851	 */
5852	if (dst_ni)
5853		lck_rw_lock_exclusive(&dst_ni->lock);
5854	/*
5855	 * Because we have locked the parent inode of the source inode there is
5856	 * no need to lock the source inode itself.  We are not going to unlink
5857	 * it completely, just move it from one location/name to another name
5858	 * and/or place in the directory tree and the mft record will be mapped
5859	 * and thus locked for exclusive access whenever we modify the inode
5860	 * which will serialize any potential concurrent operations on the
5861	 * inode.  The only concurrent operation to watch out for is when the
5862	 * source inode is a directory and someone calls VNOP_REMOVE() or
5863	 * VNOP_RMDIR() on any of its child inodes.  This can end up in the
5864	 * situation where the index root node is locked in
5865	 * ntfs_index_entry_delete() and hence the mft record is mapped whilst
5866	 * the free space in the mft record is evaluated but then before this
5867	 * information is used the mft record is unmapped and then mapped again
5868	 * as part of a call to ntfs_index_entry_lock_two() and if our
5869	 * VNOP_RENAME() manages to map the mft record whilst it is temporarily
5870	 * unmapped during the ntfs_index_entry_lock_two() we can cause the
5871	 * free space in the mft record to decrease and thus the
5872	 * ntfs_index_entry_delete() may then encounter an out of space
5873	 * condition when it thought it had determined the amount of free space
5874	 * already and thus assume something has gone wrong and panic().  We
5875	 * overcome this problem inside ntfs_index_entry_delete() by rechecking
5876	 * the free space after reacquiring the lock and dealing with it as
5877	 * appropriate.
5878	 *
5879	 * First, ensure the parent directories have not been deleted.
5880	 */
5881	if (!src_dir_ni->link_count || !dst_dir_ni->link_count) {
5882		ntfs_debug("One or both of the parent directories mft_no "
5883				"0x%llx and mft_no 0x%llx has/have been "
5884				"deleted, returning ENOENT.",
5885				(unsigned long long)src_dir_ni->mft_no,
5886				(unsigned long long)dst_dir_ni->mft_no);
5887		/*
5888		 * If the directory is somehow still in the name cache remove
5889		 * it now.
5890		 */
5891		if (!src_dir_ni->link_count)
5892			cache_purge(src_dir_ni->vn);
5893		if (!dst_dir_ni->link_count)
5894			cache_purge(dst_dir_ni->vn);
5895		err = ENOENT;
5896		goto err;
5897	}
5898	/* Rename is not allowed on attribute/raw inodes. */
5899	if (NInoAttr(src_ni) || (dst_ni && NInoAttr(dst_ni))) {
5900		ntfs_debug("Source and/or target inode is/are attribute/raw "
5901				"inodes, returning EPERM.");
5902		err = EPERM;
5903		goto err;
5904	}
5905	/* Ensure the source has not been deleted by someone else already. */
5906	if (!src_ni->link_count) {
5907		ntfs_debug("Source %.*s, mft_no 0x%llx has been deleted, "
5908				"returning ENOENT.", (int)src_cn->cn_namelen,
5909				src_cn->cn_nameptr,
5910				(unsigned long long)src_ni->mft_no);
5911		/*
5912		 * If the source is somehow still in the name cache remove it
5913		 * now.
5914		 */
5915		cache_purge(src_ni->vn);
5916		err = ENOENT;
5917		goto err;
5918	}
5919	/*
5920	 * Ensure the target has not been deleted by someone else already.  If
5921	 * it has been deleted pretend the caller did not specify a target.
5922	 * This is what HFS+ does, too.
5923	 */
5924	if (dst_ni && !dst_ni->link_count) {
5925		ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, "
5926				"pretending no target was specified.",
5927				(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
5928				(unsigned long long)dst_ni->mft_no);
5929		/*
5930		 * If the target is somehow still in the name cache remove it
5931		 * now.
5932		 */
5933		cache_purge(dst_ni->vn);
5934		lck_rw_unlock_exclusive(&dst_ni->lock);
5935		dst_ni = NULL;
5936	}
5937	/*
5938	 * If the destination exists need to ensure that it is a directory if
5939	 * the source is a directory or that it is not a directory if the
5940	 * source is not a directory.
5941	 *
5942	 * Also, need to ensure the target directory is empty.
5943	 *
5944	 * If the source and destination are the same none of these checks
5945	 * apply so skip them.
5946	 */
5947	if (dst_ni && src_ni != dst_ni) {
5948		if (S_ISDIR(src_ni->mode)) {
5949			if (!S_ISDIR(dst_ni->mode)) {
5950				ntfs_debug("Source is a directory but "
5951						"destination is not, "
5952						"returning ENOTDIR");
5953				err = ENOTDIR;
5954				goto err;
5955			}
5956			/* The target is a directory, but is it empty? */
5957			err = ntfs_dir_is_empty(dst_ni);
5958			if (err) {
5959				if (err == ENOTEMPTY)
5960					ntfs_debug("Target directory %.*s, "
5961							"mft_no 0x%llx is not "
5962							"empty, returning "
5963							"ENOTEMPTY.",
5964							(int)dst_cn->cn_namelen,
5965							dst_cn->cn_nameptr,
5966							(unsigned long long)
5967							dst_ni->mft_no);
5968				else {
5969					ntfs_error(vol->mp, "Failed to "
5970							"determine if target "
5971							"directory %.*s, "
5972							"mft_no 0x%llx is "
5973							"empty (error %d).",
5974							(int)dst_cn->cn_namelen,
5975							dst_cn->cn_nameptr,
5976							(unsigned long long)
5977							dst_ni->mft_no, err);
5978					err = EIO;
5979				}
5980				goto err;
5981			}
5982		} else /* if (!S_ISDIR(src_ni->mode)) */ {
5983			if (S_ISDIR(dst_ni->mode)) {
5984				ntfs_debug("Source is not a directory but "
5985						"destination is, returning "
5986						"EISDIR");
5987				err = EISDIR;
5988				goto err;
5989			}
5990		}
5991	}
5992	/* Ensure none of the inodes are read-only. */
5993	if ((!S_ISDIR(src_ni->mode) &&
5994			src_ni->file_attributes & FILE_ATTR_READONLY) ||
5995			(dst_ni && !S_ISDIR(dst_ni->mode) &&
5996			dst_ni->file_attributes & FILE_ATTR_READONLY)) {
5997		ntfs_debug("One of the inodes involved in the rename is "
5998				"read-only, returning EPERM.");
5999		err = EPERM;
6000		goto err;
6001	}
6002	/*
6003	 * Do not allow any of the system files to be renamed/deleted.
6004	 *
6005	 * For NTFS 3.0+ volumes do not allow any of the extended system files
6006	 * to be renamed/deleted, either.
6007	 *
6008	 * Note we specifically blacklist all system files that we make use of.
6009	 *
6010	 * TODO: What about all the new metadata files introduced with Windows
6011	 * Vista?  We are currently ignoring them and allowing them to be
6012	 * renamed/deleted...
6013	 */
6014	if (src_ni->file_attributes & FILE_ATTR_SYSTEM || (dst_ni &&
6015			dst_ni->file_attributes & FILE_ATTR_SYSTEM)) {
6016		BOOL is_system = FALSE;
6017		if (vol->major_ver <= 1) {
6018			if (src_ni->mft_no < FILE_Extend || (dst_ni &&
6019					dst_ni->mft_no < FILE_Extend))
6020				is_system = TRUE;
6021		} else {
6022			if (src_ni->mft_no <= FILE_Extend || (dst_ni &&
6023					dst_ni->mft_no <= FILE_Extend))
6024				is_system = TRUE;
6025			if (src_dir_ni == vol->extend_ni) {
6026				if (src_ni == vol->objid_ni ||
6027						src_ni == vol->quota_ni ||
6028						src_ni == vol->usnjrnl_ni)
6029					is_system = TRUE;
6030			}
6031			if (dst_dir_ni == vol->extend_ni) {
6032				if (dst_ni == vol->objid_ni ||
6033						dst_ni == vol->quota_ni ||
6034						dst_ni == vol->usnjrnl_ni)
6035					is_system = TRUE;
6036			}
6037		}
6038		if (is_system) {
6039			ntfs_debug("Source and/or target inode is a system "
6040					"file, returning EPERM.");
6041			err = EPERM;
6042			goto err;
6043		}
6044	}
6045	/*
6046	 * If the source/target inodes are reparse points or if they are
6047	 * offline we cannot rename/delete them yet.  TODO: Implement this.
6048	 */
6049	if (src_ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
6050			FILE_ATTR_OFFLINE) || (dst_ni &&
6051			dst_ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
6052			FILE_ATTR_OFFLINE))) {
6053		ntfs_error(vol->mp, "Source or target inode is a reparse "
6054				"point or offline, renaming such indoes is "
6055				"notsupported yet, returning ENOTSUP.");
6056		err = ENOTSUP;
6057		goto err;
6058	}
6059	/*
6060	 * To proceed further we need to convert both the source and target
6061	 * names from utf8 to Unicode.  This is a good time to do both as the
6062	 * conversion also checks for invalid names, too long names, etc.
6063	 *
6064	 * Note we allocate both source and target names with a single buffer
6065	 * so we only have to call once into the allocator.
6066	 */
6067	ntfs_name_buf = OSMalloc(NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag);
6068	if (!ntfs_name_buf) {
6069		ntfs_debug("Not enough memory to allocate name buffer.");
6070		err = ENOMEM;
6071		goto err;
6072	}
6073	orig_ntfs_name = ntfs_name_buf;
6074	dst_ntfs_name = (ntfschar*)((u8*)ntfs_name_buf + NTFS_MAX_NAME_LEN);
6075	dst_ntfs_name_size = orig_ntfs_name_size = NTFS_MAX_NAME_LEN;
6076	orig_ntfs_name_len = utf8_to_ntfs(vol, (u8*)src_cn->cn_nameptr,
6077			src_cn->cn_namelen, &orig_ntfs_name,
6078			&orig_ntfs_name_size);
6079	if (orig_ntfs_name_len < 0) {
6080		err = -orig_ntfs_name_len;
6081		if (err == ENAMETOOLONG)
6082			ntfs_debug("Failed (source name is too long).");
6083		else
6084			ntfs_error(vol->mp, "Failed to convert name to "
6085					"Unicode (error %d).", err);
6086		goto free_err;
6087	}
6088	dst_ntfs_name_len = utf8_to_ntfs(vol, (u8*)dst_cn->cn_nameptr,
6089			dst_cn->cn_namelen, &dst_ntfs_name,
6090			&dst_ntfs_name_size);
6091	if (dst_ntfs_name_len < 0) {
6092		err = -dst_ntfs_name_len;
6093		if (err == ENAMETOOLONG)
6094			ntfs_debug("Failed (target name is too long).");
6095		else
6096			ntfs_error(vol->mp, "Failed to convert target name to "
6097					"Unicode (error %d).", err);
6098		goto free_err;
6099	}
6100	/*
6101	 * We need to make sure the source still has the name specified in
6102	 * @src_cn.  It could have been unlinked or renamed before we took the
6103	 * lock on the parent directory.
6104	 *
6105	 * To do this, look up the converted source name in the source parent
6106	 * directory index.
6107	 */
6108	err = ntfs_lookup_inode_by_name(src_dir_ni, orig_ntfs_name,
6109			orig_ntfs_name_len, &src_mref, &src_name);
6110	if (err) {
6111		if (err != ENOENT) {
6112			ntfs_error(vol->mp, "Failed to find source name in "
6113					"directory (error %d).", err);
6114			goto free_err;
6115		}
6116src_enoent:
6117		/*
6118		 * The source name does not exist in the source parent
6119		 * directory.
6120		 *
6121		 * This means someone renamed or deleted the name from the
6122		 * directory before we managed to take the locks.
6123		 */
6124		ntfs_debug("Source has been renamed or deleted already, "
6125				"returning ENOENT.");
6126		/*
6127		 * If the source is somehow still in the name cache remove it
6128		 * now.
6129		 */
6130		cache_purge(src_ni->vn);
6131		err = ENOENT;
6132		goto free_err;
6133	}
6134	/*
6135	 * We found the source name in the directory index but does it still
6136	 * point to the same mft record?  The sequence number check ensures the
6137	 * inode was not deleted and recreated with the same name and the same
6138	 * mft record number.
6139	 */
6140	if (src_mref != MK_MREF(src_ni->mft_no, src_ni->seq_no))
6141		goto src_enoent;
6142	/*
6143	 * We now have verified everything to do with the source.  Set the
6144	 * source name to be the correctly cased name (unless it was correctly
6145	 * cased already in which case @src_name will be NULL and
	 * @orig_ntfs_name contains the correctly cased name).
6147	 */
6148	if (src_name) {
6149		src_ntfs_name = src_name->name;
6150		src_ntfs_name_len = src_name->len;
6151		src_ntfs_name_type = src_name->type;
6152	} else {
6153		src_ntfs_name = orig_ntfs_name;
6154		src_ntfs_name_len = orig_ntfs_name_len;
6155		src_ntfs_name_type = 0;
6156	}
6157	/*
6158	 * Now we need to verify the target.  In an ideal world, either it has
6159	 * to be specified in @dst_ni in which case it also has to exist in the
6160	 * destination parent directory @dst_dir_ni, or @dst_ni has to be NULL
6161	 * in which case the target name must not exist in the destination
6162	 * parent directory.
6163	 *
6164	 * But because the VFS obtains the target before we take the necessary
6165	 * locks it is possible for the above ideal not to be true.  There are
6166	 * several possible cases:
6167	 *
6168	 * - Target was specified but deleted.  We have detected this case
6169	 *   above and have set @dst_ni to NULL thus we do not need to worry
6170	 *   about this case any more.
6171	 * - Target was not specified but another inode was created with the
6172	 *   same name.  In this case we return EEXIST which is what HFS+ does,
6173	 *   too.
6174	 * - Target was specified but renamed.  This means we may or may not
6175	 *   find a directory entry of the same name.  If we do not find a
6176	 *   matching directory entry we know the target has been renamed thus
6177	 *   we can simply set @dst_ni to NULL and pretend it does not exist.
6178	 *   If we do find a directory entry that matches in name but does not
6179	 *   point to the same mft reference we know the target was renamed and
6180	 *   another inode was created with the same name.  In this case we
6181	 *   return EEXIST which is what HFS+ does, too.
6182	 */
6183	err = ntfs_lookup_inode_by_name(dst_dir_ni, dst_ntfs_name,
6184			dst_ntfs_name_len, &dst_mref, &dst_name);
6185	if (err) {
6186		if (err != ENOENT) {
6187			ntfs_error(vol->mp, "Failed to find target name in "
6188					"directory (error %d).", err);
6189			goto free_err;
6190		}
6191		/*
6192		 * The destination name does not exist in the destination
6193		 * parent directory which means that the target must have been
6194		 * renamed to something else before we took the locks.  We
		 * treat this the same as if it had been deleted, i.e. we pretend
6196		 * the caller did not specify a target.
6197		 */
6198		if (dst_ni) {
6199			ntfs_debug("Target %.*s, mft_no 0x%llx has been "
6200					"renamed, pretending no target was "
6201					"specified.", (int)dst_cn->cn_namelen,
6202					dst_cn->cn_nameptr,
6203					(unsigned long long)dst_ni->mft_no);
6204			lck_rw_unlock_exclusive(&dst_ni->lock);
6205			dst_ni = NULL;
6206		}
6207	} else /* if (!err) */ {
6208		/*
6209		 * The destination name exists in the directory index.
6210		 *
6211		 * If the caller did not specify it in @dst_ni or the
6212		 * destination inode has been deleted (in which case we set
6213		 * @dst_ni to NULL above) or the target was renamed and another
6214		 * inode was created with the same name return error EEXIST
6215		 * which is what HFS+ does, too.
6216		 *
6217		 * FIXME: Technically it would probably be more correct to get
6218		 * the new target ntfs inode and restart the function but at
6219		 * least for now stick with the same behaviour as HFS+.
6220		 */
6221		if (!dst_ni || dst_mref != MK_MREF(dst_ni->mft_no,
6222				dst_ni->seq_no)) {
6223			ntfs_debug("Target name %.*s exists but %s, returning "
6224					"EEXIST.", (int)dst_cn->cn_namelen,
6225					dst_cn->cn_nameptr, !dst_ni ?
6226					"target inode was not specified or it "
6227					"was already deleted" :
6228					"does not match specified target "
6229					"inode (it must have been renamed and "
6230					"a new inode created with the same "
6231					"name)");
6232			err = EEXIST;
6233			goto free_err;
6234		}
6235		/*
6236		 * We still need the destination name thus use a new variable
6237		 * to store the correctly cased target name.
6238		 */
6239		if (!dst_name) {
6240			target_ntfs_name = dst_ntfs_name;
6241			target_ntfs_name_len = dst_ntfs_name_len;
6242			target_ntfs_name_type = 0;
6243		} else {
6244			target_ntfs_name = dst_name->name;
6245			target_ntfs_name_len = dst_name->len;
6246			target_ntfs_name_type = dst_name->type;
6247		}
6248		/*
6249		 * We have verified everything to do with the target.  We now
6250		 * need to unlink it unless the source and the target are the
6251		 * same, i.e. we are changing the case of an existing filename.
6252		 * We need to distinguish two cases.  If the volume is mounted
6253		 * case sensitive or it is not case sensitive and the source
6254		 * and destination names do not match (i.e. they are different
6255		 * hard links to the same inode) we do not proceed and return
6256		 * success (this is required by POSIX).  Otherwise the volume
6257		 * is not case sensitive and the source and destination names
6258		 * match (i.e. they are the same hard link) and we can either
6259		 * return success when the source and destination names are
6260		 * identical (same case) or we can proceed with the rename when
6261		 * the case differs.
6262		 *
6263		 * Note we have caught the case of the inodes being equal and
6264		 * the volume being mounted case sensitive earlier on so we now
6265		 * know that the volume is not mounted case sensitive.
6266		 */
6267		if (src_ni == dst_ni) {
6268			/*
6269			 * If the two names are not the same hardlink return
6270			 * success not doing anything as required by POSIX.
6271			 *
6272			 * Note we do not need to care about case when
6273			 * comparing because we are comparing the correctly
6274			 * cased names.
6275			 */
6276			if (src_ntfs_name_len != target_ntfs_name_len ||
6277					bcmp(src_ntfs_name, target_ntfs_name,
6278					src_ntfs_name_len * sizeof(ntfschar))) {
6279				ntfs_debug("Source and target inodes are the "
6280						"same but the source and "
6281						"target names are different "
6282						"hard links.  Returning "
6283						"success without doing "
6284						"anything as required by "
6285						"POSIX.");
6286				goto done;
6287			}
6288			/*
6289			 * The names are the same hard link.  If the existing
6290			 * name is the same as the destination name (i.e. the
6291			 * target name before case correction) there is
6292			 * nothing to do and we can return success.
6293			 */
6294			if (src_ntfs_name_len == dst_ntfs_name_len &&
6295					!bcmp(src_ntfs_name, dst_ntfs_name,
6296					src_ntfs_name_len * sizeof(ntfschar))) {
6297				ntfs_debug("Source and destination are "
6298						"identical so no need to do "
6299						"anything.  Returning "
6300						"success.");
6301				goto done;
6302			}
6303			/*
6304			 * The names are the same hard link but they differ in
6305			 * case thus there is no target to be removed as it
6306			 * will be removed as part of the actual rename when
6307			 * the source name is removed.
6308			 */
6309		} else /* if (dst_ni && src_ni != dst_ni) */ {
6310			/*
6311			 * The source and the target are not the same thus now
6312			 * unlink the target.  We can do this atomically before
6313			 * adding the new entry because both the parent
6314			 * directory inode and the target inode are locked for
6315			 * writing thus no-one can access either until we have
			 * finished.  FIXME: The only pitfall is what happens if
6317			 * the rename fails after we have removed the target?
6318			 * We just ignore this problem for now and let the
6319			 * target disappear.  This is what HFS does also so at
6320			 * least we are not the only non-POSIX conformant file
6321			 * system on OS X...  In fact as long as we return EIO
6322			 * on error once we have unlinked the target POSIX
6323			 * still considers this ok.  (This is what HFS does,
6324			 * too.)
6325			 *
6326			 * Note we do not set @is_rename to true here as this
6327			 * is just a normal unlink operation.
6328			 */
6329			err = ntfs_unlink_internal(dst_dir_ni, dst_ni,
6330					target_ntfs_name, target_ntfs_name_len,
6331					target_ntfs_name_type, FALSE);
6332			if (err) {
6333				ntfs_error(vol->mp, "Rename failed because "
6334						"the target mft_no 0x%llx "
6335						"could not be removed from "
6336						"directory mft_no 0x%llx "
6337						"(error %d).",
6338						(unsigned long long)
6339						dst_ni->mft_no,
6340						(unsigned long long)
6341						dst_dir_ni->mft_no, err);
6342				goto free_err;
6343			}
6344			/*
6345			 * Set @have_unlinked to true so that we know that we
6346			 * have to return error EIO from now on if we fail to
6347			 * complete the rename.
6348			 */
6349			have_unlinked = TRUE;
6350		}
6351		/*
6352		 * Release the lock on the destination inode and set it to NULL
6353		 * so we assume it does not exist from now on.
6354		 */
6355		lck_rw_unlock_exclusive(&dst_ni->lock);
6356		dst_ni = NULL;
6357	}
6358	/*
6359	 * We dealt with the target if there was one thus now we can begin the
6360	 * actual rename.
6361	 *
6362	 * To start with we lock the source inode for writing which allows us
6363	 * to split the removal of the source name and the addition of the
6364	 * destination name into two events.
6365	 *
6366	 * Note we cheat a little and set @dst_ni to @src_ni so that @src_ni is
6367	 * unlocked at the end of the function/on error.
6368	 */
6369	if (dst_ni)
6370		panic("%s(): dst_ni\n", __FUNCTION__);
6371	dst_ni = src_ni;
6372	lck_rw_lock_exclusive(&src_ni->lock);
6373	/*
6374	 * As the source inode is now locked for writing we can perform the
6375	 * rename in two stages.  First we remove the source name and then we
6376	 * add the destination name both to the mft record of the inode and to
6377	 * the parent directory indexes.  We can do this atomically because
6378	 * both the parent directory and the source inode are locked for
6379	 * writing thus no-one can access either until we are finished.
6380	 *
6381	 * As removal of the source name can leave the source inode with a zero
6382	 * link count we artificially increment the link count here to ensure
6383	 * it cannot reach zero.  This is required to guarantee that the unlink
6384	 * of the source name will remove the filename attribute and to ensure
6385	 * that the object id is not deleted.  Finally, this also ensures
6386	 * no-one can ever see the inode in a deleted state (although this
6387	 * should never happen anyway as we have the inode locked for writing).
6388	 *
6389	 * Note the link count in the ntfs inode is unsigned int type, i.e. at
6390	 * least 32-bit, to allow us to overflow 16-bits here if needed.  In
6391	 * this way we do not need to worry about the link count overflowing
6392	 * here which makes the code simpler.
6393	 *
6394	 * We set @is_rename to true as we have elevated the link count by one.
6395	 */
6396	src_ni->link_count++;
6397	err = ntfs_unlink_internal(src_dir_ni, src_ni, src_ntfs_name,
6398			src_ntfs_name_len, src_ntfs_name_type, TRUE);
6399	if (err) {
6400		ntfs_error(vol->mp, "Rename failed because the source name, "
6401				"%.*s mft_no 0x%llx could not be removed from "
6402				"directory mft_no 0x%llx (error %d).",
6403				(int)src_cn->cn_namelen, src_cn->cn_nameptr,
6404				(unsigned long long)src_ni->mft_no,
6405				(unsigned long long)src_dir_ni->mft_no, err);
6406		goto dec_err;
6407	}
6408	/*
6409	 * The source name is now removed both from the source parent directory
6410	 * index and from the mft record of the source inode.
6411	 *
6412	 * Now add the destination name as a hard link to the mft record of the
6413	 * source inode and to the destination parent directory index.
6414	 *
6415	 * Calling ntfs_link_internal() also sets the "needs to be archived"
6416	 * bit on the ntfs inode unless we are renaming an unencrypted
6417	 * directory inode so we do not need to worry about setting it
6418	 * ourselves.
6419	 */
6420	err = ntfs_link_internal(src_ni, dst_dir_ni, dst_cn, TRUE,
6421			dst_ntfs_name, dst_ntfs_name_len);
6422	if (err)
6423		goto link_err;
6424	/* We are done, decrement the link count back to its correct value. */
6425	src_ni->link_count--;
6426done:
6427	if (src_name)
6428		OSFree(src_name, sizeof(*src_name), ntfs_malloc_tag);
6429	if (dst_name)
6430		OSFree(dst_name, sizeof(*dst_name), ntfs_malloc_tag);
6431	OSFree(ntfs_name_buf, NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag);
6432err:
6433	/* If the destination inode existed we locked it so unlock it now. */
6434	if (dst_ni)
6435		lck_rw_unlock_exclusive(&dst_ni->lock);
6436	/* Drop the source and destination parent directory inode locks. */
6437	lck_rw_unlock_exclusive(&src_dir_ni->lock);
6438	if (src_dir_ni != dst_dir_ni) {
6439		lck_rw_unlock_exclusive(&dst_dir_ni->lock);
6440		lck_mtx_unlock(&vol->rename_lock);
6441	}
6442	ntfs_debug("Done (error %d).", (int)err);
6443	return err;
6444link_err:
6445	ntfs_error(vol->mp, "Rename failed because the destination name %.*s, "
6446			"mft_ni 0x%llx could not be added to directory mft_no "
6447			"0x%llx (error %d).", (int)dst_cn->cn_namelen,
6448			dst_cn->cn_nameptr, (unsigned long long)src_ni->mft_no,
6449			(unsigned long long)dst_dir_ni->mft_no, err);
6450	/*
6451	 * Try to roll back the unlink of the source by creating a new hard
6452	 * link with the old name.
6453	 */
6454	err2 = ntfs_link_internal(src_ni, src_dir_ni, src_cn, TRUE,
6455			orig_ntfs_name, orig_ntfs_name_len);
6456	if (err2) {
6457		ntfs_error(vol->mp, "Failed to roll back partially completed "
6458				"rename (error %d).  Leaving corrupt "
6459				"metadata and returning EIO.  Unmount and run "
6460				"chkdsk.", err2);
6461		NVolSetErrors(vol);
6462		err = EIO;
6463	} else
6464		ntfs_debug("Re-linking of source name succeeded.");
6465dec_err:
6466	src_ni->link_count--;
6467free_err:
6468	if (have_unlinked) {
6469		/* We unlinked an existing target, need to re-link it now. */
6470		ntfs_debug("Rename failed but the target was already unlinked "
6471				"and relinking it is not implemented (yet), "
6472				"returning EIO.  (Given you were renaming "
6473				"over it chances are you did not care about "
6474				"the target anyway.)");
6475		err = EIO;
6476	}
6477	goto done;
6478}
6479
6480/**
6481 * ntfs_vnop_mkdir - create a directory
6482 * @a:		arguments to mkdir function
6483 *
6484 * @a contains:
6485 *	vnode_t a_dvp;			directory in which to create the dir
6486 *	vnode_t *a_vpp;			destination pointer for the created dir
6487 *	struct componentname *a_cnp;	name of the directory to create
6488 *	struct vnode_attr *a_vap;	attributes to set on the created dir
6489 *	vfs_context_t a_context;
6490 *
6491 * Create a directory with name as specified in @a->a_cnp in the directory
6492 * specified by the vnode @a->a_dvp.  Assign the attributes @a->a_vap to the
6493 * created directory.  Finally return the vnode of the created directory in
6494 * *@a->a_vpp.
6495 *
6496 * Return 0 on success and errno on error.
6497 *
6498 * Note we always create directory names in the POSIX namespace.
6499 */
6500static int ntfs_vnop_mkdir(struct vnop_mkdir_args *a)
6501{
6502	errno_t err;
6503#ifdef DEBUG
6504	ntfs_inode *ni = NTFS_I(a->a_dvp);
6505
6506	if (ni)
6507		ntfs_debug("Creating a directory named %.*s in directory "
6508				"mft_no 0x%llx.", (int)a->a_cnp->cn_namelen,
6509				a->a_cnp->cn_nameptr,
6510				(unsigned long long)ni->mft_no);
6511#endif
6512	err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
6513	ntfs_debug("Done (error %d).", (int)err);
6514	return err;
6515}
6516
6517/**
6518 * ntfs_vnop_rmdir - remove an empty directory
6519 * @a:		arguments to rmdir function
6520 *
6521 * @a contains:
6522 *	vnode_t a_dvp;			parent directory remove from
6523 *	vnode_t a_vp;			directory to remove
6524 *	struct componentname *a_cnp;	name of the dircetory to remove
6525 *	vfs_context_t a_context;
6526 *
6527 * Make sure that the directory with vnode @a->a_vp and name as specified in
6528 * @a->a_cnp is empty and if so remove it from its parent directory with vnode
6529 * @a->a_dvp.
6530 *
6531 * Return 0 on success and errno on error.
6532 *
6533 * Note that if the name of the directory to be removed is in the WIN32 or DOS
6534 * namespaces, both the WIN32 and the corresponding DOS names are removed.
6535 *
6536 * Note that this function only removes the directory entry, i.e. it does not
6537 * remove the name, however it does decrement the hard link count to zero.
6538 * This is so that the directory can be undeleted and its original name
6539 * restored.  In any case, we do not actually delete the inode here as it may
6540 * still be open and UNIX semantics require an unlinked inode to be still
6541 * accessible through already opened file descriptors.  When the last file
6542 * descriptor is closed, we causes the inode to be deleted when the VFS
6543 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
6544 * ntfs_vnop_inactive().
6545 */
6546static int ntfs_vnop_rmdir(struct vnop_rmdir_args *a)
6547{
6548	ntfs_inode *dir_ni = NTFS_I(a->a_dvp);
6549	ntfs_inode *ni = NTFS_I(a->a_vp);
6550	errno_t err;
6551
6552	ntfs_debug("Entering.");
6553	if (!dir_ni || !ni) {
6554		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
6555		return EINVAL;
6556	}
6557	err = ntfs_unlink(dir_ni, ni, a->a_cnp, 0, TRUE);
6558	ntfs_debug("Done (error %d).", (int)err);
6559	return err;
6560}
6561
6562/**
6563 * ntfs_vnop_symlink - create a symbolic link
6564 * @a:		arguments to symlink function
6565 *
6566 * @a contains:
6567 *	vnode_t a_dvp;			directory to create the symlink in
6568 *	vnode_t *a_vpp;			destination pointer for the new symlink
6569 *	struct componentname *a_cnp;	name of the symlink to create
6570 *	struct vnode_attr *a_vap;	attributes to set on the new symlink
6571 *	char *a_target;			path to point the created symlink at
6572 *	vfs_context_t a_context;
6573 *
6574 * Create a symbolic link to the path string @a->a_target with name as
6575 * specified in @a->a_cnp in directory specified by the vnode @a->a_dvp.
6576 * Assign the attributes @a->a_vap to the created symlink.  Finally return the
6577 * vnode of the created symlink in *@a->a_vpp.
6578 *
6579 * We implement symbolic links the same way as SFM, i.e. a symbolic link is a
6580 * regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
6581 * containing the finder info with the type set to 'slnk' and the creator set
6582 * to 'rhap'.  This is basically how HFS+ stores symbolic links, too.
6583 *
6584 * Return 0 on success and errno on error.
6585 *
6586 * Note, since IEEE Std 1003.1-2001 does not require any association of file
6587 * times with symbolic links, there is no requirement that file times be
6588 * updated by symlink(). - This is what POSIX says about updating times in
6589 * symlink() thus we do not update any of the times except as an indirect
6590 * result of calling ntfs_write() on the symbolic link inode.
6591 */
6592static int ntfs_vnop_symlink(struct vnop_symlink_args *a)
6593{
6594	uio_t uio;
6595	ntfs_inode *dir_ni, *ni, *raw_ni;
6596	int err, err2;
6597	unsigned len;
6598
6599	dir_ni = NTFS_I(a->a_dvp);
6600	if (!dir_ni) {
6601		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
6602		return EINVAL;
6603	}
6604	ntfs_debug("Creating a symbolic link named %.*s in directory mft_no "
6605			"0x%llx and pointing it at path \"%s\".",
6606			(int)a->a_cnp->cn_namelen, a->a_cnp->cn_nameptr,
6607			(unsigned long long)dir_ni->mft_no, a->a_target);
6608	len = strlen(a->a_target);
6609	/* Zero length symbolic links are not allowed. */
6610	if (!len || len > MAXPATHLEN) {
6611		err = EINVAL;
6612		if (len)
6613			err = ENAMETOOLONG;
6614		ntfs_error(dir_ni->vol->mp, "Invalid symbolic link target "
6615				"length %d, returning %s.", len,
6616				len ? "ENAMETOOLONG" : "EINVAL");
6617		return err;
6618	}
6619retry:
6620	/* Create the symbolic link inode. */
6621	err = ntfs_create(dir_ni->vn, a->a_vpp, a->a_cnp, a->a_vap, TRUE);
6622	if (err) {
6623		if (err != EEXIST)
6624			ntfs_error(dir_ni->vol->mp, "Failed to create "
6625					"symbolic link named %.*s in "
6626					"directory mft_no 0x%llx and pointing "
6627					"to path \"%s\" (error %d).",
6628					(int)a->a_cnp->cn_namelen,
6629					a->a_cnp->cn_nameptr,
6630					(unsigned long long)dir_ni->mft_no,
6631					a->a_target, err);
6632		else
6633			ntfs_debug("Failed to create symbolic link named %.*s "
6634					"in directory mft_no 0x%llx and "
6635					"pointing to path \"%s\" (error "
6636					"EEXIST).", (int)a->a_cnp->cn_namelen,
6637					a->a_cnp->cn_nameptr,
6638					(unsigned long long)dir_ni->mft_no,
6639					a->a_target);
6640		return err;
6641	}
6642	/* Note the ntfs inode @ni is locked for writing. */
6643	ni = NTFS_I(*a->a_vpp);
6644	/* Make sure no-one deleted it under our feet. */
6645	if (NInoDeleted(ni)) {
6646		/* Remove the inode from the name cache. */
6647		cache_purge(ni->vn);
6648		/* Release the vnode and try the create again. */
6649		lck_rw_unlock_exclusive(&ni->lock);
6650		vnode_put(ni->vn);
6651		goto retry;
6652	}
6653	/*
6654	 * Create a uio and attach the target path to it so we can use
6655	 * ntfs_write() to do the work.
6656	 */
6657	uio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
6658	if (!uio) {
6659		err = ENOMEM;
6660		ntfs_error(dir_ni->vol->mp, "Failed to allocate UIO.");
6661		goto err;
6662	}
6663	err = uio_addiov(uio, (uintptr_t)a->a_target, len);
6664	if (err)
6665		panic("%s(): Failed to attach target path buffer to UIO "
6666				"(error %d).", __FUNCTION__, err);
6667	/*
6668	 * FIXME: At present the kernel does not allow VLNK vnodes to use the
6669	 * UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
6670	 * vnode to do the actual write of the symbolic link data.  Fortunately
6671	 * we already implemented this functionality for compressed files where
6672	 * we need to read the compressed data using a shadow vnode so we use
6673	 * the same implementation here, thus our shadow vnode is a raw inode.
6674	 */
6675	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni);
6676	if (err) {
6677		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
6678				err);
6679		goto err;
6680	}
6681	if (!NInoRaw(raw_ni))
6682		panic("%s(): Requested raw inode but got non-raw one.\n",
6683				__FUNCTION__);
6684	/*
6685	 * Write the symbolic link target to the created inode.  We pass in
6686	 * IO_UNIT as we want an atomic i/o operation.
6687	 *
6688	 * FIXME: ntfs_write() does not always honour the IO_UNIT flag so we
6689	 * still have to test for partial writes.
6690	 */
6691	err = ntfs_write(raw_ni, uio, IO_UNIT, TRUE);
6692	/*
6693	 * Update the sizes in the base inode.  Note there is no need to lock
6694	 * @raw_ni->size_lock as the values cannot change at present as we are
6695	 * holding the inode lock @raw_ni->lock for write.
6696	 */
6697	lck_spin_lock(&ni->size_lock);
6698	ni->initialized_size = raw_ni->initialized_size;
6699	ni->data_size = raw_ni->data_size;
6700	ni->allocated_size = raw_ni->allocated_size;
6701	ni->compressed_size = raw_ni->compressed_size;
6702	lck_spin_unlock(&ni->size_lock);
6703	if (NInoNonResident(raw_ni))
6704		NInoSetNonResident(ni);
6705	lck_rw_unlock_exclusive(&raw_ni->lock);
6706	vnode_put(raw_ni->vn);
6707	/* Check for write errors. */
6708	if (uio_resid(uio) && !err)
6709		err = EIO;
6710	/* We no longer need the uio. */
6711	uio_free(uio);
6712	if (!err) {
6713		lck_rw_unlock_exclusive(&ni->lock);
6714		ntfs_debug("Done.");
6715		return 0;
6716	}
6717	/* Write failed or was partial, unlink the created symbolic link. */
6718	ntfs_error(dir_ni->vol->mp, "Failed to write target path to symbolic "
6719			"link inode (error %d).", err);
6720err:
6721	lck_rw_unlock_exclusive(&ni->lock);
6722	err2 = ntfs_unlink(dir_ni, ni, a->a_cnp, 0, FALSE);
6723	if (err2) {
6724		ntfs_error(dir_ni->vol->mp, "Failed to unlink symbolic link "
6725				"inode in error code path (error %d).  Run "
6726				"chkdsk.", err2);
6727		NVolSetErrors(dir_ni->vol);
6728	}
6729	vnode_put(ni->vn);
6730	return err;
6731}
6732
6733/**
6734 * ntfs_vnop_readdir - read directory entries into a supplied buffer
6735 * @a:		arguments to readdir function
6736 *
6737 * @a contains:
6738 *	vnode_t a_vp;		directory vnode to read directory entries from
6739 *	uio_t a_uio;		destination in which to return the entries
6740 *	int a_flags;		flags describing the entries to return
6741 *	int *a_eofflag;		return end of file status (can be NULL)
6742 *	int *a_numdirent;	return number of entries returned (can be NULL)
6743 *	vfs_context_t a_context;
6744 *
6745 * See ntfs_dir.c::ntfs_readdir() for a description of the implemented
6746 * features.  In addition to those described features VNOP_READDIR() should
6747 * also implement the below features.
6748 *
6749 * @a->a_flags can have the following bits set:
6750 *	VNODE_READDIR_EXTENDED		use extended directory entries
6751 *	VNODE_READDIR_REQSEEKOFF	requires seek offset (cookies)
6752 *	VNODE_READDIR_SEEKOFF32		seek offset values should be 32-bit
6753 *
6754 * When VNODE_READDIR_EXTENDED is set, the format of the returned directory
6755 * entry structures changes to the direntry structure which is defined as:
6756 *
6757 *	u64 d_ino;			inode number of entry
6758 *	u64 d_seekoff;			seek offset (optional, used by servers)
6759 *	u16 d_reclen;			length of this record
6760 *	u16 d_namlen;			length of string in d_name
6761 *	u8 d_type;			inode type (one of DT_DIR, DT_REG, etc)
6762 *	char d_name[MAXPATHLEN];	null terminated filename
6763 *
6764 * If VNODE_READDIR_REQSEEKOFF is set, VNODE_READDIR_EXTENDED must also be set,
6765 * and it means that the seek offset (d_seekoff) in the direntry structure must
6766 * be set.  If VNODE_READDIR_REQSEEKOFF is not set, the seek offset can be set
6767 * to zero as the caller will ignore it.
6768 *
6769 * If VNODE_READDIR_SEEKOFF32 is set, both VNODE_READDIR_EXTENDED and
6770 * VNODE_READDIR_REQSEEKOFF must be set and it means that the seek offset must
6771 * be at most 32-bits, i.e. the most significant 32-bits of d_seekoff must be
6772 * zero.
6773 *
6774 * All the VNODE_READDIR_* flags are only ever set by the NFS server and given
6775 * we do not yet support NFS exporting of NTFS volumes we just abort if any of
6776 * them are set.
6777 *
6778 * If the directory is deleted-but-in-use, we do not synthesize entries for "."
6779 * and "..".
6780 *
6781 * Return 0 on success and the error code on error.
6782 */
6783static int ntfs_vnop_readdir(struct vnop_readdir_args *a)
6784{
6785	user_ssize_t start_count;
6786	ntfs_inode *dir_ni = NTFS_I(a->a_vp);
6787	errno_t err;
6788
6789	if (!dir_ni) {
6790		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
6791		return EINVAL;
6792	}
6793	ntfs_debug("Entering for directory inode 0x%llx.",
6794			(unsigned long long)dir_ni->mft_no);
6795	/*
6796	 * FIXME: Is this check necessary?  Can we ever get here for
6797	 * non-directories?  All current callers (except the NFS server) ensure
6798	 * that @dir_ni is a directory.  We do not currently support NFS
6799	 * exporting so this should indeed definitely never trigger but leave
6800	 * it here as a kind of debug assertion.
6801	 */
6802	if (!S_ISDIR(dir_ni->mode)) {
6803		ntfs_debug("Not a directory, returning ENOTDIR.");
6804		return ENOTDIR;
6805	}
6806	if (a->a_flags) {
6807		ntfs_error(dir_ni->vol->mp, "None of the VNODE_READDIR_* "
6808				"flags are supported yet, sorry.");
6809		return ENOTSUP;
6810	}
6811	lck_rw_lock_shared(&dir_ni->lock);
6812	/* Do not allow messing with the inode once it has been deleted. */
6813	if (NInoDeleted(dir_ni)) {
6814		/* Remove the inode from the name cache. */
6815		cache_purge(dir_ni->vn);
6816		lck_rw_unlock_shared(&dir_ni->lock);
6817		ntfs_debug("Directory is deleted.");
6818		return ENOENT;
6819	}
6820	start_count = uio_resid(a->a_uio);
6821	err = ntfs_readdir(dir_ni, a->a_uio, a->a_eofflag, a->a_numdirent);
6822	/*
6823	 * Update the last_access_time (atime) if something was read.
6824	 *
6825	 * Skip the update if atime updates are disabled via the noatime mount
6826	 * option or the volume is read only.
6827	 */
6828	if (uio_resid(a->a_uio) < start_count && !NVolReadOnly(dir_ni->vol) &&
6829			!(vfs_flags(dir_ni->vol->mp) & MNT_NOATIME)) {
6830		dir_ni->last_access_time = ntfs_utc_current_time();
6831		NInoSetDirtyTimes(dir_ni);
6832	}
6833	lck_rw_unlock_shared(&dir_ni->lock);
6834	ntfs_debug("Done (error %d).", (int)err);
6835	return err;
6836}
6837
6838/**
6839 * ntfs_vnop_readdirattr -
6840 *
6841 */
6842static int ntfs_vnop_readdirattr(struct vnop_readdirattr_args *a)
6843{
6844	errno_t err;
6845
6846	ntfs_debug("Entering.");
6847	(void)nop_readdirattr(a);
6848	// TODO:
6849	err = ENOTSUP;
6850	ntfs_debug("Done (error %d).", (int)err);
6851	return err;
6852}
6853
6854/**
6855 * ntfs_vnop_readlink - read the contents of a symbolic link
6856 * @a:		arguments to readlink function
6857 *
6858 * @a contains:
6859 *	vnode_t a_vp;		vnode of symbolic link whose data to read
6860 *	uio_t *a_uio;		destination in which to return the read data
6861 *	vfs_context_t a_context;
6862 *
6863 * Read the path stored in the symbolic link vnode @a->a_vp and return it in
6864 * the destination buffer pointed to by @a->a_uio.
6865 *
6866 * uio_resid(@a->a_uio) is the maximum number of bytes to read and
6867 * uio_offset(@a->a_uio) must be zero.
6868 *
6869 * We implement symbolic links the same way as SFM, i.e. a symbolic link is a
6870 * regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
6871 * containing the finder info with the type set to 'slnk' and the creator set
6872 * to 'rhap'.  This is basically how HFS+ stores symbolic links, too.
6873 *
6874 * Thus obtaining the symbolic link target is a simple matter of calling
6875 * ntfs_read() on the symbolic link inode.
6876 *
6877 * TODO: We may wish to add support for other symbolic link types found on NTFS
6878 * volumes such as the methods used by:
6879 *	- Windows Services for Unix (SFU) and the userspace ntfsmount driver,
6880 *	- SMB/Samba (when run on a file system without native symbolic links)
6881 *	- Cygwin
6882 *
6883 * It may also be worth supporting reparse point based symbolic links but those
6884 * are a lot trickier if at all possible as they contain information that
6885 * cannot be resolved without access to the Windows registry and potentially
6886 * without access to the Windows Domain/Active Directory.
6887 *
6888 * Return 0 on success and errno on error.
6889 *
6890 * Note, since IEEE Std 1003.1-2001 does not require any association of file
6891 * times with symbolic links, there is no requirement that file times be
6892 * updated by readlink().
6893 */
6894static int ntfs_vnop_readlink(struct vnop_readlink_args *a)
6895{
6896	s64 size;
6897	user_ssize_t start_count;
6898	ntfs_inode *ni, *raw_ni;
6899	uio_t uio = a->a_uio;
6900	errno_t err;
6901
6902	ni = NTFS_I(a->a_vp);
6903	if (!ni) {
6904		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
6905		return EINVAL;
6906	}
6907	ntfs_debug("Entering for mft_no 0x%llx.",
6908			(unsigned long long)ni->mft_no);
6909	/*
6910	 * Protect against changes in initialized_size and thus against
6911	 * truncation also and against deletion/rename.
6912	 */
6913	lck_rw_lock_shared(&ni->lock);
6914	/* Do not allow messing with the inode once it has been deleted. */
6915	if (!ni->link_count || NInoDeleted(ni)) {
6916		/* Remove the inode from the name cache. */
6917		cache_purge(ni->vn);
6918		err = ENOENT;
6919		goto err;
6920	}
6921	if (!S_ISLNK(ni->mode)) {
6922		ntfs_debug("Not a symbolic link, returning EINVAL.");
6923		err = EINVAL;
6924		goto err;
6925	}
6926	if (uio_offset(uio)) {
6927		ntfs_error(ni->vol->mp, "uio_offset(uio) is not zero, "
6928				"returning EINVAL.");
6929		err = EINVAL;
6930		goto err;
6931	}
6932	/*
6933	 * FIXME: At present the kernel does not allow VLNK vnodes to use the
6934	 * UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
6935	 * vnode to do the actual read of the symbolic link data.  Fortunately
6936	 * we already implemented this functionality for compressed files where
6937	 * we need to read the compressed data using a shadow vnode so we use
6938	 * the same implementation here, thus our shadow vnode is a raw inode.
6939	 *
6940	 * Doing this has the unfortunate consequence that if the symbolic link
6941	 * inode is compressed or encrypted we cannot read it as we are already
6942	 * using the raw inode and we can only have one raw inode.
6943	 */
6944	lck_spin_lock(&ni->size_lock);
6945	size = ni->data_size;
6946	lck_spin_unlock(&ni->size_lock);
6947	/* Zero length symbolic links are not allowed. */
6948	if (!size || size > MAXPATHLEN) {
6949		ntfs_error(ni->vol->mp, "Invalid symbolic link size %lld in "
6950				"mft_no 0x%llx, returning EINVAL.",
6951				(long long)size,
6952				(unsigned long long)ni->mft_no);
6953		err = EINVAL;
6954		goto err;
6955	}
6956	start_count = uio_resid(uio);
6957	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
6958	if (err) {
6959		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
6960				err);
6961		goto err;
6962	}
6963	if (!NInoRaw(raw_ni))
6964		panic("%s(): Requested raw inode but got non-raw one.\n",
6965				__FUNCTION__);
6966	lck_spin_lock(&raw_ni->size_lock);
6967	if (size > ubc_getsize(raw_ni->vn) || size != raw_ni->data_size)
6968		panic("%s(): size (0x%llx) > ubc_getsize(raw_ni->vn, 0x%llx) "
6969				"|| size != raw_ni->data_size (0x%llx)\n",
6970				__FUNCTION__, (unsigned long long)size,
6971				(unsigned long long)ubc_getsize(raw_ni->vn),
6972				(unsigned long long)raw_ni->data_size);
6973	lck_spin_unlock(&raw_ni->size_lock);
6974	/* Perform the actual read of the symbolic link data into the uio. */
6975	err = ntfs_read(raw_ni, uio, 0, TRUE);
6976	lck_rw_unlock_shared(&raw_ni->lock);
6977	vnode_put(raw_ni->vn);
6978	/*
6979	 * If the read was partial, reset @uio pretending that the read never
6980	 * happened unless we used up all the space in the uio and it was
6981	 * simply not big enough to hold the entire symbolic link data in which
6982	 * case we return a truncated result.
6983	 */
6984	if (err || (uio_resid(uio) && start_count - uio_resid(uio) != size)) {
6985		/*
6986		 * FIXME: Should we be trying to continue a partial read in
6987		 * case we can complete it with multiple calls to ntfs_read()?
6988		 */
6989		if (!err) {
6990			ntfs_debug("ntfs_read() returned a partial read, "
6991					"pretending the read never happened.");
6992			err = EIO;
6993		}
6994		uio_setoffset(uio, 0);
6995		uio_setresid(uio, start_count);
6996		if (err)
6997			ntfs_error(ni->vol->mp, "Failed to read symbolic link "
6998					"data (error %d).", err);
6999	}
7000	ntfs_debug("Done (error %d).", (int)err);
7001err:
7002	lck_rw_unlock_shared(&ni->lock);
7003	return err;
7004}
7005
7006/**
7007 * ntfs_mft_record_free_all - free clusters referenced by an mft record
7008 * @base_ni:	base ntfs inode to which the (extent) inode @ni and @m belong
7009 * @ni:		ntfs inode for which to free all clusters
7010 * @m:		mft record for which to free all clusters
7011 *
7012 * For the ntfs inode @ni and its mft record @m, iterate over all attributes in
7013 * the mft record and free all clusters referenced by the attributes.  @base_ni
7014 * is the base ntfs inode to which @ni and @m belong.
7015 *
7016 * Also, mark the mft record as not in use, increment its sequence number and
7017 * mark it dirty to ensure it gets written out later.
7018 *
7019 * When any operations fail this function notifies the user about it and marks
7020 * the volume dirty but does not return an error code as the caller can proceed
7021 * regardless without caring if some clusters failed to be freed.  A later
7022 * chkdsk will find them and free them and in the mean time they just waste
7023 * some space on the volume.
7024 */
7025static void ntfs_mft_record_free_all(ntfs_inode *base_ni, ntfs_inode *ni,
7026		MFT_RECORD *m)
7027{
7028	ntfs_volume *vol = base_ni->vol;
7029	ATTR_RECORD *a;
7030	errno_t err;
7031	ntfs_runlist rl;
7032
7033	for (a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
7034			a->type != AT_END;
7035			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
7036		if ((u8*)a < (u8*)m || (u8*)a > (u8*)m +
7037				le32_to_cpu(m->bytes_in_use) ||
7038				le32_to_cpu(m->bytes_in_use) >
7039				le32_to_cpu(m->bytes_allocated) ||
7040				!a->length) {
7041			ntfs_warning(vol->mp, "Found corrupt attribute whilst "
7042					"releasing deleted mft_no 0x%llx.  "
7043					"Run chkdsk to recover lost space and "
7044					"fix any other inconsistencies.",
7045					(unsigned long long)ni->mft_no);
7046			NVolSetErrors(vol);
7047			break;
7048		}
7049		/*
7050		 * For most resident attribute records, there is nothing we
7051		 * need to do as they do not reference any clusters outside the
7052		 * mft record itself.
7053		 */
7054		if (!a->non_resident) {
7055			STANDARD_INFORMATION *si;
7056
7057			/*
7058			 * We only need to deal with the standard information
7059			 * attribute.
7060			 */
7061			if (a->type != AT_STANDARD_INFORMATION)
7062				continue;
7063			/*
7064			 * We need to update the {a,m,c}times from the ntfs
7065			 * inode into the corresponding times in the standard
7066			 * information attribute.  The inode ctime, i.e. the
7067			 * last_mft_change_time in the standard information
7068			 * attribute, gives us a de facto deleted time that can
7069			 * be used by ntfsck and ntfsundelete for example.
7070			 */
7071			si = (STANDARD_INFORMATION*)((u8*)a +
7072					le16_to_cpu(a->value_offset));
7073			si->last_data_change_time = utc2ntfs(
7074					base_ni->last_data_change_time);
7075			si->last_mft_change_time = utc2ntfs(
7076					base_ni->last_mft_change_time);
7077			si->last_access_time = utc2ntfs(
7078					base_ni->last_access_time);
7079			/* Whilst here also update the file attributes. */
7080			si->file_attributes = base_ni->file_attributes;
7081			/*
7082			 * We need to take care to handle NTFS 1.x style
7083			 * standard information attributes on NTFS 3.0+ volumes
7084			 * as they are lazily updated on write after a volume
7085			 * has been upgraded from 1.x and after a volume has
7086			 * been accessed by an older NTFS driver such as the
7087			 * one in Windows NT4.
7088			 */
7089#if 0
7090			if (vol->major_ver <= 3 ||
7091					le32_to_cpu(a->value_length) <
7092					sizeof(STANDARD_INFORMATION))
7093				continue;
7094#endif
7095			/*
7096			 * We have an NTFS 3.0+ style, extended standard
7097			 * information attribute.
7098			 */
7099			/*
7100			 * TODO: When we implement support for $UsnJrnl, we
7101			 * will need to journal the delete event and update the
7102			 * usn field in the standard information attribute.
7103			 * For now this is not needed as we stamp the
7104			 * transaction log thus telling applications querying
7105			 * the transaction log that it does not contain
7106			 * uptodate information.  We cannot do this at unlink
7107			 * time because there may still be writes and truncates
7108			 * happening due to existing open file descriptors and
7109			 * the delete event has to come last.
7110			 */
7111			/*
7112			 * TODO: When we implement support for quotas, we will
7113			 * need to update the quota control entry belonging to
7114			 * the user_id specified in the owner_id field in the
7115			 * standard information attribute by updating its
7116			 * change_time field to the current time and
7117			 * decrementing its bytes_used field by the amount
7118			 * specified in the quota_charged field in the standard
7119			 * information attribute as well as setting the
7120			 * exceeded_time to 0 if we go from over the soft quota
7121			 * specified in the limit of the quota control entry.
7122			 * For now this is not needed as we mark all quotas as
7123			 * invalid when we mount a volume read-write.  We
7124			 * cannot do the quota update at unlink time because
7125			 * there may still be writes and truncates happening
7126			 * due to existing open file descriptors which will
7127			 * affect the quota related fields.
7128			 */
7129			continue;
7130		}
7131		/*
7132		 * For non-resident attribute records, we need to free all the
7133		 * clusters specified in their mapping pairs array.
7134		 *
7135		 * If this is the base extent, we only need to do this if the
7136		 * allocated size is not zero.  If this is not the base extent
7137		 * then by definition the allocated size cannot be zero and
7138		 * more importantly an extent mft rceord does not have the
7139		 * allocated_size field set thus it is always zero.
7140		 */
7141		if (!a->lowest_vcn && !a->allocated_size)
7142			continue;
7143		rl.rl = NULL;
7144		rl.alloc = rl.elements = 0;
7145		err = ntfs_mapping_pairs_decompress(vol, a, &rl);
7146		if (!err) {
7147			VCN lowest_vcn;
7148
7149			/*
7150			 * We need to supply the correct start and count values
7151			 * otherwise freeing the clusters fails when an
7152			 * attribute has multiple extent records because the
7153			 * runlist contains unmapped elements.
7154			 */
7155			lowest_vcn = sle64_to_cpu(a->lowest_vcn);
7156			err = ntfs_cluster_free_from_rl(vol, rl.rl, lowest_vcn,
7157					sle64_to_cpu(a->highest_vcn) + 1 -
7158					lowest_vcn, NULL);
7159			if (err) {
7160				ntfs_warning(vol->mp, "Failed to free some "
7161						"allocated clusters belonging "
7162						"to mft_no 0x%llx (error "
7163						"%d).  Run chkdsk to recover "
7164						"the lost space.",
7165						(unsigned long long)ni->mft_no,
7166						err);
7167				NVolSetErrors(vol);
7168			}
7169			OSFree(rl.rl, rl.alloc, ntfs_malloc_tag);
7170		} else {
7171			ntfs_error(vol->mp, "Cannot free some allocated space "
7172					"belonging to mft_no 0x%llx because "
7173					"the decompression of the mapping "
7174					"pairs array failed (error %d).  Run "
7175					"chkdsk to recover the lost space.",
7176					(unsigned long long)ni->mft_no, err);
7177			NVolSetErrors(vol);
7178		}
7179	}
7180	/*
7181	 * We have processed all attributes in the base mft record thus we can
7182	 * mark it as not in use, increment its sequence number, and mark it
7183	 * dirty for later writeout.
7184	 */
7185	m->flags &= ~MFT_RECORD_IN_USE;
7186	if (m->sequence_number != const_cpu_to_le16(0xffff))
7187		m->sequence_number = cpu_to_le16(
7188				le16_to_cpu(m->sequence_number) + 1);
7189	else
7190		m->sequence_number = const_cpu_to_le16(1);
7191	ni->seq_no = le16_to_cpu(m->sequence_number);
7192	NInoSetMrecNeedsDirtying(ni);
7193}
7194
7195/**
7196 * ntfs_vnop_inactive - the last reference to a vnode has been dropped
7197 * @args:	arguments to inactive function
7198 *
7199 * @args contains:
7200 *	vnode_t a_vp;		vnode whose last reference has been dropped
7201 *	vfs_context_t a_context;
7202 *
7203 * Last reference to a vnode has been dropped or a forced unmount is in
7204 * progress.
7205 *
7206 * Note: When called from reclaim, the vnode has a zero v_iocount and
7207 *	 v_usecount and vnode_isrecycled() is true.
7208 *
7209 * Return 0 on success and errno on error.
7210 *
7211 * Note the current OS X VFS ignores the return value from VNOP_INACTIVE() and
7212 * hence ntfs_vnop_inactive().
7213 */
7214static int ntfs_vnop_inactive(struct vnop_inactive_args *args)
7215{
7216	leMFT_REF mref;
7217	vnode_t vn = args->a_vp;
7218	ntfs_inode *base_ni, *mftbmp_ni, *ni = NTFS_I(vn);
7219	ntfs_volume *vol;
7220	MFT_RECORD *m;
7221	leMFT_REF *mrefs;
7222	unsigned nr_mrefs;
7223	errno_t err;
7224	BOOL is_delete;
7225
7226	if (!ni) {
7227		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
7228		return 0;
7229	}
7230	is_delete = !ni->link_count;
7231	vol = ni->vol;
7232	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x%s.",
7233			(unsigned long long)ni->mft_no,
7234			(unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len,
7235			is_delete ? ", is delete" : "");
7236	base_ni = ni;
7237	if (NInoAttr(ni))
7238		base_ni = ni->base_ni;
7239	/*
7240	 * This is the last close thus remove any directory hints.
7241	 *
7242	 * Note we check for presence of directory hints outside the locks as
7243	 * an optimization.  It is not a disaster if we miss any as all will be
7244	 * released in ntfs_inode_free() before the inode is thrown away at the
7245	 * latest.
7246	 */
7247	if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION &&
7248			ni->nr_dirhints) {
7249		lck_rw_lock_exclusive(&ni->lock);
7250		ntfs_dirhints_put(ni, 0);
7251		lck_rw_unlock_exclusive(&ni->lock);
7252	}
7253	/*
7254	 * If the inode is not being deleted or this is a raw inode sync it and
7255	 * we are done.
7256	 */
7257	if (!is_delete || NInoRaw(ni)) {
7258sync:
7259		/*
7260		 * Commit dirty data to disk unless mounted read-only.
7261		 *
7262		 * WARNING: Please see <rdar://problem/7202356> why this causes
7263		 * stack exhaustion and kernel panics by creating a loop where
7264		 * the VNOP_INACTIVE() calls ntfs_inode_sync() which ends up
7265		 * doing ntfs_inode_get() which in turn triggers another
7266		 * VNOP_INACTIVE() which in turn calls ntfs_inode_sync() and
7267		 * thus ntfs_inode_get() which in turns calls VNOP_INACTIVE()
7268		 * and so on until the stack overflows.
7269		 */
7270		err = 0;
7271		if (!NVolReadOnly(vol))
7272			err = ntfs_inode_sync(ni, IO_SYNC | IO_CLOSE, FALSE);
7273		if (!err)
7274			ntfs_debug("Done.");
7275		else
7276			ntfs_error(vol->mp, "Failed to sync mft_no 0x%llx, "
7277					"type 0x%x, name_len 0x%x (error %d).",
7278					(unsigned long long)ni->mft_no,
7279					(unsigned)le32_to_cpu(ni->type),
7280					(unsigned)ni->name_len, err);
7281		return err;
7282	}
7283	if (ni != base_ni)
7284		lck_rw_lock_exclusive(&base_ni->lock);
7285	lck_rw_lock_exclusive(&ni->lock);
7286	/* Do not allow messing with the inode once it has been deleted. */
7287	if (NInoDeleted(ni)) {
7288		/* Remove the inode from the name cache. */
7289		cache_purge(vn);
7290		lck_rw_unlock_exclusive(&ni->lock);
7291		if (ni != base_ni)
7292			lck_rw_unlock_exclusive(&base_ni->lock);
7293		ntfs_debug("Done (was already deleted).");
7294		return 0;
7295	}
7296	/*
7297	 * If someone else re-instantiated the inode whilst we were waiting for
7298	 * the inode lock sync the inode instead of deleting it.
7299	 */
7300	if (ni->link_count) {
7301		lck_rw_unlock_exclusive(&ni->lock);
7302		if (ni != base_ni)
7303			lck_rw_unlock_exclusive(&base_ni->lock);
7304		ntfs_debug("Someone re-instantiated the inode.");
7305		goto sync;
7306	}
7307	/*
7308	 * The inode has been unlinked, delete it now freeing all allocated
7309	 * space on disk as well as all related resources on disk.  Note we
7310	 * proceed on errors because there is not much we can do about them.
7311	 * We have to carry on regardless as the inode is about to be
7312	 * terminated in any case.
7313	 *
7314	 * On a metadata affecting error, we mark the volume dirty and leave it
7315	 * to a subsequent chkdsk to clean up after us.  This is not a disaster
7316	 * since there are no directory entries pointing to the inode @ni any
7317	 * more, thus us failing just means that we will keep some on disk
7318	 * resources allocated so chkdsk will just find this file and delete
7319	 * it.
7320	 *
7321	 * First, remove the inode from the inode cache so it cannot be found
7322	 * any more.
7323	 */
7324	lck_mtx_lock(&ntfs_inode_hash_lock);
7325	/*
7326	 * Mark the inode as having been deleted so we do not try to remove it
7327	 * from the ntfs inode hash again in ntfs_inode_reclaim().
7328	 */
7329	NInoSetDeleted(ni);
7330	/*
7331	 * Remove the ntfs_inode from the inode hash so it cannot be looked up
7332	 * any more.
7333	 */
7334	ntfs_inode_hash_rm_nolock(ni);
7335	lck_mtx_unlock(&ntfs_inode_hash_lock);
7336	/* Remove the inode from the name cache if it is still in it. */
7337	cache_purge(vn);
7338	/*
7339	 * The inode/vnode are no longer reachable at all so drop the inode
7340	 * lock.  Anyone waiting on the lock should test for NInoDeleted() and
7341	 * abort once they have taken the lock.
7342	 */
7343	lck_rw_unlock_exclusive(&ni->lock);
7344	/* In case someone is waiting on the inode do a wakeup. */
7345	ntfs_inode_wakeup(ni);
7346	/* Invalidate all buffers to do with the vnode. */
7347	err = buf_invalidateblks(vn, 0, 0, 0);
7348	if (err)
7349		ntfs_error(vol->mp, "Failed to invalidate cached buffers "
7350				"(error %d).", err);
7351	/*
7352	 * Invalidate all cached pages in the VM.
7353	 *
7354	 * This will fail for non-regular (VREG) nodes as they do not have UBC
7355	 * info attached to them and ubc_msync() returns error in this case.
7356	 */
7357	if (vnode_isreg(vn)) {
7358		err = ubc_msync(vn, 0, ubc_getsize(vn), NULL, UBC_INVALIDATE);
7359		if (err)
7360			ntfs_error(vol->mp, "Failed to invalidate cached "
7361					"pages (error %d).", err);
7362	}
7363	/*
7364	 * Cause the vnode to be reused immediately when we return rather than
7365	 * sitting around in the vnode cache.
7366	 */
7367	vnode_recycle(vn);
7368	/*
7369	 * ntfs_unlink() and ntfs_vnop_rename() bail out for attribute inodes
7370	 * so we cannot get here with an attribute inode unless something has
7371	 * gone badly wrong.
7372	 *
7373	 * When a named stream is deleted via VNOP_REMOVENAMEDSTREAM() its
7374	 * link_count is set to zero so we get here on the last close.  We have
7375	 * to perform the actual freeing of allocated space if the attribute is
7376	 * non-resident as well as the removal of the attribute record here.
7377	 */
7378	if (ni != base_ni) {
7379		ntfs_attr_search_ctx *ctx;
7380
7381		if (ni->type != AT_DATA || !ni->name_len)
7382			panic("%s(): ni != base_ni && (ni->type != AT_DATA || "
7383					"!ni->name_len)\n", __FUNCTION__);
7384		/*
7385		 * For simplicity, if the attribute is non-resident, we
7386		 * truncate the attribute to zero size first as that causes
7387		 * both the allocated clusters to be freed as well as all
7388		 * extent attribute records to be deleted.
7389		 *
7390		 * We then only need to remove the base attribute record and we
7391		 * are done.
7392		 */
7393		if (NInoNonResident(ni)) {
7394			err = ntfs_attr_resize(ni, 0, 0, NULL);
7395			if (err) {
7396				ntfs_error(vol->mp, "Cannot delete named "
7397						"stream from mft_no 0x%llx "
7398						"because truncating the "
7399						"stream inode to zero size "
7400						"failed (error %d).",
7401						(unsigned long long)ni->mft_no,
7402						err);
7403				goto err;
7404			}
7405		}
7406		/* Remove the named stream. */
7407		err = ntfs_mft_record_map(base_ni, &m);
7408		if (err) {
7409			ntfs_error(vol->mp, "Failed to delete named stream "
7410					"because mapping the mft record "
7411					"0x%llx failed (error %d).",
7412					(unsigned long long)ni->mft_no, err);
7413			goto err;
7414		}
7415		ctx = ntfs_attr_search_ctx_get(base_ni, m);
7416		if (!ctx) {
7417			ntfs_error(vol->mp, "Failed to delete named stream "
7418					"because allocating an attribute "
7419					"search context failed.");
7420			goto unm_err;
7421		}
7422		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
7423				NULL, 0, ctx);
7424		if (err) {
7425			ntfs_error(vol->mp, "Failed to delete named stream "
7426					"because looking up the named $DATA "
7427					"attribute in the mft record 0x%llx "
7428					"failed (error %d).",
7429					(unsigned long long)ni->mft_no, err);
7430			goto put_err;
7431		}
7432		err = ntfs_attr_record_delete(base_ni, ctx);
7433		if (err) {
7434			ntfs_error(vol->mp, "Failed to delete named stream "
7435					"because deleting the named $DATA "
7436					"attribute from its mft record 0x%llx "
7437					"failed (error %d).",
7438					(unsigned long long)ctx->ni->mft_no,
7439					err);
7440			goto put_err;
7441		}
7442		ntfs_debug("Done (deleted attribute inode).");
7443put_err:
7444		ntfs_attr_search_ctx_put(ctx);
7445unm_err:
7446		ntfs_mft_record_unmap(base_ni);
7447err:
7448		lck_rw_unlock_exclusive(&base_ni->lock);
7449		return err;
7450	}
7451	/*
7452	 * We only need to be concerned with the allocated space on disk which
7453	 * we need to deallocate and any related resources on disk, which we
7454	 * also need to deallocate and/or mark unused.  To do this, we map the
7455	 * base mft record and iterate over all its attributes and deal with
7456	 * each of them in sequence.
7457	 */
7458	err = ntfs_mft_record_map(ni, &m);
7459	if (err) {
7460		ntfs_warning(vol->mp, "Cannot release deleted mft_no 0x%llx "
7461				"because the mapping of the base mft record "
7462				"failed (error %d).  Run chkdsk to recover "
7463				"lost resources.",
7464				(unsigned long long)ni->mft_no, err);
7465		NVolSetErrors(vol);
7466		return 0;
7467	}
7468	/*
7469	 * Make sure the mft record was marked as not in use in
7470	 * ntfs_unlink_internal().
7471	 */
7472	if (m->flags & MFT_RECORD_IN_USE)
7473		panic("%s(): m->flags & MFT_RECORD_IN_USE\n", __FUNCTION__);
7474	/*
7475	 * We will need the mft reference of the base mft record below but we
7476	 * are about to change it thus make a note of the old one now.
7477	 */
7478	mref = MK_LE_MREF(ni->mft_no, ni->seq_no);
7479	/*
7480	 * Release all clusters allocated to attribute records located in the
7481	 * extent mft record.
7482	 */
7483	ntfs_mft_record_free_all(ni, ni, m);
7484	/*
7485	 * We are finished with the base mft record, if there is an attribute
7486	 * list attribute, we iterate over its entries and each time we
7487	 * encounter an extent mft record that we have not done yet, we map it
7488	 * and iterate over all its attributes as we did above for the base mft
7489	 * record, followed by marking the extent mft record as not in use,
7490	 * incrementing its sequence number, and marking it dirty, again as we
7491	 * did above for the base mft record.  Finally, we add it to our list
7492	 * of mft records to deallocate from the $MFT/$BITMAP attribute.
7493	 *
7494	 * As an optimization, we reuse the attribute list buffer as our list
7495	 * of mft records to deallocate from the $MFT/$BITMAP attribute.  This
7496	 * works because each ATTR_LIST_ENTRY record in the attribute list
7497	 * attribute is at least 24 bytes long and we only need to store 8
7498	 * bytes for each mft reference in our list of mft records to
7499	 * deallocate so we are guaranteed to have enough space in the buffer
7500	 * for our needs and we are also guaranteed that we will never
7501	 * overwrite part of the attribute list attribute data that we have not
7502	 * dealt with yet.
7503	 */
7504	nr_mrefs = 1;
7505	mrefs = &mref;
7506	if (NInoAttrList(ni)) {
7507		ATTR_LIST_ENTRY *entry, *next_entry, *end;
7508		ntfs_inode *eni;
7509
7510		if (!ni->attr_list || ni->attr_list_size < sizeof(leMFT_REF) ||
7511				!ni->attr_list_alloc)
7512			panic("%s(): !ni->attr_list || !ni->attr_list_size || "
7513					"!ni->attr_list_alloc\n", __FUNCTION__);
7514		entry = (ATTR_LIST_ENTRY*)ni->attr_list;
7515		mrefs = (leMFT_REF*)entry;
7516		next_entry = (ATTR_LIST_ENTRY*)((u8*)entry +
7517				le16_to_cpu(entry->length));
7518		end = (ATTR_LIST_ENTRY*)(ni->attr_list + ni->attr_list_size);
7519		/*
7520		 * Add the mft reference of the base mft record as the first
7521		 * element in our list as we have already dealt with it.
7522		 */
7523		*mrefs = mref;
7524		while (entry < end) {
7525			unsigned i;
7526
7527			mref = entry->mft_reference;
7528			for (i = 0; i < nr_mrefs; i++) {
7529				if (mref == mrefs[i])
7530					goto do_next;
7531			}
7532			/*
7533			 * This mft reference has not been encountered before.
7534			 * Add it to the list of mft references and free all
7535			 * disk storage associated with all the attribute
7536			 * records stored in the mft record with this mft
7537			 * reference.
7538			 */
7539			mrefs[nr_mrefs++] = mref;
7540			err = ntfs_extent_mft_record_map(ni, le64_to_cpu(mref),
7541					&eni, &m);
7542			if (!err) {
7543				/*
7544				 * Release all clusters allocated to attribute
7545				 * records located in the extent mft record and
7546				 * mark the mft record as not in use.
7547				 *
7548				 * We need to ensure the mft record is marked
7549				 * as in use.  It can happen that it is not
7550				 * marked in use after a system crash occurs
7551				 * whilst a file is being extended.
7552				 */
7553				if (m->flags & MFT_RECORD_IN_USE)
7554					ntfs_mft_record_free_all(ni, eni, m);
7555				else {
7556					ntfs_warning(vol->mp, "Extent mft_no "
7557							"0x%llx, base mft_no "
7558							"0x%llx is marked as "
7559							"not in use.  Cannot "
7560							"release allocated "
7561							"clusters.  Unmount "
7562							"and run chkdsk to "
7563							"recover the lost "
7564							"clusters.",
7565							(unsigned long long)
7566							MREF_LE(mref),
7567							(unsigned long long)
7568							ni->mft_no);
7569					NVolSetErrors(vol);
7570				}
7571				/* Unmap the mft record again. */
7572				ntfs_extent_mft_record_unmap(eni);
7573			} else {
7574			     ntfs_warning(vol->mp, "Failed to release "
7575					     "allocated clusters because "
7576					     "mapping extent mft_no 0x%llx, "
7577					     "base mft_no 0x%llx failed "
7578					     "(error %d).  Unmount and run "
7579					     "chkdsk to recover the lost "
7580					     "clusters.",
7581					     (unsigned long long)MREF_LE(mref),
7582					     (unsigned long long)ni->mft_no,
7583					     err);
7584			     NVolSetErrors(vol);
7585			}
7586do_next:
7587			entry = next_entry;
7588			next_entry = (ATTR_LIST_ENTRY*)((u8*)entry +
7589					le16_to_cpu(entry->length));
7590		}
7591	}
7592	ntfs_mft_record_unmap(ni);
7593	/*
7594	 * Mark the base mft record and all extent mft records (if any) as
7595	 * unused in the mft bitmap.
7596	 *
7597	 * Note that this means that ntfs_inode_reclaim() may run when someone
7598	 * else has already reused one of the mft records we are freeing now.
7599	 * This is ok because all ntfs_inode_reclaim() does is to do some
7600	 * memory freeing.  And we have already removed the inode from the
7601	 * inode cache thus there are no problems from that point of view
7602	 * either.
7603	 */
7604	lck_rw_lock_exclusive(&vol->mftbmp_lock);
7605	mftbmp_ni = vol->mftbmp_ni;
7606	err = vnode_get(mftbmp_ni->vn);
7607	if (err)
7608		ntfs_warning(vol->mp, "Failed to get vnode for $MFT/$BITMAP "
7609				"(error %d) thus cannot release mft "
7610				"record(s).  Run chkdsk to recover the lost "
7611				"mft record(s).", err);
7612	else {
7613		lck_rw_lock_shared(&mftbmp_ni->lock);
7614		while (nr_mrefs > 0) {
7615			nr_mrefs--;
7616			err = ntfs_bitmap_clear_bit(mftbmp_ni,
7617					MREF_LE(mrefs[nr_mrefs]));
7618			if (!err) {
7619				/*
7620				 * We cleared a bit in the mft bitmap thus we
7621				 * need to reflect this in the cached number of
7622				 * free mft records.
7623				 */
7624				vol->nr_free_mft_records++;
7625				if (vol->nr_free_mft_records >=
7626						vol->nr_mft_records)
7627					panic("%s(): vol->nr_free_mft_records "
7628							"> vol->nr_mft_records"
7629							"\n", __FUNCTION__);
7630			} else {
7631				ntfs_error(vol->mp, "Failed to free mft_no "
7632						"0x%llx (error %d).  Run "
7633						"chkdsk to recover the lost "
7634						"mft record.",
7635						(unsigned long long)
7636						MREF_LE(mrefs[nr_mrefs]), err);
7637				NVolSetErrors(vol);
7638			}
7639		}
7640		lck_rw_unlock_shared(&mftbmp_ni->lock);
7641		(void)vnode_put(mftbmp_ni->vn);
7642	}
7643	lck_rw_unlock_exclusive(&vol->mftbmp_lock);
7644	ntfs_debug("Done (deleted base inode).");
7645	return 0;
7646}
7647
7648/**
7649 * ntfs_vnop_reclaim - free ntfs specific parts of a vnode so it can be reused
7650 * @a:		arguments to reclaim function
7651 *
7652 * @a contains:
7653 *	vnode_t a_vp;		vnode to be reclaimed
7654 *	vfs_context_t a_context;
7655 *
7656 * Reclaim a vnode so it can be used for other purposes.
7657 *
7658 * Note: This is called from reclaim.  The vnode has a zero v_iocount and
7659 *	 v_usecount and vnode_isrecycled() is true.
7660 *
7661 * Return 0 on success and errno on error.
7662 *
7663 * Note the current OS X VFS panic()s the machine if VNOP_RECLAIM() and hence
7664 * ntfs_vnop_reclaim() returns an error.
7665 */
7666static int ntfs_vnop_reclaim(struct vnop_reclaim_args *a)
7667{
7668	vnode_t vn = a->a_vp;
7669	ntfs_inode *ni = NTFS_I(vn);
7670	errno_t err;
7671
7672	/* Do not dereference @ni if it is NULL. */
7673#ifdef DEBUG
7674	if (ni)
7675		ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len "
7676				"0x%x.", (unsigned long long)ni->mft_no,
7677				le32_to_cpu(ni->type), (unsigned)ni->name_len);
7678	else
7679		ntfs_debug("Entering for already reclaimed vnode!");
7680#endif
7681	vnode_removefsref(vn);
7682	err = ntfs_inode_reclaim(ni);
7683	ntfs_debug("Done (error %d).", (int)err);
7684	return err;
7685}
7686
7687/**
7688 * ntfs_vnop_pathconf - get configurable pathname variables
7689 * @a:		arguments to pathconf function
7690 *
7691 * @a contains:
7692 *	vnode_t a_vp;		vnode for which to return pathconf information
7693 *	int a_name;		the pathconf variable to be queried
7694 *	register_t *a_retval;	destination for result of query
7695 *	vfs_context_t a_context;
7696 *
7697 * Return POSIX pathconf information applicable to ntfs file system.  Some
7698 * @a_name values are intercepted by the VFS in vn_pathconf (pathconf(2) ->
7699 * vn_pathconf() -> VNOP_PATHCONF() -> ntfs_vnop_pathconf()) so we do not
7700 * bother with them.
7701 *
7702 * Return 0 on success and EINVAL if an unsupported @a_name was queried for.
7703 */
7704static int ntfs_vnop_pathconf(struct vnop_pathconf_args *a)
7705{
7706	ntfs_inode *ni = NTFS_I(a->a_vp);
7707	ntfs_volume *vol = NTFS_MP(vnode_mount(a->a_vp));
7708	errno_t err = 0;
7709
7710	ntfs_debug("Entering for pathconf variable number %d.", a->a_name);
7711	if (ni) {
7712		lck_rw_lock_shared(&ni->lock);
7713		/*
7714		 * Do not allow messing with the inode once it has been
7715		 * deleted.
7716		 */
7717		if (NInoDeleted(ni)) {
7718			/* Remove the inode from the name cache. */
7719			cache_purge(ni->vn);
7720			lck_rw_unlock_shared(&ni->lock);
7721			ntfs_debug("Directory is deleted.");
7722			return ENOENT;
7723		}
7724	}
7725	switch (a->a_name) {
7726	case _PC_LINK_MAX:
7727		/*
7728		 * The maximum file link count.  For ntfs, the link count is
7729		 * stored in the mft record in the link_count field which is of
7730		 * type le16, thus 16 bits.  For attribute inodes and
7731		 * directories however, no hard links are allowed and thus the
7732		 * maximum link count is 1.
7733		 */
7734		if (!ni) {
7735			ntfs_debug("Entered with NULL ntfs_inode, aborting.");
7736			return EINVAL;
7737		}
7738		*a->a_retval = NTFS_MAX_HARD_LINKS;
7739		if (NInoAttr(ni) || S_ISDIR(ni->mode))
7740			*a->a_retval = 1;
7741		break;
7742	case _PC_NAME_MAX:
7743		/*
7744		 * The maximum number of bytes in a filename.  For ntfs, this
7745		 * is stored in the attribute record in the name_length field
7746		 * which is of type u8, thus 8 bits.
7747		 */
7748		*a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */
7749		break;
7750	case _PC_PATH_MAX:
7751		/*
7752		 * The maximum number of bytes in a path name.  Ntfs imposes no
7753		 * restrictions so use the system limit.
7754		 */
7755		*a->a_retval = PATH_MAX; /* 1024 */
7756		break;
7757	case _PC_PIPE_BUF:
7758		/*
7759		 * The maximum number of bytes which will be written atomically
7760		 * to a pipe, again ntfs imposes no restrictions so use the
7761		 * system limit.
7762		 */
7763		*a->a_retval = PIPE_BUF; /* 512 */
7764		break;
7765	case _PC_CHOWN_RESTRICTED:
7766		/*
7767		 * Non-zero if appropriate privileges are required for the
7768		 * chown(2) system call.  For ntfs, this is always the case.
7769		 */
7770		*a->a_retval = 200112; /* unistd.h: _POSIX_CHOWN_RESTRICTED */
7771		break;
7772	case _PC_NO_TRUNC:
7773		/*
7774		 * Non-zero if accessing filenames longer than _POSIX_NAME_MAX
7775		 * (which we specified above to be NTFS_MAX_NAME_LEN) generates
7776		 * an error.  For ntfs, this is always the case.
7777		 */
7778		*a->a_retval = 200112; /* unistd.h: _POSIX_NO_TRUNC */
7779		break;
7780	case _PC_NAME_CHARS_MAX:
7781		/*
7782		 * The maximum number of characters in a filename.  This is
7783		 * the same as _PC_NAME_MAX, above.
7784		 */
7785		*a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */
7786		break;
7787	case _PC_CASE_SENSITIVE:
7788		/*
7789		 * Return 1 if case sensitive and 0 if not.  For ntfs, this
7790		 * depends on the mount options.
7791		 */
7792		if (vol)
7793			*a->a_retval = (NVolCaseSensitive(vol) ? 1 : 0);
7794		else
7795			err = EINVAL;
7796		break;
7797	case _PC_CASE_PRESERVING:
7798		/*
7799		 * Return 1 if case preserving and 0 if not.  For ntfs, this is
7800		 * always 1, i.e. ntfs always preserves case.
7801		 */
7802		*a->a_retval = 1;
7803		break;
7804	case _PC_FILESIZEBITS:
7805		/*
7806		 * The number of bits to represent file size.  For ntfs, the
7807		 * file size is stored in the attribute record in the data_size
7808		 * field which is of type sle64, thus 63 bits.
7809		 */
7810		*a->a_retval = 63;
7811		break;
7812	default:
7813		err = EINVAL;
7814	}
7815	if (ni)
7816		lck_rw_unlock_shared(&ni->lock);
7817	ntfs_debug("Done (error %d).", (int)err);
7818	return err;
7819}
7820
7821/**
7822 * ntfs_vnop_allocate -
7823 */
7824static int ntfs_vnop_allocate(struct vnop_allocate_args *a)
7825{
7826	errno_t err;
7827
7828	ntfs_debug("Entering.");
7829	// TODO:
7830	(void)nop_allocate(a);
7831	err = ENOTSUP;
7832	ntfs_debug("Done (error %d).", (int)err);
7833	return err;
7834}
7835
7836/**
7837 * ntfs_vnop_pagein - read a range of pages into memory
7838 * @a:		arguments to pagein function
7839 *
7840 * @a contains:
7841 *	vnode_t a_vp;		vnode whose data to read into the page range
7842 *	upl_t a_pl;		page list describing destination page range
7843 *	upl_offset_t a_pl_offset; byte offset into page list at which to start
7844 *	off_t a_f_offset;	byte offset in the vnode at which to start
7845 *	size_t a_size;		number of bytes to read from the vnode
7846 *	int a_flags;		flags further describing the pagein request
7847 *	vfs_context_t a_context;
7848 *
 * Read @a->a_size bytes from the vnode @a->a_vp, starting at byte offset
7850 * @a->a_f_offset into the vnode, into the range of pages specified by the page
7851 * list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list.
7852 *
7853 * The flags in @a->a_flags further describe the pagein request.  The following
7854 * pagein flags are currently defined in OS X kernel:
7855 *	UPL_IOSYNC	- Perform synchronous i/o.
7856 *	UPL_NOCOMMIT	- Do not commit/abort the page range.
7857 *	UPL_NORDAHEAD	- Do not perform any speculative read-ahead.
7858 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
7859 *
7860 * For encrypted attributes we abort for now as we do not support them yet.
7861 *
7862 * For non-resident, non-compressed attributes we use cluster_pagein_ext()
7863 * which deals with both normal and multi sector transfer protected attributes.
7864 *
7865 * For resident attributes and non-resident, compressed attributes we read the
7866 * data ourselves by mapping the page list, and in the resident case, mapping
7867 * the mft record, looking up the attribute in it, and copying the requested
7868 * data from the mapped attribute into the page list, then unmapping the mft
7869 * record, whilst for non-resident, compressed attributes, we get the raw inode
7870 * and use it with ntfs_read_compressed() to read and decompress the data into
7871 * our mapped page list.  We then unmap the page list and finally, if
7872 * UPL_NOCOMMIT is not specified, we commit (success) or abort (error) the page
7873 * range.
7874 *
7875 * Return 0 on success and errno on error.
7876 *
7877 * Note the pages in the page list are marked busy on entry and the busy bit is
7878 * cleared when we commit the page range.  Thus it is perfectly safe for us to
7879 * fill the pages with encrypted or mst protected data and to decrypt or mst
7880 * deprotect in place before committing the page range.
7881 *
7882 * Adapted from cluster_pagein_ext().
7883 */
7884static int ntfs_vnop_pagein(struct vnop_pagein_args *a)
7885{
7886	ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
7887	int err;
7888
7889	if (!ni) {
7890		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
7891		if (!(a->a_flags & UPL_NOCOMMIT) && a->a_pl)
7892			ubc_upl_abort_range(a->a_pl, a->a_pl_offset, a->a_size,
7893					UPL_ABORT_FREE_ON_EMPTY |
7894					UPL_ABORT_ERROR);
7895		return EINVAL;
7896	}
7897	base_ni = ni;
7898	if (NInoAttr(ni))
7899		base_ni = ni->base_ni;
7900	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%llx, "
7901			"pagein flags 0x%x, page list offset 0x%llx.",
7902			(unsigned long long)ni->mft_no,
7903			(unsigned long long)a->a_f_offset,
7904			(unsigned long long)a->a_size, a->a_flags,
7905			(unsigned long long)a->a_pl_offset);
7906	err = ntfs_pagein(ni, a->a_f_offset, a->a_size, a->a_pl,
7907			a->a_pl_offset, a->a_flags);
7908	/*
7909	 * Update the last_access_time (atime) if something was read and this
7910	 * is the base ntfs inode or it is a named stream (this is what HFS+
7911	 * does, too).
7912	 *
7913	 * Skip the update if atime updates are disabled via the noatime mount
7914	 * option or the volume is read only or this is a symbolic link.
7915	 *
7916	 * Also, skip the core system files except for the root directory.
7917	 */
7918	if (!err && !NVolReadOnly(ni->vol) &&
7919			!(vfs_flags(ni->vol->mp) & MNT_NOATIME) &&
7920			!S_ISLNK(base_ni->mode) &&
7921			(ni == base_ni || ni->type == AT_DATA)) {
7922		BOOL need_update_time;
7923
7924		need_update_time = TRUE;
7925		if (ni->vol->major_ver > 1) {
7926			if (base_ni->mft_no <= FILE_Extend &&
7927					base_ni != ni->vol->root_ni)
7928				need_update_time = FALSE;
7929		} else {
7930			if (base_ni->mft_no <= FILE_UpCase &&
7931					base_ni != ni->vol->root_ni)
7932				need_update_time = FALSE;
7933		}
7934		if (need_update_time) {
7935			base_ni->last_access_time = ntfs_utc_current_time();
7936			NInoSetDirtyTimes(base_ni);
7937		}
7938	}
7939	return err;
7940}
7941
7942// TODO: Move to ntfs_page.[hc].
7943static int ntfs_mst_pageout(ntfs_inode *ni, upl_t upl, upl_offset_t upl_ofs,
7944		unsigned size, s64 attr_ofs, s64 attr_size, int flags)
7945{
7946	ntfs_volume *vol = ni->vol;
7947	u8 *kaddr;
7948	kern_return_t kerr;
7949	unsigned rec_size, rec_shift, nr_recs, i;
7950	int err;
7951	NTFS_RECORD_TYPE magic = 0;
7952	BOOL do_commit;
7953
7954	do_commit = !(flags & UPL_NOCOMMIT);
7955	if (ni->type == AT_INDEX_ALLOCATION)
7956		magic = magic_INDX;
7957	else
7958		panic("%s(): Unknown mst protected inode 0x%llx, type 0x%x, "
7959				"name_len 0x%x.", __FUNCTION__,
7960				(unsigned long long)ni->mft_no,
7961				(unsigned)le32_to_cpu(ni->type),
7962				(unsigned)ni->name_len);
7963	ntfs_debug("Entering for mft_no 0x%llx, page list offset 0x%llx, size "
7964			"0x%x, offset 0x%llx, pageout flags 0x%x, magic is "
7965			"0x%x.", (unsigned long long)ni->mft_no,
7966			(unsigned long long)upl_ofs, size,
7967			(unsigned long long)attr_ofs, flags,
7968			(unsigned)le32_to_cpu(magic));
7969	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
7970			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
7971		err = EINVAL;
7972		goto err;
7973	}
7974	if (!NInoMstProtected(ni))
7975		panic("%s(): Called for non-mst protected attribute.\n",
7976				__FUNCTION__);
7977	if (!NInoNonResident(ni))
7978		panic("%s(): Resident mst protected attribute.\n",
7979				__FUNCTION__);
7980	rec_size = ni->block_size;
7981	if (attr_ofs & (rec_size - 1) || size & (rec_size - 1))
7982		panic("%s(): Write not aligned to NTFS record boundary.\n",
7983				__FUNCTION__);
7984	rec_shift = ni->block_size_shift;
7985	/* Clip the number of records to the size of the attribute. */
7986	nr_recs = size >> rec_shift;
7987	if (attr_ofs + size > attr_size) {
7988		unsigned to_write;
7989
7990		/* Abort any pages outside the end of the attribute. */
7991		to_write = attr_size - attr_ofs;
7992		nr_recs = to_write >> rec_shift;
7993		to_write = (to_write + PAGE_MASK) & ~PAGE_MASK;
7994		if (size != to_write) {
7995			if (size < to_write)
7996				panic("%s(): size less than to_write.\n",
7997						__FUNCTION__);
7998			ntfs_debug("Truncating write past end of attribute.");
7999			if (do_commit)
8000				ubc_upl_abort_range(upl, upl_ofs + to_write,
8001						size - to_write,
8002						UPL_ABORT_FREE_ON_EMPTY);
8003			size = to_write;
8004		}
8005	}
8006	if (!nr_recs)
8007		panic("%s(): NTFS record size greater than write size.\n",
8008				__FUNCTION__);
8009	/*
8010	 * Need to apply the mst fixups and abort on errors.  To apply the
8011	 * fixups need to map the page list so we can access its contents.
8012	 */
8013	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
8014	if (kerr != KERN_SUCCESS) {
8015		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).",
8016				(int)kerr);
8017		err = EIO;
8018		goto err;
8019	}
8020	/*
8021	 * Loop over the records in the page list and for each apply the mst
8022	 * fixups.  On any fixup errors, remove all the applied fixups and
8023	 * abort the write completely.
8024	 */
8025	for (i = 0; i < nr_recs; i++) {
8026		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + (i << rec_shift));
8027		if (__ntfs_is_magic(rec->magic, magic)) {
8028			err = ntfs_mst_fixup_pre_write(rec, rec_size);
8029			if (err) {
8030				ntfs_error(vol->mp, "Failed to apply mst "
8031						"fixups (mft_no 0x%llx, type "
8032						"0x%x, offset 0x%llx).",
8033						(unsigned long long)ni->mft_no,
8034						(unsigned)le32_to_cpu(ni->type),
8035						(unsigned long long)attr_ofs +
8036						(i << rec_shift));
8037				goto mst_err;
8038			}
8039		}
8040	}
8041	/* Unmap the page list again so we can call cluster_pageout_ext(). */
8042	// FIXME: Can we leave the page list mapped throughout the
8043	// cluster_pageout_ext() call?  That would be a lot more efficient and
8044	// simplify error handling.
8045	kerr = ubc_upl_unmap(upl);
8046	if (kerr != KERN_SUCCESS) {
8047		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
8048				(int)kerr);
8049		err = EIO;
8050		goto mst_err;
8051	}
8052	/*
8053	 * We need the write to be synchronous so we do not leave the metadata
8054	 * with the fixups applied for too long.
8055	 *
8056	 * We also need to set the no commit flag so we can still recover from
8057	 * errors by removing the fixups.
8058	 */
8059	flags |= UPL_IOSYNC | UPL_NOCOMMIT;
8060	/*
8061	 * On success the fixups will have been removed by the
8062	 * ntfs_cluster_iodone() callback.
8063	 */
8064	err = cluster_pageout_ext(ni->vn, upl, upl_ofs, attr_ofs, size,
8065			attr_size, flags, ntfs_cluster_iodone, NULL);
8066	if (!err) {
8067		if (do_commit) {
8068			/* Commit the page range we wrote out. */
8069			ubc_upl_commit_range(upl, upl_ofs, size,
8070					UPL_COMMIT_FREE_ON_EMPTY |
8071					UPL_COMMIT_CLEAR_DIRTY);
8072		}
8073		ntfs_debug("Done.");
8074		return err;
8075	}
8076	ntfs_error(vol->mp, "Failed (cluster_pageout_ext() returned error "
8077			"%d).", err);
8078	/*
8079	 * We may have some records left with applied fixups thus remove them
8080	 * again.  It does not matter if it is done twice as this is an error
8081	 * code path and the only side effect is a little slow down.
8082	 */
8083	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
8084	if (kerr != KERN_SUCCESS) {
8085		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d), cannot "
8086				"remove mst fixups.  Unmount and run chkdsk.",
8087				(int)kerr);
8088		NVolSetErrors(vol);
8089		goto err;
8090	}
8091mst_err:
8092	/* Remove the applied fixups, unmap the page list and abort. */
8093	while (i > 0) {
8094		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + (--i << rec_shift));
8095		if (__ntfs_is_magic(rec->magic, magic))
8096			ntfs_mst_fixup_post_write(rec);
8097	}
8098	kerr = ubc_upl_unmap(upl);
8099	if (kerr != KERN_SUCCESS)
8100		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
8101				(int)kerr);
8102err:
8103	if (do_commit)
8104		ubc_upl_abort_range(upl, upl_ofs, size,
8105				UPL_ABORT_FREE_ON_EMPTY);
8106	return err;
8107}
8108
8109/**
8110 * ntfs_vnop_pageout - write a range of pages to storage
8111 * @a:		arguments to pageout function
8112 *
8113 * @a contains:
8114 *	vnode_t a_vp;		vnode whose data to write from the page range
8115 *	upl_t a_pl;		page list describing the source page range
8116 *	upl_offset_t a_pl_offset; byte offset into page list at which to start
8117 *	off_t a_f_offset;	byte offset in the vnode at which to start
8118 *	size_t a_size;		number of bytes to write to the vnode
8119 *	int a_flags;		flags further describing the pageout request
8120 *	vfs_context_t a_context;
8121 *
8122 * If UPL_NESTED_PAGEOUT is set in the flags (a->a_flags) we are called from
8123 * cluster_io() which is in turn called from cluster_write() which is in turn
8124 * called from ntfs_vnop_write() which means we are already holding the inode
8125 * lock (@ni->lock).  Alternatively cluster_io() can be called from
8126 * cluster_push() which can be called from various places in NTFS.
8127 *
 * Write @a->a_size bytes to the vnode @a->a_vp, starting at byte offset
8129 * @a->a_f_offset into the vnode, from the range of pages specified by the page
8130 * list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list.
8131 *
8132 * The flags in @a->a_flags further describe the pageout request.  The
8133 * following pageout flags are currently defined in OS X kernel:
8134 *	UPL_IOSYNC	- Perform synchronous i/o.
8135 *	UPL_NOCOMMIT	- Do not commit/abort the page range.
8136 *	UPL_KEEPCACHED	- Data is already cached in memory, keep it cached.
8137 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
8138 *
8139 * For encrypted attributes we abort for now as we do not support them yet.
8140 *
8141 * For non-resident, non-compressed attributes we use cluster_pageout_ext()
8142 * which deals with both normal and multi sector transfer protected attributes.
8143 *
8144 * In the case of multi sector transfer protected attributes we apply the
8145 * fixups and then submit the i/o synchronously by setting the UPL_IOSYNC flag.
8146 *
8147 * For resident attributes and non-resident, compressed attributes we write the
8148 * data ourselves by mapping the page list, and in the resident case, mapping
8149 * the mft record, looking up the attribute in it, and copying the data to the
8150 * mapped attribute from the page list, then unmapping the mft record, whilst
8151 * for non-resident, compressed attributes, we get the raw inode and use it
8152 * with ntfs_write_compressed() to compress and write the data from our mapped
8153 * page list.  We then unmap the page list and finally, if UPL_NOCOMMIT is not
8154 * specified, we commit (success) or abort (error) the page range.
8155 *
8156 * Return 0 on success and errno on error.
8157 *
8158 * Note the pages in the page list are marked busy on entry and the busy bit is
8159 * cleared when we commit the page range.  Thus it is perfectly safe for us to
8160 * apply the mst fixups and write out the data which will then also take away
8161 * the fixups again before committing the page range.
8162 *
8163 * Adapted from cluster_pageout_ext().
8164 */
8165static int ntfs_vnop_pageout(struct vnop_pageout_args *a)
8166{
8167	s64 attr_ofs, attr_size, alloc_size, bytes;
8168	ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
8169	upl_t upl = a->a_pl;
8170	ntfs_volume *vol;
8171	u8 *kaddr;
8172	upl_offset_t upl_ofs = a->a_pl_offset;
8173	kern_return_t kerr;
8174	unsigned to_write, size = a->a_size;
8175	int err, flags = a->a_flags;
8176	lck_rw_type_t lock_type = LCK_RW_TYPE_SHARED;
8177	BOOL locked = FALSE;
8178
8179	if (!ni) {
8180		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
8181		if (!(flags & UPL_NOCOMMIT) && upl)
8182			ubc_upl_abort_range(upl, upl_ofs, size,
8183					UPL_ABORT_FREE_ON_EMPTY);
8184		return EINVAL;
8185	}
8186	vol = ni->vol;
8187	attr_ofs = a->a_f_offset;
8188	base_ni = ni;
8189	if (NInoAttr(ni))
8190		base_ni = ni->base_ni;
8191	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%x, "
8192			"pageout flags 0x%x, page list offset 0x%llx.",
8193			(unsigned long long)ni->mft_no,
8194			(unsigned long long)attr_ofs, size, flags,
8195			(unsigned long long)upl_ofs);
8196	/*
8197	 * If the caller did not specify any i/o, then we are done.  We cannot
8198	 * issue an abort because we do not have a upl or we do not know its
8199	 * size.
8200	 */
8201	if (!upl || size <= 0) {
8202		ntfs_error(vol->mp, "NULL page list passed in or request size "
8203				"is below zero (error EINVAL).");
8204		return EINVAL;
8205	}
8206	if (S_ISDIR(ni->mode)) {
8207		ntfs_error(vol->mp, "Called for directory vnode.");
8208		err = EISDIR;
8209		goto err;
8210	}
8211	if (NVolReadOnly(vol)) {
8212		err = EROFS;
8213		goto err;
8214	}
8215	/*
8216	 * Need to clip i/o at maximum file size of 2^63-1 bytes in case
8217	 * someone creates a sparse file and is playing silly with seek + write
8218	 * note we only need to check for this for sparse files as non-sparse
8219	 * files can never reach 2^63-1 because that is also the maximum space
8220	 * on the volume thus the write would simply get an ENOSPC when the
8221	 * volume is full.
8222	 */
8223	if (NInoSparse(ni) && (u64)attr_ofs + size > NTFS_MAX_ATTRIBUTE_SIZE) {
8224		err = EFBIG;
8225		goto err;
8226	}
8227#if 1	// TODO: Remove this when sparse support is done...
8228	if (NInoSparse(ni)) {
8229		err = ENOTSUP;
8230		goto err;
8231	}
8232#endif
8233	/*
8234	 * Protect against changes in initialized_size and thus against
8235	 * truncation also but only if the VFS is not calling back into the
8236	 * NTFS driver after the NTFS driver called it in which case we are
8237	 * already holding the lock.
8238	 *
8239	 * There is a complication in that the UPL is already created by the
8240	 * caller thus us taking the lock here is a case of lock reversal wrt
8241	 * the UPL keeping the pages locked for exclusive access thus we can
8242	 * deadlock with a concurrent file create for example when it holds the
8243	 * ntfs inode lock @ni->lock for exclusive access on the index vnode of
8244	 * the parent directory and then calls ntfs_page_map() to map a page
8245	 * from the index as we already hold the same UPL that ntfs_page_map()
8246	 * will try to get thus if we go to sleep on the ntfs inode lock that
8247	 * is held exclusive by the create code path we would now deadlock.
8248	 *
8249	 * To avoid the deadlock, we do a try-lock for the ntfs inode lock and
8250	 * if that fails we simply abort the pages returning them to the VM
8251	 * without modification thus they should remain dirty and they should
8252	 * be paged out at a later point in time.
8253	 *
8254	 * We then return ENXIO to indicate that this is a temporary failure to
8255	 * the caller.
8256	 *
8257	 * FIXME: There is a complication and that is that we really need to
	 * hold the inode lock for writing if we are writing to a hole and/or
8259	 * writing past the initialized size as we would then be modifying the
8260	 * initialized_size.  But if UPL_NESTED_PAGEOUT is set we have no idea
8261	 * whether the caller is holding the lock for write or not and we
8262	 * cannot safely drop/retake the lock in any case...  For now we ignore
8263	 * the problem and just emit a warning in this case.
8264	 */
8265	if (!(flags & UPL_NESTED_PAGEOUT)) {
8266		if (NInoSparse(ni))
8267			lock_type = LCK_RW_TYPE_EXCLUSIVE;
8268		if (!lck_rw_try_lock(&ni->lock, lock_type)) {
8269			ntfs_debug("Failed to take ni->lock for %s for mft_no "
8270					"0x%llx, type 0x%x.  Aborting with "
8271					"ENXIO to avoid deadlock.",
8272					(lock_type == LCK_RW_TYPE_SHARED) ?
8273					"reading" : "writing",
8274					(unsigned long long)ni->mft_no,
8275					(unsigned)le32_to_cpu(ni->type));
8276			if (!(flags & UPL_NOCOMMIT))
8277				ubc_upl_abort_range(upl, upl_ofs, size,
8278						UPL_ABORT_FREE_ON_EMPTY);
8279			return ENXIO;
8280		}
8281		locked = TRUE;
8282	} else {
8283		if (NInoSparse(ni))
8284			ntfs_warning(vol->mp, "flags & UPL_NESTED_PAGEOUT && "
8285					"NINoSparse(ni), need inode lock "
8286					"exclusive but caller holds the lock "
8287					"so we do not know if it is exclusive "
8288					"or not.");
8289	}
8290	/* Do not allow messing with the inode once it has been deleted. */
8291	if (NInoDeleted(ni)) {
8292		/* Remove the inode from the name cache. */
8293		cache_purge(ni->vn);
8294		err = ENOENT;
8295		goto err;
8296	}
8297retry_pageout:
8298	/*
8299	 * TODO: This check may no longer be necessary now that we lock against
8300	 * changes in initialized size and thus truncation...  Revisit this
8301	 * issue when the write code has been written and remove the check if
8302	 * appropriate simply using ubc_getsize(vn); without the size_lock.
8303	 */
8304	lck_spin_lock(&ni->size_lock);
8305	attr_size = ubc_getsize(a->a_vp);
8306	if (attr_size > ni->data_size)
8307		attr_size = ni->data_size;
8308	/*
8309	 * Cannot pageout to a negative offset or if we are starting beyond the
8310	 * end of the attribute or if the attribute offset is not page aligned
8311	 * or the size requested is not a multiple of PAGE_SIZE.
8312	 */
8313	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
8314			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
8315		lck_spin_unlock(&ni->size_lock);
8316		err = EINVAL;
8317		goto err;
8318	}
8319// TODO: HERE:
8320	// FIXME: For now abort writes beyond initialized size...
8321	// TODO: This causes a problem and that is in ntfs_vnop_write() we only
8322	// update the initialized size after calling cluster_write() which
8323	// means we cannot zero up to the initialized size here or we could
8324	// trample over data that has just been written out.  Also this causes
8325	// our check here to trigger even though we are not really outside the
8326	// initialized size at all and in fact this page out may be part of the
8327	// write itself so it has to succeed.  But on the other hand if this is
8328	// a genuine mmap()-based write we do need to do the zeroing.  We need
8329	// to somehow be able to tell the difference between the two...
8330	// If the initialized size equals attr_ofs then we can safely perform
8331	// the write and then update the initialized size to attr_ofs + size
8332	// but need to be careful to update the data size appropriately and
8333	// also need to make sure not to exceed the end of the write otherwise
8334	// we would cause a file extension here when we should not do so.  In
8335	// fact if this is not part of an extending write then we should not
8336	// modify the data size and only the initialized size instead.
8337	if (attr_ofs + size > ni->initialized_size && ni->initialized_size !=
8338			ni->data_size) {
8339		lck_spin_unlock(&ni->size_lock);
8340		ntfs_error(vol->mp, "Writing beyond the initialized size of "
8341				"an attribute is not implemented yet.");
8342		err = ENOTSUP;
8343		goto err;
8344	}
8345	alloc_size = ni->allocated_size;
8346	lck_spin_unlock(&ni->size_lock);
8347	/*
8348	 * If this is a sparse attribute we need to fill any holes overlapping
8349	 * the write.  We can skip resident attributes as they cannot have
8350	 * sparse regions.
8351	 *
8352	 * As allocated size goes in units of clusters we need to round down
8353	 * the start offset to the nearest cluster boundary and we need to
8354	 * round up the end offset to the next cluster boundary.
8355	 */
8356	if (NInoSparse(ni) && NInoNonResident(ni) && ni->type == AT_DATA) {
8357		s64 aligned_end, new_end;
8358
8359		aligned_end = (attr_ofs + size + vol->cluster_size_mask) &
8360				~vol->cluster_size_mask;
8361		/*
8362		 * Only need to instantiate holes up to the allocated size
8363		 * itself.  Everything else would be an extension which is not
8364		 * allowed from VNOP_PAGEOUT().
8365		 */
8366		if (aligned_end > alloc_size)
8367			aligned_end = alloc_size;
8368		err = ntfs_attr_instantiate_holes(ni,
8369				attr_ofs & ~vol->cluster_size_mask,
8370				aligned_end, &new_end, TRUE);
8371		if (err) {
8372			ntfs_error(vol->mp, "Cannot perform pageout of mft_no "
8373					"0x%llx because instantiation of "
8374					"sparse regions failed (error %d).",
8375					(unsigned long long)ni->mft_no, err);
8376			goto err;
8377		}
8378		/* The instantiation may not be partial. */
8379		if (new_end < aligned_end)
8380			panic("%s(): new_end < aligned_end\n", __FUNCTION__);
8381	}
8382	/*
8383	 * Only $DATA attributes can be encrypted/compressed.  Index root can
8384	 * have the flags set but this means to create compressed/encrypted
8385	 * files, not that the attribute is compressed/encrypted.  Note we need
8386	 * to check for AT_INDEX_ALLOCATION since this is the type of directory
8387	 * index inodes.
8388	 */
8389	if (ni->type != AT_INDEX_ALLOCATION) {
8390		/* TODO: Deny access to encrypted attributes, just like NT4. */
8391		if (NInoEncrypted(ni)) {
8392			if (ni->type != AT_DATA)
8393				panic("%s(): Encrypted non-data attribute.\n",
8394						__FUNCTION__);
8395			ntfs_warning(vol->mp, "Denying write to encrypted "
8396					"attribute (EACCES).");
8397			err = EACCES;
8398			goto err;
8399		}
8400		/* Compressed data streams need special handling. */
8401		if (NInoNonResident(ni) && NInoCompressed(ni) && !NInoRaw(ni)) {
8402			if (ni->type != AT_DATA)
8403				panic("%s(): Compressed non-data attribute.\n",
8404						__FUNCTION__);
8405			goto compressed;
8406		}
8407	}
8408	/* NInoNonResident() == NInoIndexAllocPresent() */
8409	if (NInoNonResident(ni)) {
8410		if (NInoMstProtected(ni))
8411			err = ntfs_mst_pageout(ni, upl, upl_ofs, size,
8412					attr_ofs, attr_size, flags);
8413		else {
8414			err = cluster_pageout_ext(a->a_vp, upl, upl_ofs,
8415					attr_ofs, size, attr_size, flags, NULL,
8416					NULL);
8417			if (!err)
8418				ntfs_debug("Done (cluster_pageout_ext()).");
8419			else
8420				ntfs_error(vol->mp, "Failed "
8421						"(cluster_pageout_ext(), "
8422						"error %d).", err);
8423		}
8424		goto done;
8425	}
8426compressed:
8427	/* The attribute is resident and/or compressed. */
8428	to_write = size;
8429	bytes = attr_size - attr_ofs;
8430	if (to_write > bytes)
8431		to_write = bytes;
8432	/*
8433	 * Calculate the number of bytes available in the attribute starting at
8434	 * offset @attr_ofs up to a maximum of the number of bytes to be
8435	 * written rounded up to a multiple of the system page size.
8436	 */
8437	bytes = (to_write + PAGE_MASK) & ~PAGE_MASK;
8438	/* Abort any pages outside the end of the attribute. */
8439	if (size > bytes && !(flags & UPL_NOCOMMIT)) {
8440		ubc_upl_abort_range(upl, upl_ofs + bytes, size - bytes,
8441				UPL_ABORT_FREE_ON_EMPTY);
8442		/* Update @size. */
8443		size = bytes;
8444	}
8445	/* To access the page list contents, we need to map the page list. */
8446	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
8447	if (kerr != KERN_SUCCESS) {
8448		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).",
8449				(int)kerr);
8450		err = EIO;
8451		goto err;
8452	}
8453	if (!NInoNonResident(ni)) {
8454		/*
8455		 * Write the data from the page list into the resident
8456		 * attribute in its mft record.
8457		 */
8458		err = ntfs_resident_attr_write(ni, kaddr + upl_ofs, to_write,
8459				attr_ofs);
8460		// TODO: If !err and synchronous i/o, write the mft record now.
8461		// This should probably happen in ntfs_resident_attr_write().
8462		if (err && err != EAGAIN)
8463			ntfs_error(vol->mp, "ntfs_resident_attr_write() "
8464					"failed (error %d).", err);
8465	} else if (NInoCompressed(ni)) {
8466		ntfs_error(vol->mp, "Writing to compressed files is not "
8467				"implemented yet, sorry.");
8468		err = ENOTSUP;
8469#if 0
8470		ntfs_inode *raw_ni;
8471		int ioflags;
8472
8473		/*
8474		 * Get the raw inode and lock it for writing to protect against
8475		 * concurrent readers and writers as the compressed data is
8476		 * invalid whilst a write is in progress.
8477		 */
8478		err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni);
8479		if (err)
8480			ntfs_error(vol->mp, "Failed to get raw inode (error "
8481					"%d).", err);
8482		else {
8483			if (!NInoRaw(raw_ni))
8484				panic("%s(): Requested raw inode but got "
8485						"non-raw one.\n", __FUNCTION__);
8486			ioflags = 0;
8487			if (vnode_isnocache(ni->vn) ||
8488					vnode_isnocache(raw_ni->vn))
8489				ioflags |= IO_NOCACHE;
8490			if (vnode_isnoreadahead(ni->vn) ||
8491					vnode_isnoreadahead(raw_ni->vn))
8492				ioflags |= IO_RAOFF;
8493			err = ntfs_write_compressed(ni, raw_ni, attr_ofs, size,
8494					kaddr + upl_ofs, NULL, ioflags);
8495			if (err)
8496				ntfs_error(vol->mp, "ntfs_write_compressed() "
8497						"failed (error %d).", err);
8498			lck_rw_unlock_exclusive(&raw_ni->lock);
8499			(void)vnode_put(raw_ni->vn);
8500		}
8501#endif
8502	} else {
8503		/*
8504		 * The attribute was converted to non-resident under our nose
8505		 * we need to retry the pageout.
8506		 *
8507		 * TODO: This may no longer be possible to happen now that we
8508		 * lock against changes in initialized size and thus
8509		 * truncation...  Revisit this issue when the write code has
8510		 * been finished and replace this with a panic().
8511		 */
8512		err = EAGAIN;
8513	}
8514	kerr = ubc_upl_unmap(upl);
8515	if (kerr != KERN_SUCCESS) {
8516		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
8517				(int)kerr);
8518		if (!err)
8519			err = EIO;
8520	}
8521	if (!err) {
8522		if (!(flags & UPL_NOCOMMIT)) {
8523			/* Commit the page range we wrote out. */
8524			ubc_upl_commit_range(upl, upl_ofs, size,
8525					UPL_COMMIT_FREE_ON_EMPTY);
8526		}
8527		// TODO: If we wrote anything at all we have to clear the
8528		// setuid and setgid bits as a precaution against tampering
8529		// (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()).
8530		ntfs_debug("Done (%s).", !NInoNonResident(ni) ?
8531				"ntfs_resident_attr_write()" :
8532				"ntfs_write_compressed()");
8533	} else /* if (err) */ {
8534		/*
8535		 * If the attribute was converted to non-resident under our
8536		 * nose, retry the pageout.
8537		 *
8538		 * TODO: This may no longer be possible to happen now that we
8539		 * lock against changes in initialized size and thus
8540		 * truncation...  Revisit this issue when the write code has
8541		 * been finished and remove the check and goto if appropriate.
8542		 */
8543		if (err == EAGAIN)
8544			goto retry_pageout;
8545err:
8546		if (!(flags & UPL_NOCOMMIT))
8547			ubc_upl_abort_range(upl, upl_ofs, size,
8548					UPL_ABORT_FREE_ON_EMPTY);
8549		ntfs_error(vol->mp, "Failed (error %d).", err);
8550	}
8551done:
8552	// TODO: If we wrote anything at all we have to clear the setuid and
8553	// setgid bits as a precaution against tampering (see
8554	// xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()).
8555	/*
8556	 * If this is not a directory or it is an encrypted directory, set the
8557	 * needs archiving bit except for the core system files.
8558	 */
8559	if (!err && (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni))) {
8560		BOOL need_set_archive_bit = TRUE;
8561		if (vol->major_ver > 1) {
8562			if (base_ni->mft_no <= FILE_Extend)
8563				need_set_archive_bit = FALSE;
8564		} else {
8565			if (base_ni->mft_no <= FILE_UpCase)
8566				need_set_archive_bit = FALSE;
8567		}
8568		if (need_set_archive_bit) {
8569			base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
8570			NInoSetDirtyFileAttributes(base_ni);
8571		}
8572	}
8573	/*
8574	 * Update the last_data_change_time (mtime) and last_mft_change_time
8575	 * (ctime) on the base ntfs inode @base_ni but not on the core system
8576	 * files.  However do set it on the root directory.
8577	 *
8578	 * Do not update the times on symbolic links.
8579	 */
8580	if (!err && !S_ISLNK(base_ni->mode)) {
8581		BOOL need_update_time = TRUE;
8582		if (vol->major_ver > 1) {
8583			if (base_ni->mft_no <= FILE_Extend &&
8584					base_ni != vol->root_ni)
8585				need_update_time = FALSE;
8586		} else {
8587			if (base_ni->mft_no <= FILE_UpCase &&
8588					base_ni != vol->root_ni)
8589				need_update_time = FALSE;
8590		}
8591		if (need_update_time) {
8592			base_ni->last_mft_change_time =
8593					base_ni->last_data_change_time =
8594					ntfs_utc_current_time();
8595			NInoSetDirtyTimes(base_ni);
8596		}
8597	}
8598	if (locked) {
8599		if (lock_type == LCK_RW_TYPE_SHARED)
8600			lck_rw_unlock_shared(&ni->lock);
8601		else
8602			lck_rw_unlock_exclusive(&ni->lock);
8603	}
8604	return err;
8605}
8606
8607/**
8608 * ntfs_vnop_searchfs -
8609 *
8610 */
8611static int ntfs_vnop_searchfs(struct vnop_searchfs_args *a)
8612{
8613	errno_t err;
8614
8615	ntfs_debug("Entering.");
8616	// TODO:
8617	err = err_searchfs(a);
8618	ntfs_debug("Done (error %d).", (int)err);
8619	return err;
8620}
8621
8622/**
8623 * ntfs_vnop_getxattr - get the data of an extended attribute of an ntfs inode
8624 * @a:		arguments to getxattr function
8625 *
8626 * @a contains:
8627 *	vnode_t a_vp;	vnode whose extended attribute to get
8628 *	char *a_name;	name of extented attribute to get in utf8
8629 *	uio_t a_uio;	destination in which to return the exteneded attribute
8630 *	size_t *a_size;	size of the extended attribute in bytes
8631 *	int a_options;	flags controlling how the attribute is obtained
8632 *	vfs_context_t a_context;
8633 *
8634 * Get the named stream with the name @a->a_name (we map named streams 1:1 with
8635 * extended attributes for NTFS as the NTFS native EAs are useless) contained
8636 * in the vnode @a->a_vp and return its data in the destination specified by
8637 * @a->a_uio.
8638 *
8639 * If there was not enough space to return the whole extended attribute in the
8640 * destination @a->a_uio we return error ERANGE.  The only exception to this is
8641 * the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME) for which we just
8642 * return up to uio_resid(@a->a_uio) bytes (or up to the end of the resource
8643 * fork if that is smaller).
8644 *
8645 * Note that uio_offset(@a->a_uio) must be zero except for the resource fork
8646 * where it can specify the offset into the resource fork at which to begin
8647 * returning the data.
8648 *
8649 * If @a->a_uio is NULL, do not return the data of the attribute and instead
8650 * return the current data size of the named stream in *@a->a_size.  Note that
8651 * when @a->a_uio is not NULL @a->a_size is ignored as the size of the named
8652 * stream is implicitly returned in the @a->a_uio and it can be obtained by
8653 * taking the original buffer size and subtracting uio_resid(@a->a_uio) from
8654 * it.
8655 *
8656 * The flags in @a->a_options control how the attribute is obtained.  The
8657 * following flags are currently defined in OS X kernel:
8658 *	XATTR_NOFOLLOW	- Do not follow symbolic links.
8659 *	XATTR_CREATE	- Set the value, fail if already exists (setxattr only).
8660 *	XATTR_REPLACE	- Set the value, fail if does not exist (setxattr only).
8661 *	XATTR_NOSECURITY- Bypass authorization checking.
8662 *	XATTR_NODEFAULT	- Bypass default extended attribute file ('._' file).
8663 *
8664 * Return 0 on success and errno on error.
8665 */
8666static int ntfs_vnop_getxattr(struct vnop_getxattr_args *a)
8667{
8668	s64 size;
8669	user_ssize_t start_count;
8670	off_t start_ofs;
8671	ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
8672	const char *name = a->a_name;
8673	uio_t uio = a->a_uio;
8674	ntfs_volume *vol;
8675	ntfschar *ntfs_name;
8676	size_t ntfs_name_size;
8677	signed ntfs_name_len;
8678	errno_t err;
8679	ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
8680
8681	if (!ni) {
8682		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
8683		return EINVAL;
8684	}
8685	vol = ni->vol;
8686	/* Check for invalid names. */
8687	if (!name || name[0] == '\0')
8688		return EINVAL;
8689	start_ofs = uio_offset(uio);
8690	start_count = uio_resid(uio);
8691	ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, "
8692			"offset 0x%llx, size 0x%llx, options 0x%x.",
8693			(unsigned long long)ni->mft_no, name, start_ofs,
8694			start_count, a->a_options);
8695	lck_rw_lock_shared(&ni->lock);
8696	/* Do not allow messing with the inode once it has been deleted. */
8697	if (NInoDeleted(ni)) {
8698		/* Remove the inode from the name cache. */
8699		cache_purge(ni->vn);
8700		ntfs_debug("Mft_no 0x%llx is deleted.",
8701				(unsigned long long)ni->mft_no);
8702		err = ENOENT;
8703		goto err;
8704	}
8705	/*
8706	 * Only regular files, directories, and symbolic links can have
8707	 * extended attributes.  (Specifically named streams cannot have them.)
8708	 *
8709	 * Thus the check is for attribute inodes as all base inodes are
8710	 * allowed.  Raw inodes are also attribute inodes so they are excluded
8711	 * automatically, too.
8712	 */
8713	if (NInoAttr(ni)) {
8714		ntfs_debug("Mft_no 0x%llx is an attribute inode.",
8715				(unsigned long long)ni->mft_no);
8716		err = EPERM;
8717		goto err;
8718	}
8719	/*
8720	 * First of all deal with requests for the Finder info as that is
8721	 * special because we cache it in the base ntfs inode @ni and we only
8722	 * want to return it if the Finder info is non-zero.  This is what HFS
8723	 * does, too.
8724	 *
8725	 * Thus we need to check the status of the cache in the ntfs inode
8726	 * first and if that it valid we can use it to check the content of the
8727	 * Finder info for being zero.  And if it is not valid then we need to
8728	 * read it into the cache in the ntfs inode and then we can check the
8729	 * Finder info in the cache for being zero.  In fact we do this the
8730	 * other way round, i.e. if the Finder info cache is not valid we read
8731	 * the Finder info into the cache first and then the cache is
8732	 * definitely valid thus we can check the Finder info for being
8733	 * non-zero and the Finder info data if so.
8734	 *
8735	 * A further complication is in the event of symbolic links where we do
8736	 * not return the type and creator and instead return zero for them as
8737	 * that is what HFS+ does, too.
8738	 *
8739	 * FIXME: This comparison is case sensitive.
8740	 */
8741	if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
8742		FINDER_INFO fi;
8743
8744		if (!NInoValidFinderInfo(ni)) {
8745			if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) {
8746				lck_rw_lock_exclusive(&ni->lock);
8747				if (NInoDeleted(ni)) {
8748					cache_purge(ni->vn);
8749					lck_rw_unlock_exclusive(&ni->lock);
8750					ntfs_debug("Mft_no 0x%llx is deleted.",
8751							(unsigned long long)
8752							ni->mft_no);
8753					return ENOENT;
8754				}
8755			}
8756			/*
8757			 * Load the AFP_AfpInfo stream and initialize the
8758			 * backup time and Finder info (if they are not already
8759			 * valid).
8760			 */
8761			err = ntfs_inode_afpinfo_read(ni);
8762			if (err) {
8763				ntfs_error(vol->mp, "Failed to obtain AfpInfo "
8764						"for mft_no 0x%llx (error %d).",
8765						(unsigned long long)ni->mft_no,
8766						err);
8767				lck_rw_unlock_exclusive(&ni->lock);
8768				return err;
8769			}
8770			lck_rw_lock_exclusive_to_shared(&ni->lock);
8771			if (!NInoValidFinderInfo(ni))
8772				panic("%s(): !NInoValidFinderInfo(ni)\n",
8773						__FUNCTION__);
8774		}
8775		/*
8776		 * Make a copy of the Finder info and mask out the hidden bit
8777		 * if this is the root directory and the type and creator if
8778		 * this is a symbolic link.
8779		 */
8780		memcpy(&fi, &ni->finder_info, sizeof(fi));
8781		if (ni == vol->root_ni)
8782			fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
8783		if (S_ISLNK(ni->mode)) {
8784			fi.type = 0;
8785			fi.creator = 0;
8786		}
8787		/* If the Finder info is zero, pretend it does not exist. */
8788		if (!bcmp(&fi, &ntfs_empty_finder_info,
8789				sizeof(ni->finder_info))) {
8790			ntfs_debug("Mft_no 0x%llx has zero Finder info, "
8791					"returning ENOATTR.",
8792					(unsigned long long)ni->mft_no);
8793			err = ENOATTR;
8794			goto err;
8795		}
8796		/* The Finder info is not zero, return it. */
8797		if (!uio) {
8798			*a->a_size = sizeof(FINDER_INFO);
8799			err = 0;
8800		} else if (start_ofs)
8801			err = EINVAL;
8802		else if (uio_resid(uio) < (user_ssize_t)sizeof(FINDER_INFO))
8803			err = ERANGE;
8804		else {
8805			err = uiomove((caddr_t)&fi, sizeof(fi), uio);
8806			if (err)
8807				ntfs_error(vol->mp, "uiomove() failed (error "
8808						"%d).", err);
8809		}
8810		goto err;
8811	}
8812	/*
8813	 * Now deal with requests for the resource fork as that is special
8814	 * because on one hand we need to translate its name from
8815	 * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
8816	 * the utf8 name @name to Unicode and on the other hand the offset
8817	 * @start_ofs may be non-zero and the read may be only from a partial
8818	 * region of the resource fork.
8819	 *
8820	 * FIXME: This comparison is case sensitive.
8821	 */
8822	if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
8823			sizeof(XATTR_RESOURCEFORK_NAME))) {
8824		ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
8825		ntfs_name_len = 12;
8826	} else {
8827		/*
8828		 * The request is not for the resource fork (nor for the Finder
8829		 * info).  This means that the offset @start_ofs must be zero.
8830		 */
8831		if (start_ofs) {
8832			err = EINVAL;
8833			goto err;
8834		}
8835		/* Convert the requested name from utf8 to Unicode. */
8836		ntfs_name = ntfs_name_buf;
8837		ntfs_name_size = sizeof(ntfs_name_buf);
8838		ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
8839				&ntfs_name, &ntfs_name_size);
8840		if (ntfs_name_len < 0) {
8841			err = -ntfs_name_len;
8842			if (err == ENAMETOOLONG)
8843				ntfs_debug("Failed (name is too long).");
8844			else
8845				ntfs_error(vol->mp, "Failed to convert name to "
8846						"Unicode (error %d).", err);
8847			goto err;
8848		}
8849		/*
8850		 * If this is one of the SFM named streams, skip it, as they
8851		 * contain effectively metadata information so should not be
8852		 * exposed directly.
8853		 */
8854		if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
8855			ntfs_debug("Not allowing access to protected SFM name "
8856					"(returning EINVAL).");
8857			err = EINVAL;
8858			goto err;
8859		}
8860	}
8861	/*
8862	 * We now have the name of the requested attribute in @ntfs_name and it
8863	 * is @ntfs_name_len characters long and we have verified that the
8864	 * start offset is zero (unless this is the resource fork in which case
8865	 * a non-zero start offset is fine).
8866	 *
8867	 * Start by getting the ntfs inode for the $DATA:@ntfs_name attribute.
8868	 */
8869	err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE,
8870			LCK_RW_TYPE_SHARED, &ani);
8871	if (err) {
8872		if (err == ENOENT)
8873			err = ENOATTR;
8874		else if (err != ENOATTR)
8875			ntfs_error(vol->mp, "Failed to get $DATA/%s attribute "
8876					"inode mft_no 0x%llx (error %d).", name,
8877					(unsigned long long)ni->mft_no, err);
8878		goto err;
8879	}
8880	/*
8881	 * TODO: This check may no longer be necessary now that we lock against
8882	 * changes in initialized size and thus truncation...  Revisit this
8883	 * issue when the write code has been written and remove the check if
8884	 * appropriate simply using ubc_getsize(ni->vn); without the size_lock.
8885	 */
8886	lck_spin_lock(&ani->size_lock);
8887	size = ubc_getsize(ani->vn);
8888	if (size > ani->data_size)
8889		size = ani->data_size;
8890	lck_spin_unlock(&ani->size_lock);
8891	if (!uio)
8892		*a->a_size = size;
8893	else if (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME &&
8894			start_count < size) {
8895		/* Partial reads are only allowed for the resource fork. */
8896		err = ERANGE;
8897	} else {
8898		/*
8899		 * Perform the actual read from the attribute inode.  We pass
8900		 * in IO_UNIT as we want an atomic i/o operation.
8901		 *
8902		 * FIXME: ntfs_read() currently ignores the IO_UNIT flag so we
8903		 * still have to test for partial reads.
8904		 */
8905		err = ntfs_read(ani, uio, IO_UNIT, TRUE);
8906		/*
8907		 * If the read was partial, reset @uio pretending that the read
8908		 * never happened.  This is because extended attribute i/o is
8909		 * meant to be atomic, i.e. either we get it all or we do not
8910		 * get anything.
8911		 *
8912		 * Note we also accept the case where uio_resid() has gone to
8913		 * zero as this covers the exception of the resource fork for
8914		 * which we do not need to return the whole resource fork in
8915		 * one go.
8916		 */
8917		if (uio_resid(uio) && start_count - uio_resid(uio) !=
8918				size - start_ofs) {
8919			/*
8920			 * FIXME: Should we be trying to continue a partial
8921			 * read in case we can complete it with multiple calls
8922			 * to ntfs_read()?  If we do that we could also drop
8923			 * the IO_UNIT flag above.
8924			 */
8925			if (!err) {
8926				ntfs_debug("ntfs_read() returned a partial "
8927						"read, pretending the read "
8928						"never happened.");
8929				err = EIO;
8930			}
8931			uio_setoffset(uio, start_ofs);
8932			uio_setresid(uio, start_count);
8933		}
8934	}
8935	lck_rw_unlock_shared(&ani->lock);
8936	(void)vnode_put(ani->vn);
8937err:
8938	lck_rw_unlock_shared(&ni->lock);
8939	ntfs_debug("Done (error %d).", err);
8940	return err;
8941}
8942
8943/**
8944 * ntfs_vnop_setxattr - set the data of an extended attribute of an ntfs inode
8945 * @a:		arguments to setxattr function
8946 *
8947 * @a contains:
8948 *	vnode_t a_vp;	vnode whose extended attribute to set
 *	char *a_name;	name of extended attribute to set in utf8
 *	uio_t a_uio;	source data to which to set the extended attribute
8951 *	int a_options;	flags controlling how the attribute is set
8952 *	vfs_context_t a_context;
8953 *
8954 * Get the named stream with the name @a->a_name (we map named streams 1:1 with
8955 * extended attributes for NTFS as the NTFS native EAs are useless) contained
8956 * in the vnode @a->a_vp and set its data to the source specified by @a->a_uio.
8957 *
8958 * If @a->a_options does not specify XATTR_CREATE nor XATTR_REPLACE the
8959 * attribute will be created if it does not exist already and if it exists
8960 * already the old value will be replaced with the new one, i.e. if the old
8961 * value does not have the same size as the new value the attribute is
8962 * truncated to the new size.
8963 *
8964 * If @a->a_options specifies XATTR_CREATE the call will fail if the attribute
8965 * already exists, i.e. the existing attribute will not be replaced.
8966 *
8967 * If @a->a_options specifies XATTR_REPLACE the call will fail if the attribute
8968 * does not exist, i.e. the new attribute will not be created.
8969 *
8970 * An exception is the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME)
8971 * for which we do not replace the existing attribute and instead we write over
8972 * the existing attribute starting at offset uio_offset(@a->a_uio) and writing
8973 * uio_resid(@a->a_uio) bytes.  Writing past the end of the resource fork will
8974 * cause the resource fork to be extended just like a regular file write would
8975 * do but a write to any existing part of the attribute will not cause the
8976 * attribute to be shrunk.
8977 *
 * Similar to other extended attributes, if @a->a_options specifies
8979 * XATTR_CREATE the call will fail if the resource fork already exists, i.e.
8980 * the write to the existing resource fork will be denied and if @a->a_options
8981 * specified XATTR_REPLACE the call will fail if the resource fork does not yet
8982 * exist, i.e. the new resource fork will not be created.
8983 *
8984 * Note that uio_offset(@a->a_uio) must be zero except for the resource fork
8985 * where it can specify the offset into the resource fork at which to begin
8986 * writing the data.
8987 *
8988 * The flags in @a->a_options control how the attribute is set.  The following
8989 * flags are currently defined in OS X kernel:
8990 *	XATTR_NOFOLLOW	- Do not follow symbolic links.
8991 *	XATTR_CREATE	- Set the value, fail if already exists (setxattr only).
8992 *	XATTR_REPLACE	- Set the value, fail if does not exist (setxattr only).
8993 *	XATTR_NOSECURITY- Bypass authorization checking.
8994 *	XATTR_NODEFAULT	- Bypass default extended attribute file ('._' file).
8995 *
8996 * Return 0 on success and errno on error.
8997 */
8998static int ntfs_vnop_setxattr(struct vnop_setxattr_args *a)
8999{
9000	s64 size;
9001	user_ssize_t start_count;
9002	off_t start_ofs;
9003	ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
9004	ntfs_volume *vol;
9005	const char *name = a->a_name;
9006	uio_t uio = a->a_uio;
9007	ntfschar *ntfs_name;
9008	size_t ntfs_name_size;
9009	signed ntfs_name_len;
9010	const int options = a->a_options;
9011	errno_t err;
9012	ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
9013
9014	if (!ni) {
9015		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
9016		return EINVAL;
9017	}
9018	vol = ni->vol;
9019	/* Check for invalid names. */
9020	if (!name || name[0] == '\0')
9021		return EINVAL;
9022	start_ofs = uio_offset(uio);
9023	start_count = uio_resid(uio);
9024	ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, "
9025			"offset 0x%llx, size 0x%llx, options 0x%x.",
9026			(unsigned long long)ni->mft_no, name, start_ofs,
9027			start_count, options);
9028	/*
9029	 * Access to extended attributes must be atomic which we ensure by
9030	 * locking the base ntfs inode for writing.
9031	 */
9032	lck_rw_lock_exclusive(&ni->lock);
9033	/* Do not allow messing with the inode once it has been deleted. */
9034	if (NInoDeleted(ni)) {
9035		/* Remove the inode from the name cache. */
9036		cache_purge(ni->vn);
9037		ntfs_debug("Mft_no 0x%llx is deleted.",
9038				(unsigned long long)ni->mft_no);
9039		err = ENOENT;
9040		goto err;
9041	}
9042	/*
9043	 * Only regular files, directories, and symbolic links can have
9044	 * extended attributes.  (Specifically named streams cannot have them.)
9045	 *
9046	 * Thus the check is for attribute inodes as all base inodes are
9047	 * allowed.  Raw inodes are also attribute inodes so they are excluded
9048	 * automatically, too.
9049	 */
9050	if (NInoAttr(ni)) {
9051		ntfs_debug("Mft_no 0x%llx is an attribute inode.",
9052				(unsigned long long)ni->mft_no);
9053		err = EPERM;
9054		goto err;
9055	}
9056	/*
9057	 * XATTR_CREATE and XATTR_REPLACE may not be specified at the same time
9058	 * or weird things would happen so test for and abort this case here.
9059	 */
9060	if ((options & (XATTR_CREATE | XATTR_REPLACE)) ==
9061			(XATTR_CREATE | XATTR_REPLACE)) {
9062		ntfs_debug("Either XATTR_CREATE or XATTR_REPLACE but not both "
9063				"may be specified.");
9064		err = EINVAL;
9065		goto err;
9066	}
9067	/*
9068	 * First of all deal with requests to set the Finder info as that is
9069	 * special because we cache it in the base ntfs inode @ni thus we need
9070	 * to copy the new Finder info into the cache and then write the
9071	 * changes out to the AFP_AfpInfo attribute (creating it if it did not
9072	 * exist before).
9073	 *
9074	 * The only exception to the above description is when the XATTR_CREATE
9075	 * or XATTR_REPLACE flags are set in @options in which case we need to
9076	 * know whether the Finder info extists already or not and thus if the
9077	 * Finder info cache is not valid we need to make it valid first and
9078	 * then we can check it against being zero to determine whether the
9079	 * Finder info exists already or not and then we know whether or not to
9080	 * proceed with setting the Finder info.
9081	 *
9082	 * FIXME: This comparison is case sensitive.
9083	 */
9084	if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
9085		FINDER_INFO fi;
9086
9087		if (start_count != sizeof(ni->finder_info)) {
9088			ntfs_debug("Number of bytes to write (%lld) does not "
9089					"equal Finder info size (%ld), "
9090					"returning ERANGE.",
9091					(unsigned long long)start_count,
9092					sizeof(ni->finder_info));
9093			err = ERANGE;
9094			goto err;
9095		}
9096		/*
9097		 * If @options does not specify XATTR_CREATE nor XATTR_REPLACE
9098		 * there is no need to bring the Finder info up-to-date before
9099		 * the write.
9100		 */
9101		if (options & (XATTR_CREATE | XATTR_REPLACE)) {
9102			if (!NInoValidFinderInfo(ni)) {
9103				/*
9104				 * Load the AFP_AfpInfo stream and initialize
9105				 * the backup time and Finder info (at least
9106				 * the Finder info is not yet valid).
9107				 */
9108				err = ntfs_inode_afpinfo_read(ni);
9109				if (err) {
9110					ntfs_error(vol->mp, "Failed to obtain "
9111							"AfpInfo for mft_no "
9112							"0x%llx (error %d).",
9113							(unsigned long long)
9114							ni->mft_no, err);
9115					goto err;
9116				}
9117				if (!NInoValidFinderInfo(ni))
9118					panic("%s(): !NInoValidFinderInfo(ni)"
9119							"\n", __FUNCTION__);
9120			}
9121			/*
9122			 * Make a copy of the Finder info and mask out the
9123			 * hidden bit if this is the root directory and the
9124			 * type and creator if this is a symbolic link.
9125			 */
9126			memcpy(&fi, &ni->finder_info, sizeof(fi));
9127			if (ni == vol->root_ni)
9128				fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
9129			if (S_ISLNK(ni->mode)) {
9130				fi.type = 0;
9131				fi.creator = 0;
9132			}
9133			if (bcmp(&ni->finder_info, &ntfs_empty_finder_info,
9134					sizeof(ni->finder_info))) {
9135				/*
9136				 * Finder info is non-zero, i.e. it exists, and
9137				 * XATTR_CREATE was specified.
9138				 */
9139				if (options & XATTR_CREATE) {
9140					ntfs_debug("Mft_no 0x%llx has "
9141							"non-zero Finder info "
9142							"and XATTR_CREATE was "
9143							"specified, returning "
9144							"EEXIST.",
9145							(unsigned long long)
9146							ni->mft_no);
9147					err = EEXIST;
9148					goto err;
9149				}
9150			} else {
9151				/*
9152				 * Finder info is zero, i.e. it does not exist,
9153				 * and XATTR_REPLACE was specified.
9154				 */
9155				if (options & XATTR_REPLACE) {
9156					ntfs_debug("Mft_no 0x%llx has zero "
9157							"Finder info and "
9158							"XATTR_REPLACE was "
9159							"specified, returning "
9160							"ENOATTR.",
9161							(unsigned long long)
9162							ni->mft_no);
9163					err = ENOATTR;
9164					goto err;
9165				}
9166			}
9167		}
9168		/* Copy the new Finder info value to our buffer. */
9169		err = uiomove((caddr_t)&fi, sizeof(fi), uio);
9170		if (!err) {
9171			/*
9172			 * Set the Finder info to the new value after masking
9173			 * out the hidden bit if this is the root directory and
9174			 * enforcing the type and creator if this is a symbolic
9175			 * link to be our private values for symbolic links.
9176			 */
9177			if (ni == vol->root_ni)
9178				fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
9179			if (S_ISLNK(ni->mode)) {
9180				fi.type = FINDER_TYPE_SYMBOLIC_LINK;
9181				fi.creator = FINDER_CREATOR_SYMBOLIC_LINK;
9182			}
9183			memcpy((u8*)&ni->finder_info, (u8*)&fi, sizeof(fi));
9184			NInoSetValidFinderInfo(ni);
9185			NInoSetDirtyFinderInfo(ni);
9186			/*
9187			 * If the file is not hidden but the Finder info hidden
9188			 * bit is being set, we need to cause the file to be
9189			 * hidden, i.e. we need to set the FILE_ATTR_HIDDEN bit
9190			 * in the file_attributes of the $STANDARD_INFORMATION
9191			 * attribute.
9192			 */
9193			if (fi.attrs & FINDER_ATTR_IS_HIDDEN &&
9194					!(ni->file_attributes &
9195					FILE_ATTR_HIDDEN)) {
9196				ni->file_attributes |= FILE_ATTR_HIDDEN;
9197				NInoSetDirtyFileAttributes(ni);
9198			}
9199			/*
9200			 * Updating the Finder info causes both the
9201			 * last_data_change_time (mtime) and
9202			 * last_mft_change_time (ctime) to be updated.
9203			 */
9204			ni->last_mft_change_time = ni->last_data_change_time =
9205					ntfs_utc_current_time();
9206			NInoSetDirtyTimes(ni);
9207			/*
9208			 * Now write (if needed creating) the AFP_AfpInfo
9209			 * attribute with the specified Finder Info.
9210			 */
9211			err = ntfs_inode_afpinfo_write(ni);
9212			if (err)
9213				ntfs_error(vol->mp, "Failed to write/create "
9214						"AFP_AfpInfo attribute in "
9215						"inode 0x%llx (error %d).",
9216						(unsigned long long)ni->mft_no,
9217						err);
9218		} else
9219			ntfs_error(vol->mp, "uiomove() failed (error %d).",
9220					err);
9221		goto err;
9222	}
9223	/*
9224	 * Now deal with requests to write to the resource fork as that is
9225	 * special because on one hand we need to translate its name from
9226	 * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
9227	 * the utf8 name @name to Unicode and on the other hand the offset
9228	 * @start_ofs may be non-zero, the write may be only to a partial
9229	 * region of the resource fork, and the write may not shrink the
9230	 * resource fork though it may extend it.
9231	 *
9232	 * FIXME: This comparison is case sensitive.
9233	 */
9234	if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
9235			sizeof(XATTR_RESOURCEFORK_NAME))) {
9236		ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
9237		ntfs_name_len = 12;
9238	} else {
9239		/*
9240		 * The request is not for the resource fork (nor for the Finder
9241		 * info).  This means that the offset @start_ofs must be zero.
9242		 */
9243		if (start_ofs) {
9244			err = EINVAL;
9245			goto err;
9246		}
9247		/* Convert the requested name from utf8 to Unicode. */
9248		ntfs_name = ntfs_name_buf;
9249		ntfs_name_size = sizeof(ntfs_name_buf);
9250		ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
9251				&ntfs_name, &ntfs_name_size);
9252		if (ntfs_name_len < 0) {
9253			err = -ntfs_name_len;
9254			if (err == ENAMETOOLONG)
9255				ntfs_debug("Failed (name is too long).");
9256			else
9257				ntfs_error(vol->mp, "Failed to convert name to "
9258						"Unicode (error %d).", err);
9259			goto err;
9260		}
9261		/*
9262		 * If this is one of the SFM named streams, skip it, as they
9263		 * contain effectively metadata information so should not be
9264		 * exposed directly.
9265		 */
9266		if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
9267			ntfs_debug("Not allowing access to protected SFM name "
9268					"(returning EINVAL).");
9269			err = EINVAL;
9270			goto err;
9271		}
9272	}
9273	/*
9274	 * We now have the name of the requested attribute in @ntfs_name and it
9275	 * is @ntfs_name_len characters long and we have verified that the
9276	 * start offset is zero (unless this is the resource fork in which case
9277	 * a non-zero start offset is fine).
9278	 *
9279	 * Get the ntfs attribute inode of the $DATA:@ntfs_name attribute
9280	 * (unless XATTR_CREATE is specified in @options) and if it does not
9281	 * exist create it first (unless XATTR_REPLACE is specified in
9282	 * @options).
9283	 */
9284	err = ntfs_attr_inode_get_or_create(ni, AT_DATA, ntfs_name,
9285			ntfs_name_len, FALSE, FALSE, options,
9286			LCK_RW_TYPE_EXCLUSIVE, &ani);
9287	if (err) {
9288		if (err == ENOENT)
9289			err = ENOATTR;
9290		else if (err != ENOATTR && err != EEXIST)
9291			ntfs_error(vol->mp, "Failed to get or create $DATA/%s "
9292					"attribute inode mft_no 0x%llx (error "
9293					"%d).", name,
9294					(unsigned long long)ni->mft_no, err);
9295		goto err;
9296	}
9297	/*
9298	 * TODO: This check may no longer be necessary now that we lock against
9299	 * changes in initialized size and thus truncation...  Revisit this
9300	 * issue when the write code has been written and remove the check if
9301	 * appropriate simply using ubc_getsize(ni->vn); without the size_lock.
9302	 */
9303	lck_spin_lock(&ani->size_lock);
9304	size = ubc_getsize(ani->vn);
9305	if (size > ani->data_size)
9306		size = ani->data_size;
9307	lck_spin_unlock(&ani->size_lock);
9308	/*
9309	 * Perform the actual write to the attribute inode.  We pass in IO_UNIT
9310	 * as we want an atomic i/o operation.
9311	 *
9312	 * FIXME: ntfs_write() does not always honour the IO_UNIT flag so we
9313	 * still have to test for partial writes.
9314	 */
9315	err = ntfs_write(ani, uio, IO_UNIT, TRUE);
9316	/*
9317	 * If the write was successful, need to shrink the attribute if the new
9318	 * size is smaller than the old size.
9319	 *
9320	 * If the write was partial or failed, reset @uio pretending that the
9321	 * write never happened.  This is because extended attribute i/o is
9322	 * meant to be atomic, i.e. either we get it all or we do not get
9323	 * anything.
9324	 *
9325	 * In the partial/failed case, if @options specifies XATTR_REPLACE we
9326	 * know the extended attribute existed already thus we truncate it to
9327	 * zero size to simulate that the old value has been replaced.  And if
9328	 * @options specifies XATTR_CREATE we know we created the extended
9329	 * attribute thus we delete it again.  And if @options does not specify
9330	 * XATTR_REPLACE nor XATTR_CREATE then we do not know whether we
9331	 * created it or not and in this case we assume the caller does not
9332	 * care so we delete it to conserve disk space.
9333	 */
9334	if (!err && !uio_resid(uio)) {
9335		/*
9336		 * Shrink the attribute if the new value is smaller than the
9337		 * old value.  We do not do this for the resource fork as that
9338		 * is a special case.
9339		 */
9340		if (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME) {
9341			if (size > start_count) {
9342				err = ntfs_attr_resize(ani, start_count, 0,
9343						NULL);
9344				if (err) {
9345					ntfs_error(vol->mp, "Failed to resize "
9346							"extended attribute "
9347							"to its new size "
9348							"(error %d).", err);
9349					goto undo_err;
9350				}
9351			}
9352		}
9353	} else {
9354		/*
9355		 * FIXME: Should we be trying to continue a partial write in
9356		 * case we can complete it with multiple calls to ntfs_write()?
9357		 */
9358		if (!err) {
9359			ntfs_debug("ntfs_write() returned a partial write, "
9360					"pretending the write never happened "
9361					"and removing or truncating to zero "
9362					"size the old attribute value.");
9363			err = EIO;
9364		}
9365undo_err:
9366		uio_setoffset(uio, start_ofs);
9367		uio_setresid(uio, start_count);
9368		if (options & XATTR_REPLACE) {
9369			errno_t err2;
9370
9371			err2 = ntfs_attr_resize(ani, 0, 0, NULL);
9372			if (err2) {
9373				ntfs_error(vol->mp, "Failed to truncate "
9374						"extended attribute to zero "
9375						"size in error code path "
9376						"(error %d), attempting to "
9377						"delete it instead.", err2);
9378				goto rm_err;
9379			}
9380		} else {
9381rm_err:
9382			/*
9383			 * Unlink the named stream.  The last close will cause
9384			 * the VFS to call ntfs_vnop_inactive() which will do
9385			 * the actual removal.
9386			 */
9387			ani->link_count = 0;
9388			/*
9389			 * Update the last_mft_change_time (ctime) in the inode
9390			 * as named stream/extended attribute semantics expect
9391			 * on OS X.
9392			 */
9393			ni->last_mft_change_time = ntfs_utc_current_time();
9394			NInoSetDirtyTimes(ni);
9395			/*
9396			 * If this is not a directory or it is an encrypted
9397			 * directory, set the needs archiving bit except for
9398			 * the core system files.
9399			 */
9400			if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
9401				BOOL need_set_archive_bit = TRUE;
9402				if (ni->vol->major_ver >= 2) {
9403					if (ni->mft_no <= FILE_Extend)
9404						need_set_archive_bit = FALSE;
9405				} else {
9406					if (ni->mft_no <= FILE_UpCase)
9407						need_set_archive_bit = FALSE;
9408				}
9409				if (need_set_archive_bit) {
9410					ni->file_attributes |=
9411							FILE_ATTR_ARCHIVE;
9412					NInoSetDirtyFileAttributes(ni);
9413				}
9414			}
9415		}
9416	}
9417	lck_rw_unlock_exclusive(&ani->lock);
9418	(void)vnode_put(ani->vn);
9419err:
9420	lck_rw_unlock_exclusive(&ni->lock);
9421	ntfs_debug("Done (error %d).", err);
9422	return err;
9423}
9424
9425/**
9426 * ntfs_vnop_removexattr - remove an extended attribute from an ntfs inode
9427 * @a:		arguments to removexattr function
9428 *
9429 * @a contains:
9430 *	vnode_t a_vp;	vnode whose extended attribute to remove
9431 *	char *a_name;	name of extented attribute to remove in utf8
9432 *	int a_options;	flags controlling how the attribute is removed
9433 *	vfs_context_t a_context;
9434 *
9435 * Remove the named stream with the name @a->a_name (we map named streams 1:1
9436 * with extended attributes for NTFS as the NTFS native EAs are useless) from
9437 * the vnode @a->a_vp.
9438 *
9439 * The flags in @a->a_options control how the attribute is set.  The following
9440 * flags are currently defined in OS X kernel:
9441 *	XATTR_NOFOLLOW	- Do not follow symbolic links.
9442 *	XATTR_CREATE	- Set the value, fail if already exists (setxattr only).
9443 *	XATTR_REPLACE	- Set the value, fail if does not exist (setxattr only).
9444 *	XATTR_NOSECURITY- Bypass authorization checking.
9445 *	XATTR_NODEFAULT	- Bypass default extended attribute file ('._' file).
9446 *
9447 * Return 0 on success and errno on error.
9448 */
9449static int ntfs_vnop_removexattr(struct vnop_removexattr_args *a)
9450{
9451	ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
9452	const char *name = a->a_name;
9453	ntfs_volume *vol;
9454	ntfschar *ntfs_name;
9455	size_t ntfs_name_size;
9456	signed ntfs_name_len;
9457	errno_t err;
9458	ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
9459
9460	if (!ni) {
9461		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
9462		return EINVAL;
9463	}
9464	vol = ni->vol;
9465	/* Check for invalid names. */
9466	if (!name || name[0] == '\0')
9467		return EINVAL;
9468	ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, "
9469			"options 0x%x.", (unsigned long long)ni->mft_no, name,
9470			a->a_options);
9471	/*
9472	 * Access to extended attributes must be atomic which we ensure by
9473	 * locking the base ntfs inode for writing.
9474	 */
9475	lck_rw_lock_exclusive(&ni->lock);
9476	/* Do not allow messing with the inode once it has been deleted. */
9477	if (NInoDeleted(ni)) {
9478		/* Remove the inode from the name cache. */
9479		cache_purge(ni->vn);
9480		ntfs_debug("Mft_no 0x%llx is deleted.",
9481				(unsigned long long)ni->mft_no);
9482		err = ENOENT;
9483		goto err;
9484	}
9485	/*
9486	 * Only regular files, directories, and symbolic links can have
9487	 * extended attributes.  (Specifically named streams cannot have them.)
9488	 *
9489	 * Thus the check is for attribute inodes as all base inodes are
9490	 * allowed.  Raw inodes are also attribute inodes so they are excluded
9491	 * automatically, too.
9492	 */
9493	if (NInoAttr(ni)) {
9494		ntfs_debug("Mft_no 0x%llx is an attribute inode.",
9495				(unsigned long long)ni->mft_no);
9496		err = EPERM;
9497		goto err;
9498	}
9499	/*
9500	 * First of all deal with requests to remove the Finder info as that is
9501	 * special because we cache it in the base ntfs inode @ni thus we need
9502	 * to zero the cached Finder info and then write the changes out to the
9503	 * AFP_AfpInfo attribute (deleting it if it is no longer needed).  This
9504	 * is sufficient as a zero Finder info is treated the same as
9505	 * non-existent Finder info and vice versa.
9506	 *
9507	 * Note if the Finder info is already zero it does not exist thus we
9508	 * need to return ENOATTR instead thus we may need to load the Finder
9509	 * info first to find out whether it is zero or not.
9510	 *
9511	 * FIXME: This comparison is case sensitive.
9512	 */
9513	if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
9514		FINDER_INFO fi;
9515
9516		if (!NInoValidFinderInfo(ni)) {
9517			/*
9518			 * Load the AFP_AfpInfo stream and initialize the
9519			 * backup time and Finder info (at least the Finder
9520			 * info is not yet valid).
9521			 */
9522			err = ntfs_inode_afpinfo_read(ni);
9523			if (err) {
9524				ntfs_error(vol->mp, "Failed to obtain AfpInfo "
9525						"for mft_no 0x%llx (error %d).",
9526						(unsigned long long)ni->mft_no,
9527						err);
9528				goto err;
9529			}
9530			if (!NInoValidFinderInfo(ni))
9531				panic("%s(): !NInoValidFinderInfo(ni)\n",
9532						__FUNCTION__);
9533		}
9534		/*
9535		 * Make a copy of the Finder info and mask out the hidden bit
9536		 * if this is the root directory and the type and creator if
9537		 * this is a symbolic link.
9538		 */
9539		memcpy(&fi, &ni->finder_info, sizeof(fi));
9540		if (ni == vol->root_ni)
9541			fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
9542		if (S_ISLNK(ni->mode)) {
9543			fi.type = 0;
9544			fi.creator = 0;
9545		}
9546		if (!bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) {
9547			/* Finder info is zero, i.e. it does not exist. */
9548			ntfs_debug("Mft_no 0x%llx has zero Finder info, "
9549					"returning ENOATTR.",
9550					(unsigned long long)ni->mft_no);
9551			err = ENOATTR;
9552			goto err;
9553		}
9554		/* Zero the Finder info. */
9555		bzero(&ni->finder_info, sizeof(ni->finder_info));
9556		/*
9557		 * If the file is hidden, we need to reflect this fact in the
9558		 * Finder info, too.
9559		 */
9560		if (ni->file_attributes & FILE_ATTR_HIDDEN)
9561			ni->finder_info.attrs |= FINDER_ATTR_IS_HIDDEN;
9562		/*
9563		 * Also, enforce the type and creator if this is a symbolic
9564		 * link to be our private values for symbolic links.  This in
9565		 * fact causes the Finder info not to be deleted on disk and we
9566		 * cannot allow that to happen as we would then no longer know
9567		 * that this is a symbolic link.
9568		 */
9569		if (S_ISLNK(ni->mode)) {
9570			ni->finder_info.type = FINDER_TYPE_SYMBOLIC_LINK;
9571			ni->finder_info.creator = FINDER_CREATOR_SYMBOLIC_LINK;
9572		}
9573		NInoSetValidFinderInfo(ni);
9574		NInoSetDirtyFinderInfo(ni);
9575		/*
9576		 * Updating the Finder info causes both the
9577		 * last_data_change_time (mtime) and last_mft_change_time
9578		 * (ctime) to be updated.
9579		 */
9580		ni->last_mft_change_time = ni->last_data_change_time =
9581				ntfs_utc_current_time();
9582		NInoSetDirtyTimes(ni);
9583		/* Now write (if needed deleting) the AFP_AfpInfo attribute. */
9584		err = ntfs_inode_afpinfo_write(ni);
9585		if (!err)
9586			ntfs_debug("Deleted Finder info from mft_no 0x%llx.",
9587					(unsigned long long)ni->mft_no);
9588		else
9589			ntfs_error(vol->mp, "Failed to write/delete "
9590					"AFP_AfpInfo attribute in inode "
9591					"0x%llx (error %d).",
9592					(unsigned long long)ni->mft_no, err);
9593		goto err;
9594	}
9595	/*
9596	 * Now deal with requests to remove the resource fork as that is
9597	 * special because we need to translate its name from
9598	 * XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
9599	 * the utf8 name @name to Unicode.
9600	 *
9601	 * FIXME: This comparison is case sensitive.
9602	 */
9603	if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
9604			sizeof(XATTR_RESOURCEFORK_NAME))) {
9605		ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
9606		ntfs_name_len = 12;
9607	} else {
9608		/*
9609		 * The request is not for the resource fork (nor for the Finder
9610		 * info).
9611		 *
9612		 * Convert the requested name from utf8 to Unicode.
9613		 */
9614		ntfs_name = ntfs_name_buf;
9615		ntfs_name_size = sizeof(ntfs_name_buf);
9616		ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
9617				&ntfs_name, &ntfs_name_size);
9618		if (ntfs_name_len < 0) {
9619			err = -ntfs_name_len;
9620			if (err == ENAMETOOLONG)
9621				ntfs_debug("Failed (name is too long).");
9622			else
9623				ntfs_error(vol->mp, "Failed to convert name to "
9624						"Unicode (error %d).", err);
9625			goto err;
9626		}
9627		/*
9628		 * If this is one of the SFM named streams, skip it, as they
9629		 * contain effectively metadata information so should not be
9630		 * exposed directly.
9631		 */
9632		if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
9633			ntfs_debug("Not allowing access to protected SFM name "
9634					"%s in mft_no 0x%llx (returning "
9635					"EINVAL).", name,
9636					(unsigned long long)ni->mft_no);
9637			err = EINVAL;
9638			goto err;
9639		}
9640	}
9641	/*
9642	 * We now have the name of the requested attribute in @ntfs_name and it
9643	 * is @ntfs_name_len characters long.
9644	 *
9645	 * Get the ntfs attribute inode of the $DATA:@ntfs_name attribute.
9646	 */
9647	err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE,
9648			LCK_RW_TYPE_EXCLUSIVE, &ani);
9649	if (err) {
9650		if (err == ENOENT)
9651			err = ENOATTR;
9652		else if (err != ENOATTR)
9653			ntfs_error(vol->mp, "Failed to get $DATA/%s attribute "
9654					"inode mft_no 0x%llx (error %d).",
9655					name, (unsigned long long)ni->mft_no,
9656					err);
9657		goto err;
9658	}
9659	/*
9660	 * Unlink the named stream.  The last close will cause the VFS to call
9661	 * ntfs_vnop_inactive() which will do the actual removal.
9662	 */
9663	ani->link_count = 0;
9664	/*
9665	 * Update the last_mft_change_time (ctime) in the inode as named
9666	 * stream/extended attribute semantics expect on OS X.
9667	 */
9668	ni->last_mft_change_time = ntfs_utc_current_time();
9669	NInoSetDirtyTimes(ni);
9670	/*
9671	 * If this is not a directory or it is an encrypted directory, set the
9672	 * needs archiving bit except for the core system files.
9673	 */
9674	if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
9675		BOOL need_set_archive_bit = TRUE;
9676		if (ni->vol->major_ver >= 2) {
9677			if (ni->mft_no <= FILE_Extend)
9678				need_set_archive_bit = FALSE;
9679		} else {
9680			if (ni->mft_no <= FILE_UpCase)
9681				need_set_archive_bit = FALSE;
9682		}
9683		if (need_set_archive_bit) {
9684			ni->file_attributes |= FILE_ATTR_ARCHIVE;
9685			NInoSetDirtyFileAttributes(ni);
9686		}
9687	}
9688	ntfs_debug("Done.");
9689	lck_rw_unlock_exclusive(&ani->lock);
9690	(void)vnode_put(ani->vn);
9691err:
9692	lck_rw_unlock_exclusive(&ni->lock);
9693	return err;
9694}
9695
9696/**
9697 * ntfs_vnop_listxattr - list the names of the extended attributes of an inode
9698 * @args:		arguments to listxattr function
9699 *
9700 * @args contains:
9701 *	vnode_t a_vp;	vnode whose extended attributes to list
9702 *	uio_t a_uio;	destination in which to return the list
9703 *	size_t *a_size;	size of the list of extended attributes in bytes
9704 *	int a_options;	flags controlling how the attribute list is generated
9705 *	vfs_context_t a_context;
9706 *
9707 * Iterate over the list of named streams (which we map 1:1 with extended
9708 * attributes for NTFS as the NTFS native EAs are useless) in the vnode
9709 * @args->a_vp and for each encountered stream copy its name (converted to an
9710 * NULL-terminated utf8 string) to the destination as specified by
9711 * @args->a_uio.
9712 *
9713 * If @args->a_uio is NULL, do not copy anything and simply iterate over all
9714 * named streams and add up the number of bytes needed to create a full list of
9715 * their names and return that in *@args->a_size.  Note that when @args->a_uio
9716 * is not NULL @args->a_size is ignored as the number of bytes is implicitly
9717 * returned in the @args->a_uio and it can be obtained by taking the original
9718 * buffer size and subtracting uio_resid(@args->a_uio) from it.
9719 *
9720 * The flags in @args->a_options control how the attribute list is generated.
9721 * The following flags are currently defined in OS X kernel:
9722 *	XATTR_NOFOLLOW	- Do not follow symbolic links.
9723 *	XATTR_CREATE	- Set the value, fail if already exists (setxattr only).
9724 *	XATTR_REPLACE	- Set the value, fail if does not exist (setxattr only).
9725 *	XATTR_NOSECURITY- Bypass authorization checking.
9726 *	XATTR_NODEFAULT	- Bypass default extended attribute file ('._' file).
9727 *
9728 * Return 0 on success and errno on error.
9729 */
9730static int ntfs_vnop_listxattr(struct vnop_listxattr_args *args)
9731{
9732	ntfs_inode *ni = NTFS_I(args->a_vp);
9733	uio_t uio = args->a_uio;
9734	ntfs_volume *vol;
9735	MFT_RECORD *m;
9736	ntfs_attr_search_ctx *ctx;
9737	u8 *utf8_name;
9738	ntfschar *upcase;
9739	unsigned upcase_len;
9740	size_t size, utf8_size;
9741	errno_t err;
9742	BOOL case_sensitive;
9743	FINDER_INFO fi;
9744
9745	if (!ni) {
9746		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
9747		return EINVAL;
9748	}
9749	vol = ni->vol;
9750	upcase = vol->upcase;
9751	upcase_len = vol->upcase_len;
9752	case_sensitive = NVolCaseSensitive(vol);
9753	ntfs_debug("Entering.");
9754	lck_rw_lock_shared(&ni->lock);
9755	/* Do not allow messing with the inode once it has been deleted. */
9756	if (NInoDeleted(ni)) {
9757		/* Remove the inode from the name cache. */
9758		cache_purge(ni->vn);
9759		ntfs_debug("Mft_no 0x%llx is deleted.",
9760				(unsigned long long)ni->mft_no);
9761		err = ENOENT;
9762		goto err;
9763	}
9764	/*
9765	 * Only regular files, directories, and symbolic links can have
9766	 * extended attributes.  (Specifically named streams cannot have them.)
9767	 *
9768	 * Thus the check is for attribute inodes as all base inodes are
9769	 * allowed.  Raw inodes are also attribute inodes so they are excluded
9770	 * automatically, too.
9771	 */
9772	if (NInoAttr(ni)) {
9773		ntfs_debug("Mft_no 0x%llx is an attribute inode.",
9774				(unsigned long long)ni->mft_no);
9775		err = EPERM;
9776		goto err;
9777	}
9778	size = 0;
9779	/*
9780	 * First of all deal with the Finder info as that is special because we
9781	 * cache it in the base ntfs inode @ni and we only want to export the
9782	 * name for the Finder info, XATTR_FINDERINFO_NAME, if the Finder info
9783	 * is non-zero.  This is what HFS does, too.
9784	 *
9785	 * Thus we need to check the status of the cache in the ntfs inode
9786	 * first and if that it valid we can use it to check the content of the
9787	 * Finder info for being zero.  And if it is not valid then it must be
9788	 * non-resident in which case we need to read it into the cache in the
9789	 * ntfs inode and then we can check the Finder info in the cache for
9790	 * being zero.  In fact we do this the other way round, i.e. if the
9791	 * Finder info cache is not valid we read the Finder info into the
9792	 * cache first and then the cache is definitely valid thus we can check
9793	 * the Finder info for being non-zero and export XATTR_FINDERINFO_NAME
9794	 * if so.
9795	 */
9796	if (!NInoValidFinderInfo(ni)) {
9797		if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) {
9798			lck_rw_lock_exclusive(&ni->lock);
9799			if (NInoDeleted(ni)) {
9800				cache_purge(ni->vn);
9801				lck_rw_unlock_exclusive(&ni->lock);
9802				ntfs_debug("Mft_no 0x%llx is deleted.",
9803						(unsigned long long)ni->mft_no);
9804				return ENOENT;
9805			}
9806		}
9807		/*
9808		 * Load the AFP_AfpInfo stream and initialize the backup time
9809		 * and Finder info (if they are not already valid).
9810		 */
9811		err = ntfs_inode_afpinfo_read(ni);
9812		if (err) {
9813			ntfs_error(vol->mp, "Failed to obtain AfpInfo for "
9814					"mft_no 0x%llx (error %d).",
9815					(unsigned long long)ni->mft_no, err);
9816			lck_rw_unlock_exclusive(&ni->lock);
9817			return err;
9818		}
9819		if (!NInoValidFinderInfo(ni))
9820			panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__);
9821		lck_rw_lock_exclusive_to_shared(&ni->lock);
9822	}
9823	/*
9824	 * Make a copy of the Finder info and mask out the hidden bit if this
9825	 * is the root directory and the type and creator if this is a symbolic
9826	 * link.
9827	 */
9828	memcpy(&fi, &ni->finder_info, sizeof(fi));
9829	if (ni == vol->root_ni)
9830		fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
9831	if (S_ISLNK(ni->mode)) {
9832		fi.type = 0;
9833		fi.creator = 0;
9834	}
9835	if (bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) {
9836		if (!uio)
9837			size += sizeof(XATTR_FINDERINFO_NAME);
9838		else if (uio_resid(uio) <
9839				(user_ssize_t)sizeof(XATTR_FINDERINFO_NAME)) {
9840			err = ERANGE;
9841			goto err;
9842		} else {
9843			err = uiomove((caddr_t)XATTR_FINDERINFO_NAME,
9844					sizeof(XATTR_FINDERINFO_NAME), uio);
9845			if (err) {
9846				ntfs_error(vol->mp, "uiomove() failed (error "
9847						"%d).", err);
9848				goto err;
9849			}
9850		}
9851		ntfs_debug("Exporting Finder info name %s.",
9852				XATTR_FINDERINFO_NAME);
9853	}
9854	/* Iterate over all the named $DATA attributes. */
9855	err = ntfs_mft_record_map(ni, &m);
9856	if (err) {
9857		ntfs_error(vol->mp, "Failed to map mft record (error %d).",
9858				err);
9859		goto err;
9860	}
9861	ctx = ntfs_attr_search_ctx_get(ni, m);
9862	if (!ctx) {
9863		ntfs_error(vol->mp, "Failed to allocate search context.");
9864		err = ENOMEM;
9865		goto unm_err;
9866	}
9867	/*
9868	 * Allocate a buffer we can use when converting the names of the named
9869	 * $DATA attributes to utf8.  We want enough space to definitely be
9870	 * able to convert the name as well as a byte for the NULL terminator.
9871	 */
9872	utf8_size = NTFS_MAX_ATTR_NAME_LEN * 4 + 1;
9873	utf8_name = OSMalloc(utf8_size, ntfs_malloc_tag);
9874	if (!utf8_name) {
9875		ntfs_error(vol->mp, "Failed to allocate name buffer.");
9876		err = ENOMEM;
9877		goto put_err;
9878	}
9879	do {
9880		ntfs_inode *ani;
9881		ATTR_RECORD *a;
9882		ntfschar *name;
9883		unsigned name_len;
9884		signed utf8_len;
9885
9886		/* Get the next $DATA attribute. */
9887		err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, NULL, 0, ctx);
9888		if (err) {
9889			if (err == ENOENT) {
9890				err = 0;
9891				break;
9892			}
9893			ntfs_error(vol->mp, "Failed to iterate over named "
9894					"$DATA attributes (error %d).", err);
9895			goto free_err;
9896		}
9897		/* Got the next attribute, deal with it. */
9898		a = ctx->a;
9899		/* If this is the unnamed $DATA attribute, skip it. */
9900		if (!a->name_length) {
9901			ntfs_debug("Skipping unnamed $DATA attribute.");
9902			continue;
9903		}
9904		name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset));
9905		name_len = a->name_length;
9906		if ((u8*)name < (u8*)a || (u8*)name + name_len > (u8*)a +
9907				le32_to_cpu(a->length)) {
9908			ntfs_error(vol->mp, "Found corrupt named $DATA "
9909					"attribute.  Run chkdsk.");
9910			NVolSetErrors(vol);
9911			err = EIO;
9912			goto free_err;
9913		}
9914		/*
9915		 * Check if this attribute currently has a cached inode/vnode
9916		 * and if so check if it has been unlinked/deleted and if so
9917		 * skip it.
9918		 */
9919		err = ntfs_attr_inode_lookup(ni, a->type, name, name_len,
9920				FALSE, &ani);
9921		if (err != ENOENT) {
9922			BOOL skip_it;
9923
9924			if (err)
9925				panic("%s() inode lookup failed (error %d).\n",
9926						__FUNCTION__, err);
9927			/* Got the cached attribute inode. */
9928			skip_it = FALSE;
9929			if (NInoDeleted(ani) || !ani->link_count ||
9930					(ntfs_are_names_equal(name, name_len,
9931					NTFS_SFM_RESOURCEFORK_NAME, 12,
9932					case_sensitive, upcase, upcase_len) &&
9933					!ubc_getsize(ani->vn)))
9934				skip_it = TRUE;
9935			if (skip_it) {
9936				if (NInoDeleted(ani) || !ani->link_count)
9937					ntfs_debug("Skipping deleted/unlinked "
9938							"attribute.");
9939				else
9940					ntfs_debug("Mft_no 0x%llx has zero "
9941							"size resource fork, "
9942							"pretending it does "
9943							"not exist.",
9944							(unsigned long long)
9945							ani->mft_no);
9946				(void)vnode_put(ani->vn);
9947				continue;
9948			}
9949			(void)vnode_put(ani->vn);
9950		}
9951		/*
9952		 * If AFP_Resource named stream exists, i.e. the resource fork
9953		 * is present, and it is non-empty export the name
9954		 * XATTR_RESOURCEFORK_NAME.  This is what HFS does, too.
9955		 */
9956		if (ntfs_are_names_equal(name, name_len,
9957				NTFS_SFM_RESOURCEFORK_NAME, 12, case_sensitive,
9958				upcase, upcase_len)) {
9959			if (!ntfs_attr_size(a)) {
9960				ntfs_debug("Skipping empty resource fork "
9961						"name %s.",
9962						XATTR_RESOURCEFORK_NAME);
9963				continue;
9964			}
9965			if (!uio)
9966				size += sizeof(XATTR_RESOURCEFORK_NAME);
9967			else if (uio_resid(uio) < (user_ssize_t)sizeof(
9968					XATTR_RESOURCEFORK_NAME)) {
9969				err = ERANGE;
9970				goto free_err;
9971			} else {
9972				err = uiomove((caddr_t)XATTR_RESOURCEFORK_NAME,
9973						sizeof(XATTR_RESOURCEFORK_NAME),
9974						uio);
9975				if (err) {
9976					ntfs_error(vol->mp, "uiomove() failed "
9977							"(error %d).", err);
9978					goto free_err;
9979				}
9980			}
9981			ntfs_debug("Exporting resource fork name %s.",
9982					XATTR_RESOURCEFORK_NAME);
9983			continue;
9984		}
9985		/*
9986		 * If this is one of the SFM named streams, skip it, as they
9987		 * contain effectively metadata information so should not be
9988		 * exposed directly.
9989		 */
9990		if (ntfs_is_sfm_name(vol, name, name_len)) {
9991			ntfs_debug("Skipping protected SFM name.");
9992			continue;
9993		}
9994		/* Convert the name to utf8. */
9995		utf8_len = ntfs_to_utf8(vol, name, name_len <<
9996				NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size);
9997		if (utf8_len < 0) {
9998			ntfs_warning(vol->mp, "Skipping unrepresentable name "
9999					"in mft_no 0x%llx (error %d).",
10000					(unsigned long long)ni->mft_no,
10001					-utf8_len);
10002			continue;
10003		}
10004		/*
10005		 * If this is a protected attribute, skip it.
10006		 *
10007		 * FIXME: xattr_protected() is case sensitive so it does not
10008		 * exclude protected attributes when they are not correctly
10009		 * cased on disk.
10010		 *
10011		 * However we do call it to be consistent with HFS and SMB but
10012		 * it is pointless as anyone can call getxattr() for a case
10013		 * variant and the getxattr() system call would use
10014		 * xattr_protected() which would not filter it out so the
10015		 * VNOP_GETXATTR() call would happen and we would return the
10016		 * attribute just fine.  Simillarly anyone could set and remove
10017		 * such "protected" attributes by just calling the system call
10018		 * with a case variant even when they are correctly filtered
10019		 * out here.
10020		 */
10021		if (xattr_protected((char*)utf8_name)) {
10022			ntfs_debug("Skipping protected name %.*s.", utf8_len,
10023					utf8_name);
10024			continue;
10025		}
10026		/*
10027		 * Increment the length of the name by one for the NULL
10028		 * terminator.
10029		 */
10030		utf8_len++;
10031		/* Export the utf8_name. */
10032		if (!uio)
10033			size += utf8_len;
10034		else if (uio_resid(uio) < utf8_len) {
10035			err = ERANGE;
10036			goto free_err;
10037		} else {
10038			err = uiomove((caddr_t)utf8_name, utf8_len, uio);
10039			if (err) {
10040				ntfs_error(vol->mp, "uiomove() failed (error "
10041						"%d).", err);
10042				goto free_err;
10043			}
10044		}
10045		ntfs_debug("Exporting name %.*s.", utf8_len, utf8_name);
10046		/* Continue to the next name. */
10047	} while (1);
10048	if (!uio)
10049		*args->a_size = size;
10050	ntfs_debug("Done.");
10051free_err:
10052	OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
10053put_err:
10054	ntfs_attr_search_ctx_put(ctx);
10055unm_err:
10056	ntfs_mft_record_unmap(ni);
10057err:
10058	lck_rw_unlock_shared(&ni->lock);
10059	return err;
10060}
10061
10062/**
10063 * ntfs_vnop_blktooff - map a logical block number to its byte offset
10064 * @a:		arguments to blktooff function
10065 *
10066 * @a contains:
10067 *	vnode_t a_vp;		vnode to which the logical block number belongs
10068 *	daddr64_t a_lblkno;	logical block number to map
10069 *	off_t *a_offset;	destination for returning the result
10070 *
10071 * Map the logical block number @a->a_lblkno belonging to the vnode @a->a_vp to
10072 * the corresponding byte offset, i.e. the offset in the vnode in bytes and
10073 * return the result in @a->a_offset.
10074 *
10075 * Return 0 on success and EINVAL if no vnode was specified in @a->a_vp.
10076 */
10077static int ntfs_vnop_blktooff(struct vnop_blktooff_args *a)
10078{
10079	ntfs_inode *ni;
10080	ntfs_volume *vol;
10081	unsigned block_size_shift;
10082
10083	if (!a->a_vp) {
10084		ntfs_warning(NULL, "Called with NULL vnode!");
10085		return EINVAL;
10086	}
10087	ni = NTFS_I(a->a_vp);
10088	if (!ni) {
10089		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10090		return EINVAL;
10091	}
10092	if (S_ISDIR(ni->mode)) {
10093		ntfs_error(ni->vol->mp, "Called for directory vnode.");
10094		return EINVAL;
10095	}
10096	ntfs_debug("Entering for logical block 0x%llx, mft_no 0x%llx, type "
10097			"0x%x, name_len 0x%x.", (unsigned long long)a->a_lblkno,
10098			(unsigned long long)ni->mft_no, le32_to_cpu(ni->type),
10099			(unsigned)ni->name_len);
10100	vol = ni->vol;
10101	block_size_shift = PAGE_SHIFT;
10102	/*
10103	 * For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
10104	 * mft record number and the block size is the mft record size which is
10105	 * also in @ni->block_size{,_shift}.
10106	 */
10107	if (ni == vol->mft_ni || ni == vol->mftmirr_ni)
10108		block_size_shift = ni->block_size_shift;
10109	*a->a_offset = a->a_lblkno << block_size_shift;
10110	ntfs_debug("Done (byte offset 0x%llx).",
10111			(unsigned long long)*a->a_offset);
10112	return 0;
10113}
10114
10115/**
10116 * ntfs_vnop_offtoblk - map a byte offset to its logical block number
10117 * @a:		arguments to offtoblk function
10118 *
10119 * @a contains:
10120 *	vnode_t a_vp;		vnode to which the byte offset belongs
10121 *	off_t a_offset;		byte offset to map
10122 *	daddr64_t *a_lblkno;	destination for returning the result
10123 *
10124 * Map the byte offset @a->a_offset belonging to the vnode @a->a_vp to the
10125 * corresponding logical block number, i.e. the offset in the vnode in units of
10126 * the vnode block size and return the result in @a->a_lblkno.
10127 *
 * Return 0 on success and EINVAL if no vnode was specified in @a->a_vp, if the
 * vnode has no ntfs inode attached, or if the vnode is a directory.
10129 */
10130static int ntfs_vnop_offtoblk(struct vnop_offtoblk_args *a)
10131{
10132	ntfs_inode *ni;
10133	ntfs_volume *vol;
10134	unsigned block_size_shift;
10135
10136	if (!a->a_vp) {
10137		ntfs_warning(NULL, "Called with NULL vnode.");
10138		return EINVAL;
10139	}
10140	ni = NTFS_I(a->a_vp);
10141	if (!ni) {
10142		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10143		return EINVAL;
10144	}
10145	if (S_ISDIR(ni->mode)) {
10146		ntfs_error(ni->vol->mp, "Called for directory vnode.");
10147		return EINVAL;
10148	}
10149	ntfs_debug("Entering for byte offset 0x%llx, mft_no 0x%llx, type "
10150			"0x%x, name_len 0x%x.", (unsigned long long)a->a_offset,
10151			(unsigned long long)ni->mft_no, le32_to_cpu(ni->type),
10152			(unsigned)ni->name_len);
10153	vol = ni->vol;
10154	block_size_shift = PAGE_SHIFT;
10155	/*
10156	 * For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
10157	 * mft record number and the block size is the mft record size which is
10158	 * also in @ni->block_size{,_shift}.
10159	 */
10160	if (ni == vol->mft_ni || ni == vol->mftmirr_ni)
10161		block_size_shift = ni->block_size_shift;
10162	*a->a_lblkno = a->a_offset >> block_size_shift;
10163	ntfs_debug("Done (logical block 0x%llx).",
10164			(unsigned long long)*a->a_lblkno);
10165	return 0;
10166}
10167
10168/**
10169 * ntfs_vnop_blockmap - map a file offset to its physical block number
10170 * @a:		arguments to blockmap function
10171 *
10172 * @a contains:
10173 *	vnode_t a_vp;		vnode to which the byte offset belongs
10174 *	off_t a_foffset;	starting byte offset to map
10175 *	size_t a_size;		number of bytes to map starting at @a_foffset
10176 *	daddr64_t *a_bpn;	destination for starting physical block number
10177 *	size_t *a_run;		destination for contiguous bytes from @a_bpn
10178 *	void *a_poff;		physical offset into @a_bpn
10179 *	int a_flags;		reason for map (VNODE_READ, VNODE_WRITE, or 0)
10180 *	vfs_context_t a_context;
10181 *
10182 * Map @a->a_size bytes starting at the file offset @a->a_foffset to the
10183 * corresponding physical block number and return the result in @a->a_bpn
10184 * (starting block number), @a->a_run (number of contiguous bytes starting at
10185 * @a->a_bpn), and @a->a_poff (byte offset into @a->a_bpn corresponding to the
10186 * file offset @a->a_foffset, this will be zero if @a_foffset is block aligned
10187 * and non-zero otherwise).
10188 *
10189 * FIXME: At present the OS X kernel completely ignores @a->a_poff and in fact
10190 * it is always either NULL on entry or the returned value is ignored.  Thus,
10191 * for now, if @a->a_foffset is not aligned to the physical block size, we
10192 * always return error (EINVAL) unless @a->a_foffset equals the initialized
10193 * size in the ntfs inode in which case we return a block number of -1 in
10194 * @a->a_bpn thus alignment to the block and hence @a->a_poff are not relevant.
10195 * Thus we always return 0 in @a->a_poff.
10196 *
10197 * @a->a_flags is either VNODE_READ or VNODE_WRITE but can be 0 in certain call
10198 * paths such as the system call fcntl(F_LOG2PHYS) for example.
10199 *
10200 * Note, all the return pointers (@a->a_bpn, @a->a_run, @a->a_poff) are NULL in
10201 * some code paths in xnu (one or more of them at a time), thus all of them
10202 * need to be checked for being NULL before writing to them.  If @a->a_bpn is
10203 * NULL then there is nothing to do and success is returned immediately.
10204 *
10205 * For ntfs mapping to physical blocks is special because some attributes do
10206 * not have block aligned data.  This is the case for all resident attributes
10207 * as well as for all non-resident attributes which are compressed or
10208 * encrypted.  For all of those it would be logical to return an error however
10209 * this leads to a kernel panic in current xnu because a buf_bread() can cause
10210 * ntfs_vnop_blockmap() to be called when an uptodate page is in memory but no
10211 * buffer is in memory.  This can happen under memory pressure when the buffer
10212 * has been recycled for something else but the page has not been reused yet.
10213 * In that case ntfs_vnop_blockmap() is only called to recreate the physical
10214 * mapping of the buffer and is not actually used for anything as the data is
10215 * already present in the uptodate page.  Thus, instead of returning error, we
10216 * set the physical block @a->a_bpn to equal the logical block corresponding to
10217 * the byte offset @a->a_foffset and return success.  Doing this signals to the
10218 * VFS that the physical mapping cannot be cached in the buffer and all is
10219 * well.  Note this call path always has a non-zero @a->a_flags whilst other
10220 * "weird" code paths like fcntl(F_LOG2PHYS) set @a->a_flags to zero, thus we
10221 * can do the above workaround when @a->a_flags is not zero and return error
10222 * EINVAL when @a->a_flags is zero.
10223 *
10224 * In the read case and when @a->a_flags is zero, if @a->a_foffset is beyond
10225 * the end of the attribute, return error ERANGE.  HFS returns ERANGE in this
10226 * case so we follow suit.  Although some other OS X file systems return EFBIG
10227 * and some E2BIG instead so it does not seem to be very standardized, so maybe
10228 * we should return the IMHO more correct "invalid seek" (ESPIPE), instead. (-;
10229 *
10230 * In the write case we need to allow the mapping of blocks beyond the end of
10231 * the attribute as we will already have extended the allocated size but not
10232 * yet the data size nor the initialized size.  Thus in this case we only
10233 * return ERANGE if the requested @a->a_foffset is beyond the end of the
10234 * allocated size.
10235 *
10236 * Return 0 on success and errno on error.
10237 */
10238static int ntfs_vnop_blockmap(struct vnop_blockmap_args *a)
10239{
10240	const s64 byte_offset = a->a_foffset;
10241	const s64 byte_size = a->a_size;
10242	s64 max_size, data_size, init_size, clusters, bytes = 0;
10243	VCN vcn;
10244	LCN lcn;
10245	ntfs_inode *ni = NTFS_I(a->a_vp);
10246	ntfs_volume *vol;
10247	unsigned vcn_ofs;
10248	BOOL is_write = (a->a_flags & VNODE_WRITE);
10249
10250	if (!ni) {
10251		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10252		return EINVAL;
10253	}
10254	vol = ni->vol;
10255	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, "
10256			"offset 0x%llx, size 0x%llx, for %s operation.",
10257			(unsigned long long)ni->mft_no,
10258			(unsigned)le32_to_cpu(ni->type),
10259			(unsigned)ni->name_len,
10260			(unsigned long long)byte_offset,
10261			(unsigned long long)byte_size,
10262			a->a_flags ? (is_write ? "write" : "read") :
10263			"unspecified");
10264	if (S_ISDIR(ni->mode)) {
10265		ntfs_error(vol->mp, "Called for directory vnode.");
10266		return EINVAL;
10267	}
10268	if (is_write && NVolReadOnly(vol)) {
10269		ntfs_warning(vol->mp, "Called for VNODE_WRITE but mount is "
10270				"read-only.");
10271		return EROFS;
10272	}
10273	if (!a->a_bpn) {
10274		ntfs_debug("Called with a_bpn == NULL, nothing to do.  "
10275				"Returning success (0).");
10276		return 0;
10277	}
10278	/*
10279	 * We cannot take the inode lock as it may be held already so we just
10280	 * check the deleted bit and abort if it is set which is better than
10281	 * nothing.
10282	 */
10283	if (NInoDeleted(ni)) {
10284		/* Remove the inode from the name cache. */
10285		cache_purge(ni->vn);
10286		ntfs_debug("Inode has been deleted.");
10287		return ENOENT;
10288	}
10289	/*
10290	 * Note it does not matter if we are racing with truncate because that
10291	 * will be detected during the runlist lookup below.
10292	 */
10293	lck_spin_lock(&ni->size_lock);
10294	if (is_write)
10295		max_size = ni->allocated_size;
10296	else
10297		max_size = ni->data_size;
10298	data_size = ni->data_size;
10299	init_size = ni->initialized_size;
10300	lck_spin_unlock(&ni->size_lock);
10301	if (byte_offset >= max_size) {
10302eof:
10303		ntfs_error(vol->mp, "Called for inode 0x%llx, size 0x%llx, "
10304				"byte offset 0x%llx, for %s operation, which "
10305				"is beyond the end of the inode %s size "
10306				"0x%llx.  Returning error: ERANGE.",
10307				(unsigned long long)ni->mft_no,
10308				(unsigned long long)byte_size,
10309				(unsigned long long)byte_offset, a->a_flags ?
10310				(is_write ? "write" : "read") : "unspecified",
10311				is_write ? "allocated" : "data",
10312				(unsigned long long)max_size);
10313		return ERANGE;
10314	}
10315	if (byte_offset & vol->sector_size_mask && byte_offset != init_size) {
10316		ntfs_error(vol->mp, "Called for inode 0x%llx, byte offset "
10317				"0x%llx.  This is not a multiple of the "
10318				"physical block size %u thus the mapping "
10319				"cannot be performed.  Returning error: "
10320				"EINVAL.", (unsigned long long)ni->mft_no,
10321				(unsigned long long)byte_offset,
10322				(unsigned)vol->sector_size);
10323		return EINVAL;
10324	}
10325	/*
10326	 * In the read case, if the requested byte offset is at or beyond the
10327	 * initialized size simply return a hole.  We already checked for being
10328	 * at or beyond the data size so we know we are in an uninitialized
10329	 * region in this case rather than at or beyond the end of the
10330	 * attribute.
10331	 */
10332	if (!is_write && byte_offset >= init_size) {
10333		*a->a_bpn = -1; /* -1 means hole. */
10334		/*
10335		 * Set the size of the block to the number of uninitialized
10336		 * bytes in the attribute starting at the requested byte offset
10337		 * @a->a_foffset.
10338		 */
10339		bytes = data_size - byte_offset;
10340		goto done;
10341	}
10342	/*
10343	 * Blockmap does not make sense for resident attributes and neither
10344	 * does it make sense for non-resident, compressed or encrypted
10345	 * attributes.  The only special case is for directory inodes because
10346	 * their flags are only defaults to be used when creating new files
10347	 * rather than having any meaning for their actual data contents.
10348	 */
10349	if (!NInoNonResident(ni) || (ni->type != AT_INDEX_ALLOCATION &&
10350			(NInoCompressed(ni) || NInoEncrypted(ni)) &&
10351			!NInoRaw(ni))) {
10352		if (!a->a_flags) {
10353			ntfs_error(vol->mp, "Called for inode 0x%llx, which "
10354					"is resident, compressed, or "
10355					"encrypted and VNOP_BLOCKMAP() does "
10356					"not make sense for such inodes.  "
10357					"Returning error: EINVAL.",
10358					(unsigned long long)ni->mft_no);
10359			return EINVAL;
10360		}
10361		*a->a_bpn = byte_offset >> PAGE_SHIFT;
10362		bytes = ni->block_size;
10363		ntfs_debug("Called for inode 0x%llx which is resident, "
10364				"compressed, or encrypted and VNOP_BLOCKMAP() "
10365				"does not make sense for such inodes.  "
10366				"Returning success and setting physical == "
10367				"logical block number to signal to VFS that "
10368				"the mapping cannot be cached in the buffer.",
10369				(unsigned long long)ni->mft_no);
10370		goto done;
10371	}
10372	/*
10373	 * All is ok, do the mapping.  First, work out the vcn and vcn offset
10374	 * corresponding to the @a->a_foffset.
10375	 */
10376	vcn = byte_offset >> vol->cluster_size_shift;
10377	vcn_ofs = (u32)byte_offset & vol->cluster_size_mask;
10378	/*
10379	 * Convert the vcn to the corresponding lcn and obtain the number of
10380	 * contiguous clusters starting at the vcn.
10381	 */
10382	lck_rw_lock_shared(&ni->rl.lock);
10383	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, FALSE,
10384			a->a_run ? &clusters : 0);
10385	if (lcn < LCN_HOLE) {
10386		errno_t err;
10387
10388		/* Error: deal with it. */
10389		lck_rw_unlock_shared(&ni->rl.lock);
10390		switch (lcn) {
10391		case LCN_ENOENT:
10392			/*
10393			 * Raced with a concurrent truncate which caused the
10394			 * byte offset @a->a_foffset to become outside the
10395			 * attribute size.
10396			 */
10397			goto eof;
10398		case LCN_ENOMEM:
10399			ntfs_error(vol->mp, "Not enough memory to complete "
10400					"mapping for inode 0x%llx.  "
10401					"Returning error: ENOMEM.",
10402					(unsigned long long)ni->mft_no);
10403			err = ENOMEM;
10404			break;
10405		default:
10406			ntfs_error(vol->mp, "Failed to complete mapping for "
10407					"inode 0x%llx.  Run chkdsk.  "
10408					"Returning error: EIO.",
10409					(unsigned long long)ni->mft_no);
10410			err = EIO;
10411			break;
10412		}
10413		return err;
10414	}
10415	if (lcn < 0) {
10416		/*
10417		 * It is a hole, return it.  If this is a VNODE_WRITE request,
10418		 * output a warning as this should never happen.  Both
10419		 * VNOP_WRITE() and VNOP_PAGEOUT() should have instantiated the
10420		 * hole before performing the write.
10421		 *
10422		 * Note we could potentially fill the hole here in the write
10423		 * case.  However this is quite hard to do as the caller will
10424		 * likely have pages around the hole locked in UBC UPLs thus we
10425		 * would have difficulties zeroing the surrounding regions when
10426		 * the cluster size is larger than the page size.  Also a
10427		 * problem is what happens if the write fails for some reason
10428		 * but we have instantiated the hole here and not zeroed it
10429		 * completely (because we are expecting the write to go into
10430		 * the allocated clusters).  We would have no way of fixing up
10431		 * in this case and we would end up exposing stale data.  This
10432		 * all is why we choose not to fill the hole here but to do it
10433		 * in advance in ntfs_vnop_write() and ntfs_vnop_pageout().
10434		 *
10435		 * The only thing that will happen when we return a hole in the
10436		 * write case is that when the caller is cluster_io(), it will
10437		 * page out page by page and this will fill the hole in pieces
10438		 * which will degrade performance.
10439		 */
10440		if (is_write)
10441			ntfs_warning(vol->mp, "Returning hole but flags "
10442					"specify VNODE_WRITE.  This causes "
10443					"very inefficient allocation and I/O "
10444					"patterns.");
10445		/* Return the hole. */
10446		lck_rw_unlock_shared(&ni->rl.lock);
10447		*a->a_bpn = -1; /* -1 means hole. */
10448		if (a->a_run) {
10449			bytes = (clusters << vol->cluster_size_shift) - vcn_ofs;
10450			/*
10451			 * If the run overlaps the initialized size, extend the
10452			 * run length so it goes up to the data size thus
10453			 * merging the hole with the uninitialized region.
10454			 *
10455			 * Note, do not do this in the write case as we want to
10456			 * return the real clusters even beyond the initialized
10457			 * size as the initialized size will only be updated
10458			 * after the write has completed.
10459			 */
10460			if (!is_write && byte_offset + bytes > init_size)
10461				bytes = data_size - byte_offset;
10462		}
10463		goto done;
10464	} else
10465		lck_rw_unlock_shared(&ni->rl.lock);
10466	/* The vcn was mapped successfully to a physical lcn, return it. */
10467	*a->a_bpn = ((lcn << vol->cluster_size_shift) + vcn_ofs) >>
10468			vol->sector_size_shift;
10469	if (a->a_run) {
10470		bytes = (clusters << vol->cluster_size_shift) - vcn_ofs;
10471		/*
10472		 * In the read case, if the run overlaps the initialized size,
10473		 * truncate the run length so it only goes up to the
10474		 * initialized size.  The caller will then be able to access
10475		 * this region on disk directly and will then call us again
10476		 * with a byte offset equal to the initialized size and we will
10477		 * then return the entire initialized region as a hole.  Thus
10478		 * the caller does not need to know about the fact that NTFS
10479		 * has such a thing as the initialized_size.
10480		 *
10481		 * We already handled the case where the byte offset is beyond
10482		 * the initialized size so no need to check for that here.
10483		 *
10484		 * However do not do this if the initialized size is equal to
10485		 * the data size.  The caller is responsible for not returning
10486		 * data beyond the attribute size to user space.  If this is
10487		 * not done the last page of an attribute read is broken into
10488		 * two separate i/os, one with a read and one with a hole.
10489		 * cluster_io() will zero beyond the end of attribute in any
10490		 * case so it is faster to do it with a single call.
10491		 */
10492		if (!is_write && byte_offset + bytes > init_size &&
10493				init_size < data_size)
10494			bytes = init_size - byte_offset;
10495	}
10496done:
10497	if (a->a_run) {
10498		if (bytes > byte_size)
10499			bytes = byte_size;
10500		*a->a_run = bytes;
10501	}
10502	if (a->a_poff)
10503		*(int*)a->a_poff = 0;
10504	ntfs_debug("Done (a_bpn 0x%llx, a_run 0x%lx, a_poff 0x%x).",
10505			(unsigned long long)*a->a_bpn,
10506			a->a_run ? (unsigned long)*a->a_run : 0,
10507			a->a_poff ? *(int*)a->a_poff : 0);
10508	return 0;
10509}
10510
10511/**
10512 * ntfs_vnop_getnamedstream - find a named stream in an inode given its name
10513 * @a:		arguments to getnamedstream function
10514 *
10515 * @a contains:
10516 *	vnode_t a_vp;			vnode containing the named stream
10517 *	vnode_t *a_svpp;		destination for the named stream vnode
10518 *	const char *a_name;		name of the named stream to get
10519 *	enum nsoperation a_operation;	reason for getnamedstream
10520 *	int a_flags;			flags describing the request
10521 *	vfs_context_t a_context;
10522 *
10523 * Find the named stream with name @a->a_name in the vnode @a->a_vp and return
10524 * the vnode of the named stream in *@a->a_svpp if it was found.
10525 *
10526 * @a->a_operation specifies the reason for the lookup of the named stream.
10527 * The following operations are currently defined in OS X kernel:
10528 *	NS_OPEN	  - Want to open the named stream for access.
10529 *	NS_CREATE - Want to create the named stream so checking it does not
10530 *		    exist already.
10531 *	NS_DELETE - Want to delete the named stream so making sure it exists.
10532 *
10533 * The flags in @a->a_flags further describe the getnamedstream request.  At
10534 * present no flags are defined in OS X kernel.
10535 *
10536 * Note that at present Mac OS X only supports the "com.apple.ResourceFork"
10537 * stream so we follow suit.
10538 *
10539 * Return 0 on success and the error code on error.  A return value of ENOATTR
10540 * does not signify an error as such but merely the fact that the named stream
10541 * @name is not present in the vnode @a->a_vp.
10542 */
10543static int ntfs_vnop_getnamedstream(struct vnop_getnamedstream_args *a)
10544{
10545	vnode_t vn = a->a_vp;
10546	ntfs_inode *sni, *ni = NTFS_I(vn);
10547	const char *name = a->a_name;
10548	int options;
10549	const enum nsoperation op = a->a_operation;
10550	errno_t err;
10551
10552	if (!ni) {
10553		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10554		return EINVAL;
10555	}
10556	ntfs_debug("Entering for mft_no 0x%llx, stream name %s, operation %s "
10557			"(0x%x), flags 0x%x.", (unsigned long long)ni->mft_no,
10558			name, op == NS_OPEN ? "NS_OPEN" :
10559			(op == NS_CREATE ? "NS_CREATE" :
10560			(op == NS_DELETE ? "NS_DELETE" : "unknown")), op,
10561			a->a_flags);
10562	/*
10563	 * Mac OS X only supports the resource fork stream.
10564	 * Note that this comparison is case sensitive.
10565	 */
10566	if (bcmp(name, XATTR_RESOURCEFORK_NAME,
10567			sizeof(XATTR_RESOURCEFORK_NAME))) {
10568		ntfs_warning(ni->vol->mp, "Unsupported named stream %s "
10569				"specified, only the resource fork named "
10570				"stream (%s) is supported at present.  "
10571				"Returning ENOATTR.", name,
10572				XATTR_RESOURCEFORK_NAME);
10573		return ENOATTR;
10574	}
10575	/* Only regular files may have a resource fork stream. */
10576	if (!S_ISREG(ni->mode)) {
10577		ntfs_warning(ni->vol->mp, "The resource fork may only be "
10578				"attached to regular files and mft_no 0x%llx "
10579				"is not a regular file.  Returning EPERM.",
10580				(unsigned long long)ni->mft_no);
10581		return EPERM;
10582	}
10583	/*
10584	 * Attempt to get the inode for the named stream.  For the resource
10585	 * fork we need to return it even if it is zero size if the caller has
10586	 * specified @op == NS_OPEN so we set @options to zero in this case.
10587	 * Otherwise we want to treat a zero size resource fork as a
10588	 * non-existent resource fork se we set @options to XATTR_REPLACE which
10589	 * is the behaviour of ntfs_attr_inode_get().
10590	 */
10591	if (op == NS_OPEN) {
10592		options = 0;
10593		lck_rw_lock_exclusive(&ni->lock);
10594	} else {
10595		options = XATTR_REPLACE;
10596		lck_rw_lock_shared(&ni->lock);
10597	}
10598	/* Do not allow messing with the inode once it has been deleted. */
10599	if (NInoDeleted(ni)) {
10600		/* Remove the inode from the name cache. */
10601		cache_purge(vn);
10602		if (op == NS_OPEN)
10603			lck_rw_unlock_exclusive(&ni->lock);
10604		else
10605			lck_rw_unlock_shared(&ni->lock);
10606		ntfs_debug("Mft_no 0x%llx is deleted.",
10607				(unsigned long long)ni->mft_no);
10608		return ENOENT;
10609	}
10610	err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
10611			NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, options,
10612			LCK_RW_TYPE_SHARED, &sni);
10613	if (!err) {
10614		/* We have successfully opened the named stream. */
10615		*a->a_svpp = sni->vn;
10616		lck_rw_unlock_shared(&sni->lock);
10617		ntfs_debug("Done.");
10618	} else {
10619		if (err == ENOENT) {
10620			err = ENOATTR;
10621			ntfs_debug("Done (named stream %s does not exist in "
10622					"mft_no 0x%llx.", name,
10623					(unsigned long long)ni->mft_no);
10624		} else
10625			ntfs_error(ni->vol->mp, "Failed to get named stream "
10626					"%s, mft_no 0x%llx (error %d).", name,
10627					(unsigned long long)ni->mft_no, err);
10628	}
10629	if (op == NS_OPEN)
10630		lck_rw_unlock_exclusive(&ni->lock);
10631	else
10632		lck_rw_unlock_shared(&ni->lock);
10633	return err;
10634}
10635
10636/**
10637 * ntfs_vnop_makenamedstream - create a named stream in an ntfs inode
10638 * @a:		arguments to makenamedstream function
10639 *
10640 * @a contains:
10641 *	vnode_t a_vp;		vnode in which to create the named stream
10642 *	vnode_t *a_svpp;	destination for the named stream vnode
10643 *	const char *a_name;	name of the named stream to create
10644 *	int a_flags;		flags describing the request
10645 *	vfs_context_t a_context;
10646 *
10647 * Create the named stream with name @a->a_name in the vnode @a->a_vp and
10648 * return the created vnode of the named stream in *@a->a_svpp.  If the named
 * stream already exists then it is obtained instead, i.e. if the named stream
10650 * already exists then ntfs_vnop_makenamedstream() does exactly the same thing
10651 * as ntfs_vnop_getnamedstream().
10652 *
10653 * The flags in @a->a_flags further describe the makenamedstream request.  At
10654 * present no flags are defined in OS X kernel.
10655 *
10656 * Note that at present Mac OS X only supports the "com.apple.ResourceFork"
10657 * stream so we follow suit.
10658 *
10659 * Return 0 on success and the error code on error.
10660 */
10661static int ntfs_vnop_makenamedstream(struct vnop_makenamedstream_args *a)
10662{
10663	vnode_t vn = a->a_vp;
10664	ntfs_inode *sni, *ni = NTFS_I(vn);
10665	const char *name = a->a_name;
10666	errno_t err;
10667
10668	if (!ni) {
10669		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10670		return EINVAL;
10671	}
10672	ntfs_debug("Entering for mft_no 0x%llx, stream name %s, flags 0x%x.",
10673			(unsigned long long)ni->mft_no, name, a->a_flags);
10674	/*
10675	 * Mac OS X only supports the resource fork stream.
10676	 * Note that this comparison is case sensitive.
10677	 */
10678	if (bcmp(name, XATTR_RESOURCEFORK_NAME,
10679			sizeof(XATTR_RESOURCEFORK_NAME))) {
10680		ntfs_warning(ni->vol->mp, "Unsupported named stream %s "
10681				"specified, only the resource fork named "
10682				"stream (%s) is supported at present.  "
10683				"Returning ENOATTR.", name,
10684				XATTR_RESOURCEFORK_NAME);
10685		return ENOATTR;
10686	}
10687	/* Only regular files may have a resource fork stream. */
10688	if (!S_ISREG(ni->mode)) {
10689		ntfs_warning(ni->vol->mp, "The resource fork may only be "
10690				"attached to regular files and mft_no 0x%llx "
10691				"is not a regular file.  Returning EPERM.",
10692				(unsigned long long)ni->mft_no);
10693		return EPERM;
10694	}
10695	lck_rw_lock_exclusive(&ni->lock);
10696	/* Do not allow messing with the inode once it has been deleted. */
10697	if (NInoDeleted(ni)) {
10698		/* Remove the inode from the name cache. */
10699		cache_purge(vn);
10700		lck_rw_unlock_exclusive(&ni->lock);
10701		ntfs_debug("Mft_no 0x%llx is deleted.",
10702				(unsigned long long)ni->mft_no);
10703		return ENOENT;
10704	}
10705	/*
10706	 * Attempt to create the named stream.
10707	 *
10708	 * HFS allows an existing resource fork to be opened.  We want to
10709	 * follow suit so we specify 0 for @options when calling
10710	 * ntfs_attr_inode_get_or_create().
10711	 *
10712	 * FIXME: I think this is actually wrong behaviour.  If I am right and
10713	 * this is one day fixed in HFS, then we can trivially fix the
10714	 * behaviour here by setting @options to XATTR_CREATE.
10715	 */
10716	err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
10717			NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, 0,
10718			LCK_RW_TYPE_SHARED, &sni);
10719	if (!err) {
10720		/* We have successfully opened the (created) named stream. */
10721		*a->a_svpp = sni->vn;
10722		lck_rw_unlock_shared(&sni->lock);
10723		ntfs_debug("Done.");
10724	} else {
10725		if (err == EEXIST)
10726			ntfs_debug("Named stream %s already exists in mft_no "
10727					"0x%llx.", name,
10728					(unsigned long long)ni->mft_no);
10729		else
10730			ntfs_error(ni->vol->mp, "Failed to create named "
10731					"stream %s in mft_no 0x%llx (error "
10732					"%d).", name,
10733					(unsigned long long)ni->mft_no, err);
10734	}
10735	lck_rw_unlock_exclusive(&ni->lock);
10736	return err;
10737}
10738
10739/**
10740 * ntfs_vnop_removenamedstream - remove a named stream from an ntfs inode
10741 * @a:		arguments to removenamedstream function
10742 *
10743 * @a contains:
10744 *	vnode_t a_vp;		vnode from which to remove the named stream
10745 *	vnode_t a_svp;		vnode of named stream to remove
10746 *	const char *a_name;	name of the named stream to remove
10747 *	int a_flags;		flags describing the request
10748 *	vfs_context_t a_context;
10749 *
10750 * Delete the named stream described by the vnode @a->a_svp with name
10751 * @a->a_name from the vnode @a->a_vp.
10752 *
10753 * The flags in @a->a_flags further describe the removenamedstream request.  At
10754 * present no flags are defined in OS X kernel.
10755 *
10756 * Note we obey POSIX open unlink semantics thus an open named stream will
 * remain accessible for read/write/lseek purposes until the last open
10758 * instance is closed when the VFS will call ntfs_vnop_inactive() which will in
10759 * turn actually remove the named stream.
10760 *
10761 * Note that at present Mac OS X only supports the "com.apple.ResourceFork"
10762 * stream so we follow suit.
10763 *
10764 * Return 0 on success and the error code on error.  A return value of ENOATTR
10765 * does not signify an error as such but merely the fact that the named stream
10766 * @name is not present in the vnode @a->a_vp.
10767 */
10768static int ntfs_vnop_removenamedstream(struct vnop_removenamedstream_args *a)
10769{
10770	vnode_t svn, vn = a->a_vp;
10771	ntfs_inode *sni, *ni = NTFS_I(vn);
10772	const char *vname, *name = a->a_name;
10773
10774	svn = a->a_svp;
10775	sni = NTFS_I(svn);
10776	if (!ni || !sni) {
10777		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
10778		return EINVAL;
10779	}
10780	vname = vnode_getname(svn);
10781	ntfs_debug("Entering for mft_no 0x%llx, stream mft_no 0x%llx, stream "
10782			"name %s, flags 0x%x, stream vnode name %s.",
10783			(unsigned long long)ni->mft_no,
10784			(unsigned long long)sni->mft_no, name, a->a_flags,
10785			vname ? vname : "not present");
10786	if (vname)
10787		(void)vnode_putname(vname);
10788	/*
10789	 * Mac OS X only supports the resource fork stream.
10790	 * Note that this comparison is case sensitive.
10791	 */
10792	if (bcmp(name, XATTR_RESOURCEFORK_NAME,
10793			sizeof(XATTR_RESOURCEFORK_NAME))) {
10794		ntfs_warning(ni->vol->mp, "Unsupported named stream %s "
10795				"specified, only the resource fork named "
10796				"stream (%s) is supported at present.  "
10797				"Returning ENOATTR.", name,
10798				XATTR_RESOURCEFORK_NAME);
10799		return ENOATTR;
10800	}
10801	/* Only regular files may have a resource fork stream. */
10802	if (!S_ISREG(ni->mode)) {
10803		ntfs_warning(ni->vol->mp, "The resource fork may only be "
10804				"attached to regular files and mft_no 0x%llx "
10805				"is not a regular file.  Returning EPERM.",
10806				(unsigned long long)ni->mft_no);
10807		return EPERM;
10808	}
10809	lck_rw_lock_exclusive(&ni->lock);
10810	/* Do not allow messing with the inode once it has been deleted. */
10811	if (NInoDeleted(ni)) {
10812		/* Remove the inode from the name cache. */
10813		cache_purge(vn);
10814		lck_rw_unlock_exclusive(&ni->lock);
10815		ntfs_debug("Mft_no 0x%llx is deleted.",
10816				(unsigned long long)ni->mft_no);
10817		return ENOATTR;
10818	}
10819	lck_rw_lock_exclusive(&sni->lock);
10820	/* Do not allow messing with the stream once it has been deleted. */
10821	if (NInoDeleted(sni)) {
10822		/* Remove the inode from the name cache. */
10823		cache_purge(svn);
10824		lck_rw_unlock_exclusive(&sni->lock);
10825		lck_rw_unlock_exclusive(&ni->lock);
10826		ntfs_debug("Stream mft_no 0x%llx, name %s is deleted.",
10827				(unsigned long long)sni->mft_no, name);
10828		return ENOATTR;
10829	}
10830	/*
10831	 * The base inode of the stream inode must be the same as the parent
10832	 * inode specified by the caller.
10833	 */
10834	if (!NInoAttr(sni) || sni->base_ni != ni)
10835		panic("%s(): !NInoAttr(sni) || sni->base_ni != ni\n",
10836				__FUNCTION__);
10837	/*
10838	 * Unlink the named stream.  The last close will cause the VFS to call
10839	 * ntfs_vnop_inactive() which will do the actual removal.
10840	 *
10841	 * And if the named stream is already unlinked there is nothing to do.
10842	 * This is what HFS does so we follow suit.
10843	 */
10844	if (sni->link_count) {
10845		sni->link_count = 0;
10846		/*
10847		 * Update the last_mft_change_time (ctime) in the inode as
10848		 * named stream/extended attribute semantics expect on OS X.
10849		 */
10850		ni->last_mft_change_time = ntfs_utc_current_time();
10851		NInoSetDirtyTimes(ni);
10852		/*
10853		 * If this is not a directory or it is an encrypted directory,
10854		 * set the needs archiving bit except for the core system
10855		 * files.
10856		 */
10857		if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
10858			BOOL need_set_archive_bit = TRUE;
10859			if (ni->vol->major_ver >= 2) {
10860				if (ni->mft_no <= FILE_Extend)
10861					need_set_archive_bit = FALSE;
10862			} else {
10863				if (ni->mft_no <= FILE_UpCase)
10864					need_set_archive_bit = FALSE;
10865			}
10866			if (need_set_archive_bit) {
10867				ni->file_attributes |= FILE_ATTR_ARCHIVE;
10868				NInoSetDirtyFileAttributes(ni);
10869			}
10870		}
10871		ntfs_debug("Done.");
10872	} else
10873		ntfs_debug("$DATA/%s attribute has already been unlinked from "
10874				"mft_no 0x%llx.", name,
10875				(unsigned long long)sni->mft_no);
10876	lck_rw_unlock_exclusive(&sni->lock);
10877	lck_rw_unlock_exclusive(&ni->lock);
10878	return 0;
10879}
10880
/*
 * ntfs_vnodeop_entries - the table of vnode operations implemented by NTFS
 *
 * Each entry pairs a VFS vnode operation descriptor (vnop_*_desc) with the
 * function that is installed for that operation, cast to vnop_t *.  Most
 * entries point at the ntfs_vnop_*() implementations; a few operations are
 * deliberately handed to generic helpers instead (vn_default_error, vn_bwrite,
 * nop_revoke, err_advlock, err_copyfile) and the comments next to those
 * entries explain why.
 *
 * The final { NULL, NULL } entry is presumably the end-of-table marker the VFS
 * looks for when it walks the table - confirm against the VFS code consuming
 * it before adding anything after it.
 */
10881static struct vnodeopv_entry_desc ntfs_vnodeop_entries[] = {
10882	/*
10883	 * Set vn_default_error() to be our default vnop, thus any vnops we do
10884	 * not specify (or specify as NULL) will be set to it and this function
10885	 * just returns ENOTSUP.
10886	 */
10887	{ &vnop_default_desc,		(vnop_t*)vn_default_error },
10888	{ &vnop_strategy_desc,		(vnop_t*)ntfs_vnop_strategy },
10889	/*
10890	 * vn_bwrite() is a simple wrapper for buf_bwrite() which in turn uses
10891	 * VNOP_STRATEGY() and hence ntfs_vnop_strategy() to do the i/o and the
10892	 * latter handles all NTFS specifics thus we can simply use the generic
10893	 * vn_bwrite() for our VNOP_BWRITE() method.
10894	 */
10895	{ &vnop_bwrite_desc,		(vnop_t*)vn_bwrite },
10896	{ &vnop_lookup_desc,		(vnop_t*)ntfs_vnop_lookup },
10897	{ &vnop_create_desc,		(vnop_t*)ntfs_vnop_create },
10898	{ &vnop_mknod_desc,		(vnop_t*)ntfs_vnop_mknod },
10899	{ &vnop_open_desc,		(vnop_t*)ntfs_vnop_open },
10900	{ &vnop_close_desc,		(vnop_t*)ntfs_vnop_close },
10901	{ &vnop_access_desc,		(vnop_t*)ntfs_vnop_access },
10902	{ &vnop_getattr_desc,		(vnop_t*)ntfs_vnop_getattr },
10903	{ &vnop_setattr_desc,		(vnop_t*)ntfs_vnop_setattr },
10904	{ &vnop_read_desc,		(vnop_t*)ntfs_vnop_read },
10905	{ &vnop_write_desc,		(vnop_t*)ntfs_vnop_write },
10906	{ &vnop_ioctl_desc,		(vnop_t*)ntfs_vnop_ioctl },
10907	{ &vnop_select_desc,		(vnop_t*)ntfs_vnop_select },
10908	{ &vnop_exchange_desc,		(vnop_t*)ntfs_vnop_exchange },
10909	/* Let the VFS deal with revoking a vnode. */
10910	{ &vnop_revoke_desc,		(vnop_t*)nop_revoke },
10911	{ &vnop_mmap_desc,		(vnop_t*)ntfs_vnop_mmap },
10912	{ &vnop_mnomap_desc,		(vnop_t*)ntfs_vnop_mnomap },
10913	{ &vnop_fsync_desc,		(vnop_t*)ntfs_vnop_fsync },
10914	{ &vnop_remove_desc,		(vnop_t*)ntfs_vnop_remove },
10915	{ &vnop_link_desc,		(vnop_t*)ntfs_vnop_link },
10916	{ &vnop_rename_desc,		(vnop_t*)ntfs_vnop_rename },
10917	{ &vnop_mkdir_desc,		(vnop_t*)ntfs_vnop_mkdir },
10918	{ &vnop_rmdir_desc,		(vnop_t*)ntfs_vnop_rmdir },
10919	{ &vnop_symlink_desc,		(vnop_t*)ntfs_vnop_symlink },
10920	{ &vnop_readdir_desc,		(vnop_t*)ntfs_vnop_readdir },
10921	{ &vnop_readdirattr_desc, 	(vnop_t*)ntfs_vnop_readdirattr },
10922	{ &vnop_readlink_desc,		(vnop_t*)ntfs_vnop_readlink },
10923	{ &vnop_inactive_desc,		(vnop_t*)ntfs_vnop_inactive },
10924	{ &vnop_reclaim_desc,		(vnop_t*)ntfs_vnop_reclaim },
10925	{ &vnop_pathconf_desc,		(vnop_t*)ntfs_vnop_pathconf },
10926	/*
10927	 * Let the VFS deal with advisory locking for us, so our advlock method
10928	 * should never get called and if it were to get called for some
10929	 * reason, we make sure to return error (ENOTSUP).
10930	 */
10931	{ &vnop_advlock_desc,		(vnop_t*)err_advlock },
10932	{ &vnop_allocate_desc,		(vnop_t*)ntfs_vnop_allocate },
10933	{ &vnop_pagein_desc,		(vnop_t*)ntfs_vnop_pagein },
10934	{ &vnop_pageout_desc,		(vnop_t*)ntfs_vnop_pageout },
10935	{ &vnop_searchfs_desc,		(vnop_t*)ntfs_vnop_searchfs },
10936	/*
10937	 * Nothing supports copyfile in current xnu and it is not documented so
10938	 * we do not support it either.
10939	 */
10940	{ &vnop_copyfile_desc,		(vnop_t*)err_copyfile },
10941	{ &vnop_getxattr_desc,		(vnop_t*)ntfs_vnop_getxattr },
10942	{ &vnop_setxattr_desc,		(vnop_t*)ntfs_vnop_setxattr },
10943	{ &vnop_removexattr_desc,	(vnop_t*)ntfs_vnop_removexattr },
10944	{ &vnop_listxattr_desc,		(vnop_t*)ntfs_vnop_listxattr },
10945	{ &vnop_blktooff_desc,		(vnop_t*)ntfs_vnop_blktooff },
10946	{ &vnop_offtoblk_desc,		(vnop_t*)ntfs_vnop_offtoblk },
10947	{ &vnop_blockmap_desc,		(vnop_t*)ntfs_vnop_blockmap },
10948	{ &vnop_getnamedstream_desc,	(vnop_t*)ntfs_vnop_getnamedstream },
10949	{ &vnop_makenamedstream_desc,	(vnop_t*)ntfs_vnop_makenamedstream },
10950	{ &vnop_removenamedstream_desc,	(vnop_t*)ntfs_vnop_removenamedstream },
10951	{ NULL,				(vnop_t*)NULL }
10952};
10953
/*
 * ntfs_vnodeopv_desc - descriptor of the NTFS vnode operations vector
 *
 * Pairs the address of ntfs_vnodeop_p with the ntfs_vnodeop_entries table.
 * This is the only non-static object in this part of the file, so it is
 * visible to other translation units.
 *
 * NOTE(review): presumably this descriptor is handed to the VFS when the file
 * system is registered, and the VFS then stores the operations vector it
 * builds from the table through the ntfs_vnodeop_p pointer - confirm against
 * the registration code, which is not visible here.
 */
10954struct vnodeopv_desc ntfs_vnodeopv_desc = {
10955	&ntfs_vnodeop_p, ntfs_vnodeop_entries
10956};
10957