1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*	@(#)hfs_readwrite.c	1.0
29 *
30 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
31 *
32 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/resourcevar.h>
39#include <sys/kernel.h>
40#include <sys/fcntl.h>
41#include <sys/filedesc.h>
42#include <sys/stat.h>
43#include <sys/buf.h>
44#include <sys/buf_internal.h>
45#include <sys/proc.h>
46#include <sys/kauth.h>
47#include <sys/vnode.h>
48#include <sys/vnode_internal.h>
49#include <sys/uio.h>
50#include <sys/vfs_context.h>
51#include <sys/fsevents.h>
52#include <kern/kalloc.h>
53#include <sys/disk.h>
54#include <sys/sysctl.h>
55#include <sys/fsctl.h>
56#include <sys/mount_internal.h>
57#include <sys/file_internal.h>
58
59#include <miscfs/specfs/specdev.h>
60
61#include <sys/ubc.h>
62#include <sys/ubc_internal.h>
63
64#include <vm/vm_pageout.h>
65#include <vm/vm_kern.h>
66
67#include <sys/kdebug.h>
68
69#include	"hfs.h"
70#include	"hfs_attrlist.h"
71#include	"hfs_endian.h"
72#include  	"hfs_fsctl.h"
73#include	"hfs_quota.h"
74#include	"hfscommon/headers/FileMgrInternal.h"
75#include	"hfscommon/headers/BTreesInternal.h"
76#include	"hfs_cnode.h"
77#include	"hfs_dbg.h"
78
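/*
 * can_cluster(size) is true when size is a multiple of 4 KB and no larger
 * than half of MAXPHYSIO; only such transfers go through the cluster layer.
 */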
79#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
80
81enum {
82	MAXHFSFILESIZE = 0x7FFFFFFF		/* this needs to go in the mount structure */
83};
84
85/* from bsd/hfs/hfs_vfsops.c */
86extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
87
88static int  hfs_clonefile(struct vnode *, int, int, int);
89static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
90static int  hfs_minorupdate(struct vnode *vp);
91static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
92
93/* from bsd/hfs/hfs_vnops.c */
94extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
95
96
97
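/*
 * Exposed as the kern.flush_cache_on_write sysctl; for example, from user
 * space (with sufficient privilege):
 *     sysctl -w kern.flush_cache_on_write=1
 */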
98int flush_cache_on_write = 0;
99SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
100
101/*
102 * Read data from a file.
103 */
104int
105hfs_vnop_read(struct vnop_read_args *ap)
106{
107	/*
108	   struct vnop_read_args {
109	   struct vnodeop_desc *a_desc;
110	   vnode_t a_vp;
111	   struct uio *a_uio;
112	   int a_ioflag;
113	   vfs_context_t a_context;
114	   };
115	 */
116
117	uio_t uio = ap->a_uio;
118	struct vnode *vp = ap->a_vp;
119	struct cnode *cp;
120	struct filefork *fp;
121	struct hfsmount *hfsmp;
122	off_t filesize;
123	off_t filebytes;
124	off_t start_resid = uio_resid(uio);
125	off_t offset = uio_offset(uio);
126	int retval = 0;
127	int took_truncate_lock = 0;
128	int io_throttle = 0;
129
130	/* Preflight checks */
131	if (!vnode_isreg(vp)) {
132		/* can only read regular files */
133		if (vnode_isdir(vp))
134			return (EISDIR);
135		else
136			return (EPERM);
137	}
138	if (start_resid == 0)
139		return (0);		/* Nothing left to do */
140	if (offset < 0)
		return (EINVAL);	/* can't read from a negative offset */
142
143
144
145#if HFS_COMPRESSION
146	if (VNODE_IS_RSRC(vp)) {
147		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
148			return 0;
149		}
150		/* otherwise read the resource fork normally */
151	} else {
152		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
153		if (compressed) {
154			retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
155			if (compressed) {
156				if (retval == 0) {
157					/* successful read, update the access time */
158					VTOC(vp)->c_touch_acctime = TRUE;
159
160					/* compressed files are not hot file candidates */
161					if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
162						VTOF(vp)->ff_bytesread = 0;
163					}
164				}
165				return retval;
166			}
167			/* otherwise the file was converted back to a regular file while we were reading it */
168			retval = 0;
169		} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
170			int error;
171
172			error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
173			if (error) {
174				return error;
175			}
176
177		}
178	}
179#endif /* HFS_COMPRESSION */
180
181	cp = VTOC(vp);
182	fp = VTOF(vp);
183	hfsmp = VTOHFS(vp);
184
185#if CONFIG_PROTECT
186	if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
187		goto exit;
188	}
189#endif
190
191	/*
192	 * If this read request originated from a syscall (as opposed to
193	 * an in-kernel page fault or something), then set it up for
194	 * throttle checks
195	 */
196	if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
197		io_throttle = IO_RETURN_ON_THROTTLE;
198	}
199
200read_again:
201
202	/* Protect against a size change. */
203	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
204	took_truncate_lock = 1;
205
206	filesize = fp->ff_size;
207	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
208	if (offset > filesize) {
209		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
210		    (offset > (off_t)MAXHFSFILESIZE)) {
211			retval = EFBIG;
212		}
213		goto exit;
214	}
215
216	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
217		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
218
219	retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
220
221	cp->c_touch_acctime = TRUE;
222
223	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
224		(int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
225
	/*
	 * Keep track of blocks read.
	 */
229	if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
230		int took_cnode_lock = 0;
231		off_t bytesread;
232
233		bytesread = start_resid - uio_resid(uio);
234
235		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
236		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
237			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
238			took_cnode_lock = 1;
239		}
240		/*
241		 * If this file hasn't been seen since the start of
242		 * the current sampling period then start over.
243		 */
244		if (cp->c_atime < hfsmp->hfc_timebase) {
245			struct timeval tv;
246
247			fp->ff_bytesread = bytesread;
248			microtime(&tv);
249			cp->c_atime = tv.tv_sec;
250		} else {
251			fp->ff_bytesread += bytesread;
252		}
253		if (took_cnode_lock)
254			hfs_unlock(cp);
255	}
256exit:
257	if (took_truncate_lock) {
258		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
259	}
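	/*
	 * EAGAIN means cluster_read() returned early because we passed
	 * IO_RETURN_ON_THROTTLE; wait out the throttle window and then
	 * resume the read where the uio left off.
	 */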
260	if (retval == EAGAIN) {
261		throttle_lowpri_io(1);
262
263		retval = 0;
264		goto read_again;
265	}
266	return (retval);
267}
268
269/*
270 * Write data to a file.
271 */
272int
273hfs_vnop_write(struct vnop_write_args *ap)
274{
275	uio_t uio = ap->a_uio;
276	struct vnode *vp = ap->a_vp;
277	struct cnode *cp;
278	struct filefork *fp;
279	struct hfsmount *hfsmp;
280	kauth_cred_t cred = NULL;
281	off_t origFileSize;
282	off_t writelimit;
283	off_t bytesToAdd = 0;
284	off_t actualBytesAdded;
285	off_t filebytes;
286	off_t offset;
287	ssize_t resid;
288	int eflags;
289	int ioflag = ap->a_ioflag;
290	int retval = 0;
291	int lockflags;
292	int cnode_locked = 0;
293	int partialwrite = 0;
294	int do_snapshot = 1;
295	time_t orig_ctime=VTOC(vp)->c_ctime;
296	int took_truncate_lock = 0;
297	int io_return_on_throttle = 0;
298	struct rl_entry *invalid_range;
299
300#if HFS_COMPRESSION
301	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
302		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
303		switch(state) {
304			case FILE_IS_COMPRESSED:
305				return EACCES;
306			case FILE_IS_CONVERTING:
307				/* if FILE_IS_CONVERTING, we allow writes but do not
308				   bother with snapshots or else we will deadlock.
309				*/
310				do_snapshot = 0;
311				break;
312			default:
313				printf("invalid state %d for compressed file\n", state);
314				/* fall through */
315		}
316	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
317		int error;
318
319		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
320		if (error != 0) {
321			return error;
322		}
323	}
324
325	if (do_snapshot) {
326		check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
327	}
328
329#endif
330
331	resid = uio_resid(uio);
332	offset = uio_offset(uio);
333
334	if (offset < 0)
335		return (EINVAL);
336	if (resid == 0)
337		return (E_NONE);
338	if (!vnode_isreg(vp))
339		return (EPERM);  /* Can only write regular files */
340
341	cp = VTOC(vp);
342	fp = VTOF(vp);
343	hfsmp = VTOHFS(vp);
344
345#if CONFIG_PROTECT
346	if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
347		goto exit;
348	}
349#endif
350
351	eflags = kEFDeferMask;	/* defer file block allocations */
352#if HFS_SPARSE_DEV
	/*
	 * When the underlying device is sparse and free space is low
	 * (fewer than 2048 allocation blocks, i.e. 8 MB with 4 KB blocks),
	 * stop doing delayed allocations and begin doing synchronous I/O.
	 */
358	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
359	    (hfs_freeblks(hfsmp, 0) < 2048)) {
360		eflags &= ~kEFDeferMask;
361		ioflag |= IO_SYNC;
362	}
363#endif /* HFS_SPARSE_DEV */
364
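	/*
	 * Only ask cluster_write() to return (rather than block) when
	 * throttled if this write came in via a syscall and holds the
	 * single-writer guarantee; such writes are simply restarted at
	 * "again:" below once the throttle clears.
	 */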
365	if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
366			(IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
367		io_return_on_throttle = IO_RETURN_ON_THROTTLE;
368	}
369
370again:
372	/*
373	 * Protect against a size change.
374	 *
375	 * Note: If took_truncate_lock is true, then we previously got the lock shared
376	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
377	 * start.
378	 */
379	if (ioflag & IO_APPEND || took_truncate_lock) {
380		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
381	}
382	else {
383		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
384	}
385	took_truncate_lock = 1;
386
387	/* Update UIO */
388	if (ioflag & IO_APPEND) {
389		uio_setoffset(uio, fp->ff_size);
390		offset = fp->ff_size;
391	}
392	if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
393		retval = EPERM;
394		goto exit;
395	}
396
397	origFileSize = fp->ff_size;
398	writelimit = offset + resid;
399	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
400
401	/*
402	 * We may need an exclusive truncate lock for several reasons, all
403	 * of which are because we may be writing to a (portion of a) block
404	 * for the first time, and we need to make sure no readers see the
405	 * prior, uninitialized contents of the block.  The cases are:
406	 *
407	 * 1. We have unallocated (delayed allocation) blocks.  We may be
408	 *    allocating new blocks to the file and writing to them.
409	 *    (A more precise check would be whether the range we're writing
410	 *    to contains delayed allocation blocks.)
411	 * 2. We need to extend the file.  The bytes between the old EOF
412	 *    and the new EOF are not yet initialized.  This is important
413	 *    even if we're not allocating new blocks to the file.  If the
414	 *    old EOF and new EOF are in the same block, we still need to
415	 *    protect that range of bytes until they are written for the
416	 *    first time.
417	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
418	 *    part of the file has been allocated, but not yet written).
419	 *
420	 * If we had a shared lock with the above cases, we need to try to upgrade
421	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
422	 * lock, and will need to take the truncate lock again; the took_truncate_lock
423	 * flag will still be set, causing us to try for an exclusive lock next time.
424	 *
425	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
426	 * lock is held, since it protects the range lists.
427	 */
428	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
429	    ((fp->ff_unallocblocks != 0) ||
430	     (writelimit > origFileSize))) {
431		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
432			/*
433			 * Lock upgrade failed and we lost our shared lock, try again.
434			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
435			 * set to 1 will cause us to try to get the lock exclusive.
436			 */
437			goto again;
438		}
439		else {
440			/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
441			cp->c_truncatelockowner = current_thread();
442		}
443	}
444
445	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
446		goto exit;
447	}
448	cnode_locked = 1;
449
450	if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
451		hfs_incr_gencount (cp);
452	}
453
454	/*
455	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
456	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
457	 */
458	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
459	    (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
460	    	/*
461		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
462		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
463		 * not behave like a "try_lock").  Since this condition is rare, just drop the
464		 * cnode lock and try again.  Since took_truncate_lock is set, we will
465		 * automatically take the truncate lock exclusive.
466		 */
467		hfs_unlock(cp);
468		cnode_locked = 0;
469		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
470		goto again;
471	}
472
473	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
474		     (int)offset, uio_resid(uio), (int)fp->ff_size,
475		     (int)filebytes, 0);
476
477	/* Check if we do not need to extend the file */
478	if (writelimit <= filebytes) {
479		goto sizeok;
480	}
481
482	cred = vfs_context_ucred(ap->a_context);
483	bytesToAdd = writelimit - filebytes;
484
485#if QUOTA
486	retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
487			   cred, 0);
488	if (retval)
489		goto exit;
490#endif /* QUOTA */
491
492	if (hfs_start_transaction(hfsmp) != 0) {
493		retval = EINVAL;
494		goto exit;
495	}
496
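	/*
	 * Grow the fork one ExtendFileC() call at a time until the allocated
	 * space covers writelimit.  Non-superuser writers set kEFReserveMask
	 * so that ExtendFileC honors the volume's free-space reserve.
	 */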
497	while (writelimit > filebytes) {
498		bytesToAdd = writelimit - filebytes;
499		if (cred && suser(cred, NULL) != 0)
500			eflags |= kEFReserveMask;
501
502		/* Protect extents b-tree and allocation bitmap */
503		lockflags = SFL_BITMAP;
504		if (overflow_extents(fp))
505			lockflags |= SFL_EXTENTS;
506		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
507
508		/* Files that are changing size are not hot file candidates. */
509		if (hfsmp->hfc_stage == HFC_RECORDING) {
510			fp->ff_bytesread = 0;
511		}
512		retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
513				0, eflags, &actualBytesAdded));
514
515		hfs_systemfile_unlock(hfsmp, lockflags);
516
517		if ((actualBytesAdded == 0) && (retval == E_NONE))
518			retval = ENOSPC;
519		if (retval != E_NONE)
520			break;
521		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
522		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
523			(int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
524	}
525	(void) hfs_update(vp, TRUE);
526	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
527	(void) hfs_end_transaction(hfsmp);
528
	/*
	 * If we didn't grow the file enough, try a partial write.
	 * POSIX expects this behavior.
	 */
533	if ((retval == ENOSPC) && (filebytes > offset)) {
534		retval = 0;
535		partialwrite = 1;
536		uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
537		resid -= bytesToAdd;
538		writelimit = filebytes;
539	}
540sizeok:
541	if (retval == E_NONE) {
542		off_t filesize;
543		off_t zero_off;
544		off_t tail_off;
545		off_t inval_start;
546		off_t inval_end;
547		off_t io_start;
548		int lflag;
549
550		if (writelimit > fp->ff_size)
551			filesize = writelimit;
552		else
553			filesize = fp->ff_size;
554
555		lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
556
557		if (offset <= fp->ff_size) {
558			zero_off = offset & ~PAGE_MASK_64;
559
			/* Check whether the area between zero_off and the start of the
			   transfer is invalid and should be zero-filled as part of the
			   transfer:
			 */
564			if (offset > zero_off) {
565			        if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
566				        lflag |= IO_HEADZEROFILL;
567			}
568		} else {
569			off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
570
			/* The bytes between fp->ff_size and uio->uio_offset must never be
			   read without being zeroed.  The current last block is filled with zeroes
			   if it holds valid data, but in all cases we merely do a little bookkeeping
			   to track the area from the end of the current last page to the start of
			   the area actually written.  For the same reason only the bytes up to the
			   start of the page where this write will start are invalidated; any remainder
			   before uio->uio_offset is explicitly zeroed as part of the cluster_write.

			   Note that inval_start, the start of the page after the current EOF,
			   may be past the start of the write, in which case the zeroing
			   will be handled by the cluster_write of the actual data.
			 */
583			inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
584			inval_end = offset & ~PAGE_MASK_64;
585			zero_off = fp->ff_size;
586
587			if ((fp->ff_size & PAGE_MASK_64) &&
588				(rl_scan(&fp->ff_invalidranges,
589							eof_page_base,
590							fp->ff_size - 1,
591							&invalid_range) != RL_NOOVERLAP)) {
592				/* The page containing the EOF is not valid, so the
593				   entire page must be made inaccessible now.  If the write
594				   starts on a page beyond the page containing the eof
595				   (inval_end > eof_page_base), add the
596				   whole page to the range to be invalidated.  Otherwise
597				   (i.e. if the write starts on the same page), zero-fill
598				   the entire page explicitly now:
599				 */
600				if (inval_end > eof_page_base) {
601					inval_start = eof_page_base;
602				} else {
603					zero_off = eof_page_base;
604				};
605			};
606
607			if (inval_start < inval_end) {
608				struct timeval tv;
609				/* There's some range of data that's going to be marked invalid */
610
611				if (zero_off < inval_start) {
612					/* The pages between inval_start and inval_end are going to be invalidated,
613					   and the actual write will start on a page past inval_end.  Now's the last
614					   chance to zero-fill the page containing the EOF:
615					 */
616					hfs_unlock(cp);
617					cnode_locked = 0;
618					retval = cluster_write(vp, (uio_t) 0,
619							fp->ff_size, inval_start,
620							zero_off, (off_t)0,
621							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
622					hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
623					cnode_locked = 1;
624					if (retval) goto ioerr_exit;
625					offset = uio_offset(uio);
626				};
627
628				/* Mark the remaining area of the newly allocated space as invalid: */
629				rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
630				microuptime(&tv);
631				cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
632				zero_off = fp->ff_size = inval_end;
633			};
634
635			if (offset > zero_off) lflag |= IO_HEADZEROFILL;
636		};
637
638		/* Check to see whether the area between the end of the write and the end of
639		   the page it falls in is invalid and should be zero-filled as part of the transfer:
640		 */
641		tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
642		if (tail_off > filesize) tail_off = filesize;
643		if (tail_off > writelimit) {
644			if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
645				lflag |= IO_TAILZEROFILL;
646			};
647		};
648
649		/*
650		 * if the write starts beyond the current EOF (possibly advanced in the
651		 * zeroing of the last block, above), then we'll zero fill from the current EOF
652		 * to where the write begins:
653		 *
654		 * NOTE: If (and ONLY if) the portion of the file about to be written is
655		 *       before the current EOF it might be marked as invalid now and must be
656		 *       made readable (removed from the invalid ranges) before cluster_write
657		 *       tries to write it:
658		 */
659		io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
660		if (io_start < fp->ff_size) {
661			off_t io_end;
662
663			io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
664			rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
665		};
666
667		hfs_unlock(cp);
668		cnode_locked = 0;
669
670		/*
671		 * We need to tell UBC the fork's new size BEFORE calling
672		 * cluster_write, in case any of the new pages need to be
673		 * paged out before cluster_write completes (which does happen
674		 * in embedded systems due to extreme memory pressure).
675		 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
676		 * will be, so that it can pass that on to cluster_pageout, and
677		 * allow those pageouts.
678		 *
679		 * We don't update ff_size yet since we don't want pageins to
680		 * be able to see uninitialized data between the old and new
681		 * EOF, until cluster_write has completed and initialized that
682		 * part of the file.
683		 *
684		 * The vnode pager relies on the file size last given to UBC via
685		 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
686		 * ff_size (whichever is larger).  NOTE: ff_new_size is always
687		 * zero, unless we are extending the file via write.
688		 */
689		if (filesize > fp->ff_size) {
690			fp->ff_new_size = filesize;
691			ubc_setsize(vp, filesize);
692		}
693		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
694				tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
695		if (retval) {
696			fp->ff_new_size = 0;	/* no longer extending; use ff_size */
697
698			if (retval == EAGAIN) {
699				/*
700				 * EAGAIN indicates that we still have I/O to do, but
701				 * that we now need to be throttled
702				 */
703				if (resid != uio_resid(uio)) {
704					/*
705					 * did manage to do some I/O before returning EAGAIN
706					 */
707					resid = uio_resid(uio);
708					offset = uio_offset(uio);
709
710					cp->c_touch_chgtime = TRUE;
711					cp->c_touch_modtime = TRUE;
712				}
713				if (filesize > fp->ff_size) {
714					/*
715					 * we called ubc_setsize before the call to
716					 * cluster_write... since we only partially
717					 * completed the I/O, we need to
718					 * re-adjust our idea of the filesize based
719					 * on our interim EOF
720					 */
721					ubc_setsize(vp, offset);
722
723					fp->ff_size = offset;
724				}
725				goto exit;
726			}
727			if (filesize > origFileSize) {
728				ubc_setsize(vp, origFileSize);
729			}
730			goto ioerr_exit;
731		}
732
733		if (filesize > origFileSize) {
734			fp->ff_size = filesize;
735
736			/* Files that are changing size are not hot file candidates. */
737			if (hfsmp->hfc_stage == HFC_RECORDING) {
738				fp->ff_bytesread = 0;
739			}
740		}
741		fp->ff_new_size = 0;	/* ff_size now has the correct size */
742
743		/* If we wrote some bytes, then touch the change and mod times */
744		if (resid > uio_resid(uio)) {
745			cp->c_touch_chgtime = TRUE;
746			cp->c_touch_modtime = TRUE;
747		}
748	}
749	if (partialwrite) {
750		uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
751		resid += bytesToAdd;
752	}
753
754	// XXXdbg - see radar 4871353 for more info
755	{
756	    if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
757		VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
758	    }
759	}
760
761ioerr_exit:
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
767	if (cp->c_mode & (S_ISUID | S_ISGID)) {
768		cred = vfs_context_ucred(ap->a_context);
769		if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
770			if (!cnode_locked) {
771				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
772				cnode_locked = 1;
773			}
774			cp->c_mode &= ~(S_ISUID | S_ISGID);
775		}
776	}
777	if (retval) {
778		if (ioflag & IO_UNIT) {
779			if (!cnode_locked) {
780				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
781				cnode_locked = 1;
782			}
783			(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
784			                   0, 0, ap->a_context);
785			uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
786			uio_setresid(uio, resid);
787			filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
788		}
789	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
790		if (!cnode_locked) {
791			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
792			cnode_locked = 1;
793		}
794		retval = hfs_update(vp, TRUE);
795	}
796	/* Updating vcbWrCnt doesn't need to be atomic. */
797	hfsmp->vcbWrCnt++;
798
799	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
800		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
801exit:
802	if (cnode_locked)
803		hfs_unlock(cp);
804
805	if (took_truncate_lock) {
806		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
807	}
808	if (retval == EAGAIN) {
809		throttle_lowpri_io(1);
810
811		retval = 0;
812		goto again;
813	}
814	return (retval);
815}
816
817/* support for the "bulk-access" fcntl */
818
819#define CACHE_LEVELS 16
820#define NUM_CACHE_ENTRIES (64*16)
821#define PARENT_IDS_FLAG 0x100
822
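/*
 * acache[] holds cached directory cnids in ascending order so that
 * lookup_bucket() can binary-search it; haveaccess[] is the parallel
 * array of results (0 means access was granted, otherwise an errno,
 * with ESRCH meaning the entry only failed a scope check).
 */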
823struct access_cache {
824       int numcached;
825       int cachehits; /* these two for statistics gathering */
826       int lookups;
827       unsigned int *acache;
828       unsigned char *haveaccess;
829};
830
831struct access_t {
832	uid_t     uid;              /* IN: effective user id */
833	short     flags;            /* IN: access requested (i.e. R_OK) */
834	short     num_groups;       /* IN: number of groups user belongs to */
835	int       num_files;        /* IN: number of files to process */
836	int       *file_ids;        /* IN: array of file ids */
837	gid_t     *groups;          /* IN: array of groups */
838	short     *access;          /* OUT: access info for each file (0 for 'has access') */
839} __attribute__((unavailable)); // this structure is for reference purposes only
840
841struct user32_access_t {
842	uid_t     uid;              /* IN: effective user id */
843	short     flags;            /* IN: access requested (i.e. R_OK) */
844	short     num_groups;       /* IN: number of groups user belongs to */
845	int       num_files;        /* IN: number of files to process */
846	user32_addr_t      file_ids;        /* IN: array of file ids */
847	user32_addr_t      groups;          /* IN: array of groups */
848	user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
849};
850
851struct user64_access_t {
852	uid_t		uid;			/* IN: effective user id */
853	short		flags;			/* IN: access requested (i.e. R_OK) */
854	short		num_groups;		/* IN: number of groups user belongs to */
855	int		num_files;		/* IN: number of files to process */
856	user64_addr_t	file_ids;		/* IN: array of file ids */
857	user64_addr_t	groups;			/* IN: array of groups */
858	user64_addr_t	access;			/* OUT: access info for each file (0 for 'has access') */
859};
860
861
// these are the "extended" versions of the above structures
// note that it is crucial that they be a different size than
// the regular versions
865struct ext_access_t {
866	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
867	uint32_t   num_files;       /* IN: number of files to process */
868	uint32_t   map_size;        /* IN: size of the bit map */
869	uint32_t  *file_ids;        /* IN: Array of file ids */
870	char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
871	short     *access;          /* OUT: access info for each file (0 for 'has access') */
872	uint32_t   num_parents;   /* future use */
873	cnid_t      *parents;   /* future use */
874} __attribute__((unavailable)); // this structure is for reference purposes only
875
876struct user32_ext_access_t {
877	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
878	uint32_t   num_files;       /* IN: number of files to process */
879	uint32_t   map_size;        /* IN: size of the bit map */
880	user32_addr_t  file_ids;        /* IN: Array of file ids */
881	user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
882	user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
883	uint32_t   num_parents;   /* future use */
884	user32_addr_t parents;   /* future use */
885};
886
887struct user64_ext_access_t {
888	uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
889	uint32_t      num_files;    /* IN: number of files to process */
890	uint32_t      map_size;     /* IN: size of the bit map */
891	user64_addr_t   file_ids;     /* IN: array of file ids */
	user64_addr_t   bitmap;       /* OUT: hash-bitmap of interesting directory ids */
893	user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
894	uint32_t      num_parents;/* future use */
895	user64_addr_t   parents;/* future use */
896};
897
898
/*
 * Perform a binary search for the given parent_id. Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned the index at which to insert the item (even if it
 * was not found).
 */
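/*
 * For example, with array = {2, 5, 7, 9} and hi == 3 (the index of the
 * last valid entry), searching for 7 returns 2, while searching for 6
 * returns -1 and stores 2 in *no_match_indexp (the slot where 6 would
 * be inserted to keep the array sorted).
 */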
905static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
906{
907    int index=-1;
908    unsigned int lo=0;
909
910    do {
911	unsigned int mid = ((hi - lo)/2) + lo;
912	unsigned int this_id = array[mid];
913
914	if (parent_id == this_id) {
915	    hi = mid;
916	    break;
917	}
918
919	if (parent_id < this_id) {
920	    hi = mid;
921	    continue;
922	}
923
924	if (parent_id > this_id) {
925	    lo = mid + 1;
926	    continue;
927	}
928    } while(lo < hi);
929
930    /* check if lo and hi converged on the match */
931    if (parent_id == array[hi]) {
932	index = hi;
933    }
934
935    if (no_match_indexp) {
936	*no_match_indexp = hi;
937    }
938
939    return index;
940}
941
942
943static int
944lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
945{
946    unsigned int hi;
947    int matches = 0;
948    int index, no_match_index;
949
950    if (cache->numcached == 0) {
951	*indexp = 0;
952	return 0; // table is empty, so insert at index=0 and report no match
953    }
954
955    if (cache->numcached > NUM_CACHE_ENTRIES) {
956	cache->numcached = NUM_CACHE_ENTRIES;
957    }
958
959    hi = cache->numcached - 1;
960
961    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
962
963    /* if no existing entry found, find index for new one */
964    if (index == -1) {
965	index = no_match_index;
966	matches = 0;
967    } else {
968	matches = 1;
969    }
970
971    *indexp = index;
972    return matches;
973}
974
975/*
976 * Add a node to the access_cache at the given index (or do a lookup first
977 * to find the index if -1 is passed in). We currently do a replace rather
978 * than an insert if the cache is full.
979 */
980static void
981add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
982{
983    int lookup_index = -1;
984
985    /* need to do a lookup first if -1 passed for index */
986    if (index == -1) {
987	if (lookup_bucket(cache, &lookup_index, nodeID)) {
988	    if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
989		// only update an entry if the previous access was ESRCH (i.e. a scope checking error)
990		cache->haveaccess[lookup_index] = access;
991	    }
992
993	    /* mission accomplished */
994	    return;
995	} else {
996	    index = lookup_index;
997	}
998
999    }
1000
1001    /* if the cache is full, do a replace rather than an insert */
1002    if (cache->numcached >= NUM_CACHE_ENTRIES) {
1003	cache->numcached = NUM_CACHE_ENTRIES-1;
1004
1005	if (index > cache->numcached) {
1006	    index = cache->numcached;
1007	}
1008    }
1009
1010    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
1011	index++;
1012    }
1013
1014    if (index >= 0 && index < cache->numcached) {
1015	/* only do bcopy if we're inserting */
1016	bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
1017	bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
1018    }
1019
1020    cache->acache[index] = nodeID;
1021    cache->haveaccess[index] = access;
1022    cache->numcached++;
1023}
1024
1025
1026struct cinfo {
1027    uid_t   uid;
1028    gid_t   gid;
1029    mode_t  mode;
1030    cnid_t  parentcnid;
1031    u_int16_t recflags;
1032};
1033
1034static int
1035snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
1036{
1037    struct cinfo *cip = (struct cinfo *)arg;
1038
1039    cip->uid = attrp->ca_uid;
1040    cip->gid = attrp->ca_gid;
1041    cip->mode = attrp->ca_mode;
1042    cip->parentcnid = descp->cd_parentcnid;
1043    cip->recflags = attrp->ca_recflags;
1044
1045    return (0);
1046}
1047
/*
 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
 * isn't in core, then go to the catalog.
 */
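/*
 * Note: when the target is skip_cp itself or is found in the cnode hash,
 * only the parentID field of *keyp is filled in; callers rely on just
 * that field in those cases.
 */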
1052static int
1053do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1054    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1055{
1056    int error = 0;
1057
1058    /* if this id matches the one the fsctl was called with, skip the lookup */
1059    if (cnid == skip_cp->c_cnid) {
1060	cnattrp->ca_uid = skip_cp->c_uid;
1061	cnattrp->ca_gid = skip_cp->c_gid;
1062	cnattrp->ca_mode = skip_cp->c_mode;
1063	cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1064	keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1065    } else {
1066	struct cinfo c_info;
1067
	/* otherwise, check the cnode hash in case the file/dir is in core */
1069	if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
1070	    cnattrp->ca_uid = c_info.uid;
1071	    cnattrp->ca_gid = c_info.gid;
1072	    cnattrp->ca_mode = c_info.mode;
1073	    cnattrp->ca_recflags = c_info.recflags;
1074	    keyp->hfsPlus.parentID = c_info.parentcnid;
1075	} else {
1076	    int lockflags;
1077
1078	    if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1079		    throttle_lowpri_io(1);
1080
1081	    lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1082
1083	    /* lookup this cnid in the catalog */
1084	    error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1085
1086	    hfs_systemfile_unlock(hfsmp, lockflags);
1087
1088	    cache->lookups++;
1089	}
1090    }
1091
1092    return (error);
1093}
1094
1095
1096/*
1097 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1098 * up to CACHE_LEVELS as we progress towards the root.
1099 */
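/*
 * Returns 1 when access is granted and 0 when it is not; *err receives
 * the errno to report (ESRCH is used when a parent scope was supplied
 * but the walk up from nodeID never passed through it).
 */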
1100static int
1101do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1102    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1103    struct vfs_context *my_context,
1104    char *bitmap,
1105    uint32_t map_size,
1106    cnid_t* parents,
1107    uint32_t num_parents)
1108{
1109    int                     myErr = 0;
1110    int                     myResult;
1111    HFSCatalogNodeID        thisNodeID;
1112    unsigned int            myPerms;
1113    struct cat_attr         cnattr;
1114    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
1115    CatalogKey              catkey;
1116
1117    int i = 0, ids_to_cache = 0;
1118    int parent_ids[CACHE_LEVELS];
1119
1120    thisNodeID = nodeID;
1121    while (thisNodeID >=  kRootDirID) {
1122	myResult = 0;   /* default to "no access" */
1123
1124	/* check the cache before resorting to hitting the catalog */
1125
1126	/* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1127	 * to look any further after hitting cached dir */
1128
1129	if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1130	    cache->cachehits++;
1131	    myErr = cache->haveaccess[cache_index];
1132	    if (scope_index != -1) {
1133		if (myErr == ESRCH) {
1134		    myErr = 0;
1135		}
1136	    } else {
1137		scope_index = 0;   // so we'll just use the cache result
1138		scope_idx_start = ids_to_cache;
1139	    }
1140	    myResult = (myErr == 0) ? 1 : 0;
1141	    goto ExitThisRoutine;
1142	}
1143
1144
1145	if (parents) {
1146	    int tmp;
1147	    tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1148	    if (scope_index == -1)
1149		scope_index = tmp;
1150	    if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1151		scope_idx_start = ids_to_cache;
1152	    }
1153	}
1154
1155	/* remember which parents we want to cache */
1156	if (ids_to_cache < CACHE_LEVELS) {
1157	    parent_ids[ids_to_cache] = thisNodeID;
1158	    ids_to_cache++;
1159	}
1160	// Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
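	/* Each cnid maps to bit (cnid & 7) of byte (cnid / 8) % map_size. */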
1161	if (bitmap && map_size) {
1162	    bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1163	}
1164
1165
1166	/* do the lookup (checks the cnode hash, then the catalog) */
1167	myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1168	if (myErr) {
1169	    goto ExitThisRoutine; /* no access */
1170	}
1171
1172	/* Root always gets access. */
1173	if (suser(myp_ucred, NULL) == 0) {
1174		thisNodeID = catkey.hfsPlus.parentID;
1175		myResult = 1;
1176		continue;
1177	}
1178
1179	// if the thing has acl's, do the full permission check
1180	if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1181	    struct vnode *vp;
1182
1183	    /* get the vnode for this cnid */
1184	    myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1185	    if ( myErr ) {
1186		myResult = 0;
1187		goto ExitThisRoutine;
1188	    }
1189
1190	    thisNodeID = VTOC(vp)->c_parentcnid;
1191
1192	    hfs_unlock(VTOC(vp));
1193
1194	    if (vnode_vtype(vp) == VDIR) {
1195		myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1196	    } else {
1197		myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1198	    }
1199
1200	    vnode_put(vp);
1201	    if (myErr) {
1202		myResult = 0;
1203		goto ExitThisRoutine;
1204	    }
1205	} else {
1206	    unsigned int flags;
1207		int mode = cnattr.ca_mode & S_IFMT;
1208		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1209
1210		if (mode == S_IFDIR) {
1211			flags = R_OK | X_OK;
1212		} else {
1213			flags = R_OK;
1214		}
1215		if ( (myPerms & flags) != flags) {
1216			myResult = 0;
1217			myErr = EACCES;
1218			goto ExitThisRoutine;   /* no access */
1219		}
1220
1221	    /* up the hierarchy we go */
1222	    thisNodeID = catkey.hfsPlus.parentID;
1223	}
1224    }
1225
1226    /* if here, we have access to this node */
1227    myResult = 1;
1228
1229  ExitThisRoutine:
1230    if (parents && myErr == 0 && scope_index == -1) {
1231	myErr = ESRCH;
1232    }
1233
1234    if (myErr) {
1235	myResult = 0;
1236    }
1237    *err = myErr;
1238
1239    /* cache the parent directory(ies) */
1240    for (i = 0; i < ids_to_cache; i++) {
1241	if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1242	    add_node(cache, -1, parent_ids[i], ESRCH);
1243	} else {
1244	    add_node(cache, -1, parent_ids[i], myErr);
1245	}
1246    }
1247
1248    return (myResult);
1249}
1250
1251static int
1252do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1253    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1254{
1255    boolean_t is64bit;
1256
    /*
     * NOTE: on entry, the vnode has an io_ref. In case this vnode
     * happens to be in our list of file_ids, we'll note it so we can
     * avoid calling hfs_chashget_nowait() on that id as that
     * will cause a "locking against myself" panic.
     */
1263    Boolean check_leaf = true;
1264
1265    struct user64_ext_access_t *user_access_structp;
1266    struct user64_ext_access_t tmp_user_access;
1267    struct access_cache cache;
1268
1269    int error = 0, prev_parent_check_ok=1;
1270    unsigned int i;
1271
1272    short flags;
1273    unsigned int num_files = 0;
1274    int map_size = 0;
1275    int num_parents = 0;
1276    int *file_ids=NULL;
1277    short *access=NULL;
1278    char *bitmap=NULL;
1279    cnid_t *parents=NULL;
1280    int leaf_index;
1281
1282    cnid_t cnid;
1283    cnid_t prevParent_cnid = 0;
1284    unsigned int myPerms;
1285    short myaccess = 0;
1286    struct cat_attr cnattr;
1287    CatalogKey catkey;
1288    struct cnode *skip_cp = VTOC(vp);
1289    kauth_cred_t cred = vfs_context_ucred(context);
1290    proc_t p = vfs_context_proc(context);
1291
1292    is64bit = proc_is64bit(p);
1293
1294    /* initialize the local cache and buffers */
1295    cache.numcached = 0;
1296    cache.cachehits = 0;
1297    cache.lookups = 0;
1298    cache.acache = NULL;
1299    cache.haveaccess = NULL;
1300
1301    /* struct copyin done during dispatch... need to copy file_id array separately */
1302    if (ap->a_data == NULL) {
1303	error = EINVAL;
1304	goto err_exit_bulk_access;
1305    }
1306
1307    if (is64bit) {
1308	if (arg_size != sizeof(struct user64_ext_access_t)) {
1309	    error = EINVAL;
1310	    goto err_exit_bulk_access;
1311	}
1312
1313	user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1314
1315    } else if (arg_size == sizeof(struct user32_access_t)) {
1316	struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1317
1318	// convert an old style bulk-access struct to the new style
1319	tmp_user_access.flags     = accessp->flags;
1320	tmp_user_access.num_files = accessp->num_files;
1321	tmp_user_access.map_size  = 0;
1322	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1323	tmp_user_access.bitmap    = USER_ADDR_NULL;
1324	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1325	tmp_user_access.num_parents = 0;
1326	user_access_structp = &tmp_user_access;
1327
1328    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1329	struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1330
1331	// up-cast from a 32-bit version of the struct
1332	tmp_user_access.flags     = accessp->flags;
1333	tmp_user_access.num_files = accessp->num_files;
1334	tmp_user_access.map_size  = accessp->map_size;
1335	tmp_user_access.num_parents  = accessp->num_parents;
1336
1337	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1338	tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
1339	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1340	tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
1341
1342	user_access_structp = &tmp_user_access;
1343    } else {
1344	error = EINVAL;
1345	goto err_exit_bulk_access;
1346    }
1347
1348    map_size = user_access_structp->map_size;
1349
1350    num_files = user_access_structp->num_files;
1351
1352    num_parents= user_access_structp->num_parents;
1353
1354    if (num_files < 1) {
1355	goto err_exit_bulk_access;
1356    }
1357    if (num_files > 1024) {
1358	error = EINVAL;
1359	goto err_exit_bulk_access;
1360    }
1361
1362    if (num_parents > 1024) {
1363	error = EINVAL;
1364	goto err_exit_bulk_access;
1365    }
1366
1367    file_ids = (int *) kalloc(sizeof(int) * num_files);
1368    access = (short *) kalloc(sizeof(short) * num_files);
1369    if (map_size) {
1370	bitmap = (char *) kalloc(sizeof(char) * map_size);
1371    }
1372
1373    if (num_parents) {
1374	parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1375    }
1376
1377    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1378    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1379
1380    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1381	if (file_ids) {
1382	    kfree(file_ids, sizeof(int) * num_files);
1383	}
1384	if (bitmap) {
1385	    kfree(bitmap, sizeof(char) * map_size);
1386	}
1387	if (access) {
1388	    kfree(access, sizeof(short) * num_files);
1389	}
1390	if (cache.acache) {
1391	    kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1392	}
1393	if (cache.haveaccess) {
1394	    kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1395	}
1396	if (parents) {
1397	    kfree(parents, sizeof(cnid_t) * num_parents);
1398	}
1399	return ENOMEM;
1400    }
1401
    // make sure the bitmap is zeroed out...
1403    if (bitmap) {
1404	bzero(bitmap, (sizeof(char) * map_size));
1405    }
1406
1407    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1408		num_files * sizeof(int)))) {
1409	goto err_exit_bulk_access;
1410    }
1411
1412    if (num_parents) {
1413	if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1414		    num_parents * sizeof(cnid_t)))) {
1415	    goto err_exit_bulk_access;
1416	}
1417    }
1418
1419    flags = user_access_structp->flags;
1420    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1421	flags = R_OK;
1422    }
1423
1424    /* check if we've been passed leaf node ids or parent ids */
1425    if (flags & PARENT_IDS_FLAG) {
1426	check_leaf = false;
1427    }
1428
1429    /* Check access to each file_id passed in */
1430    for (i = 0; i < num_files; i++) {
1431	leaf_index=-1;
1432	cnid = (cnid_t) file_ids[i];
1433
1434	/* root always has access */
1435	if ((!parents) && (!suser(cred, NULL))) {
1436	    access[i] = 0;
1437	    continue;
1438	}
1439
1440	if (check_leaf) {
1441	    /* do the lookup (checks the cnode hash, then the catalog) */
1442	    error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1443	    if (error) {
1444		access[i] = (short) error;
1445		continue;
1446	    }
1447
1448	    if (parents) {
1449		// Check if the leaf matches one of the parent scopes
1450		leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1451 		if (leaf_index >= 0 && parents[leaf_index] == cnid)
1452 		    prev_parent_check_ok = 0;
1453 		else if (leaf_index >= 0)
1454 		    prev_parent_check_ok = 1;
1455	    }
1456
1457	    // if the thing has acl's, do the full permission check
1458	    if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1459		struct vnode *cvp;
1460		int myErr = 0;
1461		/* get the vnode for this cnid */
1462		myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1463		if ( myErr ) {
1464		    access[i] = myErr;
1465		    continue;
1466		}
1467
1468		hfs_unlock(VTOC(cvp));
1469
1470		if (vnode_vtype(cvp) == VDIR) {
1471		    myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1472		} else {
1473		    myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1474		}
1475
1476		vnode_put(cvp);
1477		if (myErr) {
1478		    access[i] = myErr;
1479		    continue;
1480		}
1481	    } else {
1482		/* before calling CheckAccess(), check the target file for read access */
1483		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1484		    cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1485
1486		/* fail fast if no access */
1487		if ((myPerms & flags) == 0) {
1488		    access[i] = EACCES;
1489		    continue;
1490		}
1491	    }
1492	} else {
1493	    /* we were passed an array of parent ids */
1494	    catkey.hfsPlus.parentID = cnid;
1495	}
1496
1497	/* if the last guy had the same parent and had access, we're done */
1498 	if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1499	    cache.cachehits++;
1500	    access[i] = 0;
1501	    continue;
1502	}
1503
1504	myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1505	    skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1506
1507	if (myaccess || (error == ESRCH && leaf_index != -1)) {
1508	    access[i] = 0; // have access.. no errors to report
1509	} else {
1510	    access[i] = (error != 0 ? (short) error : EACCES);
1511	}
1512
1513	prevParent_cnid = catkey.hfsPlus.parentID;
1514    }
1515
1516    /* copyout the access array */
1517    if ((error = copyout((caddr_t)access, user_access_structp->access,
1518		num_files * sizeof (short)))) {
1519	goto err_exit_bulk_access;
1520    }
1521    if (map_size && bitmap) {
1522	if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1523		    map_size * sizeof (char)))) {
1524	    goto err_exit_bulk_access;
1525	}
1526    }
1527
1528
1529  err_exit_bulk_access:
1530
1531    if (file_ids)
1532	kfree(file_ids, sizeof(int) * num_files);
1533    if (parents)
1534	kfree(parents, sizeof(cnid_t) * num_parents);
1535    if (bitmap)
1536	kfree(bitmap, sizeof(char) * map_size);
1537    if (access)
1538	kfree(access, sizeof(short) * num_files);
1539    if (cache.acache)
1540	kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1541    if (cache.haveaccess)
1542	kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1543
1544    return (error);
1545}
1546
1547
1548/* end "bulk-access" support */
1549
1550
1551/*
1552 * Callback for use with freeze ioctl.
1553 */
1554static int
1555hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
1556{
1557	vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
1558
1559	return 0;
1560}
1561
1562/*
1563 * Control filesystem operating characteristics.
1564 */
1565int
1566hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1567		vnode_t a_vp;
1568		int  a_command;
1569		caddr_t  a_data;
1570		int  a_fflag;
1571		vfs_context_t a_context;
1572	} */ *ap)
1573{
1574	struct vnode * vp = ap->a_vp;
1575	struct hfsmount *hfsmp = VTOHFS(vp);
1576	vfs_context_t context = ap->a_context;
1577	kauth_cred_t cred = vfs_context_ucred(context);
1578	proc_t p = vfs_context_proc(context);
1579	struct vfsstatfs *vfsp;
1580	boolean_t is64bit;
1581	off_t jnl_start, jnl_size;
1582	struct hfs_journal_info *jip;
1583#if HFS_COMPRESSION
1584	int compressed = 0;
1585	off_t uncompressed_size = -1;
1586	int decmpfs_error = 0;
1587
1588	if (ap->a_command == F_RDADVISE) {
1589		/* we need to inspect the decmpfs state of the file as early as possible */
1590		compressed = hfs_file_is_compressed(VTOC(vp), 0);
1591		if (compressed) {
1592			if (VNODE_IS_RSRC(vp)) {
1593				/* if this is the resource fork, treat it as if it were empty */
1594				uncompressed_size = 0;
1595			} else {
1596				decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1597				if (decmpfs_error != 0) {
1598					/* failed to get the uncompressed size, we'll check for this later */
1599					uncompressed_size = -1;
1600				}
1601			}
1602		}
1603	}
1604#endif /* HFS_COMPRESSION */
1605
1606	is64bit = proc_is64bit(p);
1607
1608#if CONFIG_PROTECT
1609	{
1610		int error = 0;
1611		if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1612			return error;
1613		}
1614	}
1615#endif /* CONFIG_PROTECT */
1616
1617	switch (ap->a_command) {
1618
1619	case HFS_GETPATH:
1620	{
1621		struct vnode *file_vp;
1622		cnid_t  cnid;
1623		int  outlen;
1624		char *bufptr;
1625		int error;
1626		int flags = 0;
1627
1628		/* Caller must be owner of file system. */
1629		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1630		if (suser(cred, NULL) &&
1631			kauth_cred_getuid(cred) != vfsp->f_owner) {
1632			return (EACCES);
1633		}
1634		/* Target vnode must be file system's root. */
1635		if (!vnode_isvroot(vp)) {
1636			return (EINVAL);
1637		}
1638		bufptr = (char *)ap->a_data;
1639		cnid = strtoul(bufptr, NULL, 10);
1640		if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1641			flags |= BUILDPATH_VOLUME_RELATIVE;
1642		}
1643
		/* We need to call hfs_vfs_vget to leverage the code that will
		 * fix the origin list for us if needed, as opposed to calling
		 * hfs_vget, since we will need the parent for the build_path call.
		 */
1648
1649		if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1650			return (error);
1651		}
1652		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1653		vnode_put(file_vp);
1654
1655		return (error);
1656	}
1657
1658	case HFS_GET_WRITE_GEN_COUNTER:
1659	{
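		/*
		 * Return the file's write generation counter.  If the file is
		 * currently mapped writable we bump the counter again and hand
		 * back EBUSY so callers know the value may still change.
		 */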
1660		struct cnode *cp = NULL;
1661		int error;
1662		u_int32_t *counter = (u_int32_t *)ap->a_data;
1663
1664		cp = VTOC(vp);
1665
1666		if (!vnode_isdir(vp) && !(vnode_isreg(vp)) &&
1667				!(vnode_islnk(vp))) {
1668			error = EBADF;
1669			*counter = 0;
1670			return error;
1671		}
1672
1673		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
1674		if (error == 0) {
1675			struct ubc_info *uip;
1676			int is_mapped_writable = 0;
1677
1678			if (UBCINFOEXISTS(vp)) {
1679				uip = vp->v_ubcinfo;
1680				if ((uip->ui_flags & UI_ISMAPPED) && (uip->ui_flags & UI_MAPPEDWRITE)) {
1681					is_mapped_writable = 1;
1682				}
1683			}
1684
1685
1686			if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1687				uint32_t gcount = hfs_get_gencount(cp);
1688				//
				// Even though we return EBUSY for files that are mmap'ed,
1690				// we also want to bump the value so that the write-gen
1691				// counter will always be different once the file is unmapped
1692				// (since the file may be unmapped but the pageouts have not
1693				// yet happened).
1694				//
1695				if (is_mapped_writable) {
1696					hfs_incr_gencount (cp);
1697					gcount = hfs_get_gencount(cp);
1698				}
1699
1700				*counter = gcount;
1701			} else if (S_ISDIR(cp->c_attr.ca_mode)) {
1702				*counter = hfs_get_gencount(cp);
1703			} else {
1704				/* not a file or dir? silently return */
1705				*counter = 0;
1706			}
1707			hfs_unlock (cp);
1708
1709			if (is_mapped_writable) {
1710				error = EBUSY;
1711			}
1712		}
1713
1714		return error;
1715	}
1716
1717	case HFS_GET_DOCUMENT_ID:
1718	{
1719		struct cnode *cp = NULL;
1720		int error=0;
1721		u_int32_t *document_id = (u_int32_t *)ap->a_data;
1722
1723		cp = VTOC(vp);
1724
1725		if (cp->c_desc.cd_cnid == kHFSRootFolderID) {
1726			// the root-dir always has document id '2' (aka kHFSRootFolderID)
1727			*document_id = kHFSRootFolderID;
1728
1729		} else if ((S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode))) {
1730			int mark_it = 0;
1731			uint32_t tmp_doc_id;
1732
1733			//
			// we can use the FndrExtendedFileInfo because the doc-id is the first
			// thing in both it and the FndrExtendedDirInfo struct, which is fixed
			// in format and cannot change layout
1737			//
1738			struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1739
1740			hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
1741
1742			//
1743			// if the cnode isn't UF_TRACKED and the doc-id-allocate flag isn't set
1744			// then just return a zero for the doc-id
1745			//
1746			if (!(cp->c_bsdflags & UF_TRACKED) && !(ap->a_fflag & HFS_DOCUMENT_ID_ALLOCATE)) {
1747				*document_id = 0;
1748				hfs_unlock(cp);
1749				return 0;
1750			}
1751
1752			//
1753			// if the cnode isn't UF_TRACKED and the doc-id-allocate flag IS set,
1754			// then set mark_it so we know to set the UF_TRACKED flag once the
1755			// cnode is locked.
1756			//
1757			if (!(cp->c_bsdflags & UF_TRACKED) && (ap->a_fflag & HFS_DOCUMENT_ID_ALLOCATE)) {
1758				mark_it = 1;
1759			}
1760
1761			tmp_doc_id = extinfo->document_id;   // get a copy of this
1762
1763			hfs_unlock(cp);   // in case we have to call hfs_generate_document_id()
1764
1765			//
1766			// If the document_id isn't set, get a new one and then set it.
1767			// Note: we first get the document id, then lock the cnode to
1768			// avoid any deadlock potential between cp and the root vnode.
1769			//
1770			uint32_t new_id;
1771			if (tmp_doc_id == 0 && (error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1772
1773				if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) == 0) {
1774					extinfo->document_id = tmp_doc_id = new_id;
1775					//printf("ASSIGNING: doc-id %d to ino %d\n", extinfo->document_id, cp->c_fileid);
1776
1777					if (mark_it) {
1778						cp->c_bsdflags |= UF_TRACKED;
1779					}
1780
1781					// mark the cnode dirty
1782					cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1783
1784					int lockflags;
1785					if ((error = hfs_start_transaction(hfsmp)) == 0) {
1786						lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1787
1788						(void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1789
1790						hfs_systemfile_unlock (hfsmp, lockflags);
1791						(void) hfs_end_transaction(hfsmp);
1792					}
1793
1794#if CONFIG_FSE
1795					add_fsevent(FSE_DOCID_CHANGED, context,
1796						    FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
1797						    FSE_ARG_INO,   (ino64_t)0,             // src inode #
1798						    FSE_ARG_INO,   (ino64_t)cp->c_fileid,  // dst inode #
1799						    FSE_ARG_INT32, extinfo->document_id,
1800						    FSE_ARG_DONE);
1801
1802					hfs_unlock (cp);    // so we can send the STAT_CHANGED event without deadlocking
1803
1804					if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1805						add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1806					}
1807#else
1808					hfs_unlock (cp);
1809#endif
1810				}
1811			}
1812
1813			*document_id = tmp_doc_id;
1814		} else {
1815			*document_id = 0;
1816		}
1817
1818		return error;
1819	}
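
	/*
	 * Illustrative userland sketch for the case above (assumptions noted,
	 * not part of this file): the document-id is fetched with a single
	 * fsctl/ffsctl call; whether a new id is allocated on demand is governed
	 * by the HFS_DOCUMENT_ID_ALLOCATE flag tested against ap->a_fflag above.
	 * Roughly:
	 *
	 *     u_int32_t doc_id = 0;
	 *     if (ffsctl(fd, HFSIOC_GET_DOCUMENT_ID, &doc_id, 0) == 0 && doc_id != 0) {
	 *         // the file or directory is tracked under 'doc_id'
	 *     }
	 */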
1820
1821	case HFS_TRANSFER_DOCUMENT_ID:
1822	{
1823		struct cnode *cp = NULL;
1824		int error;
1825		u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1826		struct fileproc *to_fp;
1827		struct vnode *to_vp;
1828		struct cnode *to_cp;
1829
1830		cp = VTOC(vp);
1831
1832		if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1833			//printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1834			return error;
1835		}
1836		if ( (error = vnode_getwithref(to_vp)) ) {
1837			file_drop(to_fd);
1838			return error;
1839		}
1840
1841		if (VTOHFS(to_vp) != hfsmp) {
1842			error = EXDEV;
1843			goto transfer_cleanup;
1844		}
1845
1846		int need_unlock = 1;
1847		to_cp = VTOC(to_vp);
1848		error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1849		if (error != 0) {
1850			//printf("could not lock the pair of cnodes (error %d)\n", error);
1851			goto transfer_cleanup;
1852		}
1853
1854		if (!(cp->c_bsdflags & UF_TRACKED)) {
1855			error = EINVAL;
1856		} else if (to_cp->c_bsdflags & UF_TRACKED) {
1857			//
1858			// if the destination is already tracked, return an error
1859			// as otherwise it's a silent deletion of the target's
1860			// document-id
1861			//
1862			error = EEXIST;
1863		} else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1864			//
1865			// we can use the FndrExtendedFileInfo because the doc-id is the first
1866			// thing in both it and the FndrExtendedDirInfo struct, which is fixed in
1867			// format and cannot change layout
1868			//
1869			struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1870			struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1871
1872			if (f_extinfo->document_id == 0) {
1873				uint32_t new_id;
1874
1875				hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id
1876
1877				if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1878					//
1879					// re-lock the pair now that we have the document-id
1880					//
1881					hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1882					f_extinfo->document_id = new_id;
1883				} else {
1884					goto transfer_cleanup;
1885				}
1886			}
1887
1888			to_extinfo->document_id = f_extinfo->document_id;
1889			f_extinfo->document_id = 0;
1890			//printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1891
1892			// make sure the destination is also UF_TRACKED
1893			to_cp->c_bsdflags |= UF_TRACKED;
1894			cp->c_bsdflags &= ~UF_TRACKED;
1895
1896			// mark the cnodes dirty
1897			cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1898			to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1899
1900			int lockflags;
1901			if ((error = hfs_start_transaction(hfsmp)) == 0) {
1902
1903				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1904
1905				(void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1906				(void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1907
1908				hfs_systemfile_unlock (hfsmp, lockflags);
1909				(void) hfs_end_transaction(hfsmp);
1910			}
1911
1912#if CONFIG_FSE
1913			add_fsevent(FSE_DOCID_CHANGED, context,
1914				    FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
1915				    FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
1916				    FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
1917				    FSE_ARG_INT32, to_extinfo->document_id,
1918				    FSE_ARG_DONE);
1919
1920			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
1921			need_unlock = 0;
1922
1923			if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1924				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1925			}
1926			if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1927				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1928			}
1929#else
1930			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
1931			need_unlock = 0;
1932#endif
1933		}
1934
1935		if (need_unlock) {
1936			hfs_unlockpair(cp, to_cp);
1937		}
1938
1939	transfer_cleanup:
1940		vnode_put(to_vp);
1941		file_drop(to_fd);
1942
1943		return error;
1944	}
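
	/*
	 * Illustrative userland sketch for the case above (assumptions noted,
	 * not part of this file): the fsctl is issued against the source file,
	 * and the data argument is the file descriptor of the destination that
	 * should inherit the document-id and UF_TRACKED flag.  Roughly:
	 *
	 *     u_int32_t dst = (u_int32_t)dst_fd;   // hypothetical destination fd
	 *     int ret = ffsctl(src_fd, HFSIOC_TRANSFER_DOCUMENT_ID, &dst, 0);
	 *     // EXDEV if the two files live on different HFS volumes,
	 *     // EEXIST if the destination is already tracked (see above).
	 */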
1945
1946	case HFS_PREV_LINK:
1947	case HFS_NEXT_LINK:
1948	{
1949		cnid_t linkfileid;
1950		cnid_t nextlinkid;
1951		cnid_t prevlinkid;
1952		int error;
1953
1954		/* Caller must be owner of file system. */
1955		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1956		if (suser(cred, NULL) &&
1957			kauth_cred_getuid(cred) != vfsp->f_owner) {
1958			return (EACCES);
1959		}
1960		/* Target vnode must be file system's root. */
1961		if (!vnode_isvroot(vp)) {
1962			return (EINVAL);
1963		}
1964		linkfileid = *(cnid_t *)ap->a_data;
1965		if (linkfileid < kHFSFirstUserCatalogNodeID) {
1966			return (EINVAL);
1967		}
1968		if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1969			return (error);
1970		}
1971		if (ap->a_command == HFS_NEXT_LINK) {
1972			*(cnid_t *)ap->a_data = nextlinkid;
1973		} else {
1974			*(cnid_t *)ap->a_data = prevlinkid;
1975		}
1976		return (0);
1977	}
1978
1979	case HFS_RESIZE_PROGRESS: {
1980
1981		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1982		if (suser(cred, NULL) &&
1983			kauth_cred_getuid(cred) != vfsp->f_owner) {
1984			return (EACCES); /* must be owner of file system */
1985		}
1986		if (!vnode_isvroot(vp)) {
1987			return (EINVAL);
1988		}
1989		/* file system must not be mounted read-only */
1990		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1991			return (EROFS);
1992		}
1993
1994		return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1995	}
1996
1997	case HFS_RESIZE_VOLUME: {
1998		u_int64_t newsize;
1999		u_int64_t cursize;
2000
2001		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2002		if (suser(cred, NULL) &&
2003			kauth_cred_getuid(cred) != vfsp->f_owner) {
2004			return (EACCES); /* must be owner of file system */
2005		}
2006		if (!vnode_isvroot(vp)) {
2007			return (EINVAL);
2008		}
2009
2010		/* filesystem must not be mounted read only */
2011		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2012			return (EROFS);
2013		}
2014		newsize = *(u_int64_t *)ap->a_data;
2015		cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
2016
2017		if (newsize > cursize) {
2018			return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
2019		} else if (newsize < cursize) {
2020			return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
2021		} else {
2022			return (0);
2023		}
2024	}
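
	/*
	 * Illustrative userland sketch for the case above (assumptions noted):
	 * the new total volume size, in bytes, is passed by path against the
	 * volume root; the comparison with the current size above decides
	 * whether to extend, truncate, or do nothing.  Roughly:
	 *
	 *     u_int64_t newsize = 250ULL * 1024 * 1024 * 1024;   // e.g. 250 GiB
	 *     int ret = fsctl("/Volumes/MyVol", HFSIOC_RESIZE_VOLUME, &newsize, 0);
	 */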
2025	case HFS_CHANGE_NEXT_ALLOCATION: {
2026		int error = 0;		/* Assume success */
2027		u_int32_t location;
2028
2029		if (vnode_vfsisrdonly(vp)) {
2030			return (EROFS);
2031		}
2032		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2033		if (suser(cred, NULL) &&
2034			kauth_cred_getuid(cred) != vfsp->f_owner) {
2035			return (EACCES); /* must be owner of file system */
2036		}
2037		if (!vnode_isvroot(vp)) {
2038			return (EINVAL);
2039		}
2040		hfs_lock_mount(hfsmp);
2041		location = *(u_int32_t *)ap->a_data;
2042		if ((location >= hfsmp->allocLimit) &&
2043			(location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
2044			error = EINVAL;
2045			goto fail_change_next_allocation;
2046		}
2047		/* Return previous value. */
2048		*(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
2049		if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
2050			/* On magic value for location, set nextAllocation to next block
2051			 * after metadata zone and set flag in mount structure to indicate
2052			 * that nextAllocation should not be updated again.
2053			 */
2054			if (hfsmp->hfs_metazone_end != 0) {
2055				HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
2056			}
2057			hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
2058		} else {
2059			hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
2060			HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
2061		}
2062		MarkVCBDirty(hfsmp);
2063fail_change_next_allocation:
2064		hfs_unlock_mount(hfsmp);
2065		return (error);
2066	}
2067
2068#if HFS_SPARSE_DEV
2069	case HFS_SETBACKINGSTOREINFO: {
2070		struct vnode * bsfs_rootvp;
2071		struct vnode * di_vp;
2072		struct hfs_backingstoreinfo *bsdata;
2073		int error = 0;
2074
2075		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2076			return (EROFS);
2077		}
2078		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
2079			return (EALREADY);
2080		}
2081		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2082		if (suser(cred, NULL) &&
2083			kauth_cred_getuid(cred) != vfsp->f_owner) {
2084			return (EACCES); /* must be owner of file system */
2085		}
2086		bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
2087		if (bsdata == NULL) {
2088			return (EINVAL);
2089		}
2090		if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
2091			return (error);
2092		}
2093		if ((error = vnode_getwithref(di_vp))) {
2094			file_drop(bsdata->backingfd);
2095			return(error);
2096		}
2097
2098		if (vnode_mount(vp) == vnode_mount(di_vp)) {
2099			(void)vnode_put(di_vp);
2100			file_drop(bsdata->backingfd);
2101			return (EINVAL);
2102		}
2103
2104		/*
2105		 * Obtain the backing fs root vnode and keep a reference
2106		 * on it.  This reference will be dropped in hfs_unmount.
2107		 */
2108		error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
2109		if (error) {
2110			(void)vnode_put(di_vp);
2111			file_drop(bsdata->backingfd);
2112			return (error);
2113		}
2114		vnode_ref(bsfs_rootvp);
2115		vnode_put(bsfs_rootvp);
2116
2117		hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
2118
2119		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
2120		/* The free extent cache is managed differently for sparse devices.
2121		 * There is a window between when the volume is mounted and when the
2122		 * device is marked as sparse, so the free extent cache for this
2123		 * volume is initially set up as for a normal volume (sorted by block
2124		 * count).  Reset the cache so that it will be rebuilt
2125		 * for a sparse device (sorted by start block).
2126		 */
2127		ResetVCBFreeExtCache(hfsmp);
2128
2129		hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
2130		hfsmp->hfs_sparsebandblks *= 4;
2131
2132		/* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
2133
2134		/*
2135		 * If the sparse image is on a sparse image file (as opposed to a sparse
2136		 * bundle), then we may need to limit the free space to the maximum size
2137		 * of a file on that volume.  So we query (using pathconf), and if we get
2138		 * a meaningful result, we cache the number of blocks for later use in
2139		 * hfs_freeblks().
2140		 */
2141		hfsmp->hfs_backingfs_maxblocks = 0;
2142		if (vnode_vtype(di_vp) == VREG) {
2143			int terr;
2144			int hostbits;
2145			terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
2146			if (terr == 0 && hostbits != 0 && hostbits < 64) {
2147				u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
2148
2149				hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
2150			}
2151		}
2152
2153		(void)vnode_put(di_vp);
2154		file_drop(bsdata->backingfd);
2155		return (0);
2156	}
2157	case HFS_CLRBACKINGSTOREINFO: {
2158		struct vnode * tmpvp;
2159
2160		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2161		if (suser(cred, NULL) &&
2162			kauth_cred_getuid(cred) != vfsp->f_owner) {
2163			return (EACCES); /* must be owner of file system */
2164		}
2165		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2166			return (EROFS);
2167		}
2168
2169		if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
2170		    hfsmp->hfs_backingfs_rootvp) {
2171
2172			hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2173			tmpvp = hfsmp->hfs_backingfs_rootvp;
2174			hfsmp->hfs_backingfs_rootvp = NULLVP;
2175			hfsmp->hfs_sparsebandblks = 0;
2176			vnode_rele(tmpvp);
2177		}
2178		return (0);
2179	}
2180#endif /* HFS_SPARSE_DEV */
2181
2182	/* Change the next CNID stored in the VH */
2183	case HFS_CHANGE_NEXTCNID: {
2184		int error = 0;		/* Assume success */
2185		u_int32_t fileid;
2186		int wraparound = 0;
2187		int lockflags = 0;
2188
2189		if (vnode_vfsisrdonly(vp)) {
2190			return (EROFS);
2191		}
2192		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2193		if (suser(cred, NULL) &&
2194			kauth_cred_getuid(cred) != vfsp->f_owner) {
2195			return (EACCES); /* must be owner of file system */
2196		}
2197
2198		fileid = *(u_int32_t *)ap->a_data;
2199
2200		/* Must have catalog lock excl. to advance the CNID pointer */
2201		lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2202
2203		hfs_lock_mount(hfsmp);
2204
2205		/* If it is less than the current next CNID, force the wraparound bit to be set */
2206		if (fileid < hfsmp->vcbNxtCNID) {
2207			wraparound=1;
2208		}
2209
2210		/* Return previous value. */
2211		*(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2212
2213		hfsmp->vcbNxtCNID = fileid;
2214
2215		if (wraparound) {
2216			hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2217		}
2218
2219		MarkVCBDirty(hfsmp);
2220		hfs_unlock_mount(hfsmp);
2221		hfs_systemfile_unlock (hfsmp, lockflags);
2222
2223		return (error);
2224	}
2225
2226	case F_FREEZE_FS: {
2227		struct mount *mp;
2228
2229		mp = vnode_mount(vp);
2230		hfsmp = VFSTOHFS(mp);
2231
2232		if (!(hfsmp->jnl))
2233			return (ENOTSUP);
2234
2235		vfsp = vfs_statfs(mp);
2236
2237		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2238			!kauth_cred_issuser(cred))
2239			return (EACCES);
2240
2241		lck_rw_lock_exclusive(&hfsmp->hfs_insync);
2242
2243		// flush things before we get started to try and prevent
2244		// dirty data from being paged out while we're frozen.
2245		// note: can't do this after taking the lock as it will
2246		// deadlock against ourselves.
2247		vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
2248		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
2249
2250		// DO NOT call hfs_journal_flush() because that takes a shared
2251		// lock on the global lock, which we already hold exclusively!
2252		journal_flush(hfsmp->jnl, TRUE);
2253
2254		// don't need to iterate on all vnodes, we just need to
2255		// wait for writes to the system files and the device vnode
2256		//
2257		// Now that journal flush waits for all metadata blocks to
2258		// be written out, waiting for btree writes is probably no
2259		// longer required.
2260		if (HFSTOVCB(hfsmp)->extentsRefNum)
2261		    vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
2262		if (HFSTOVCB(hfsmp)->catalogRefNum)
2263		    vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
2264		if (HFSTOVCB(hfsmp)->allocationsRefNum)
2265		    vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
2266		if (hfsmp->hfs_attribute_vp)
2267		    vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
2268		vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
2269
2270		hfsmp->hfs_freezing_proc = current_proc();
2271
2272		return (0);
2273	}
2274
2275	case F_THAW_FS: {
2276		vfsp = vfs_statfs(vnode_mount(vp));
2277		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2278			!kauth_cred_issuser(cred))
2279			return (EACCES);
2280
2281		// if we're not the one who froze the fs then we
2282		// can't thaw it.
2283		if (hfsmp->hfs_freezing_proc != current_proc()) {
2284		    return EPERM;
2285		}
2286
2287		// NOTE: if you add code here, also go check the
2288		//       code that "thaws" the fs in hfs_vnop_close()
2289		//
2290		hfsmp->hfs_freezing_proc = NULL;
2291		hfs_unlock_global (hfsmp);
2292		lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
2293
2294		return (0);
2295	}
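
	/*
	 * Illustrative userland sketch for the freeze/thaw pair above (an
	 * assumption-laden example): both are fcntl(2) commands, restricted to
	 * the volume owner or root, and the thaw must come from the same
	 * process that froze the volume (hfs_freezing_proc check above).
	 *
	 *     int fd = open("/Volumes/MyVol", O_RDONLY);
	 *     if (fd >= 0 && fcntl(fd, F_FREEZE_FS, 0) == 0) {
	 *         // ... take a device-level snapshot, etc. ...
	 *         (void) fcntl(fd, F_THAW_FS, 0);
	 *     }
	 *     close(fd);
	 *
	 * If the freezing process exits without thawing, the thaw happens in
	 * hfs_vnop_close() (see the note in F_THAW_FS above).
	 */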
2296
2297	case HFS_BULKACCESS_FSCTL: {
2298	    int size;
2299
2300	    if (hfsmp->hfs_flags & HFS_STANDARD) {
2301		return EINVAL;
2302	    }
2303
2304	    if (is64bit) {
2305		size = sizeof(struct user64_access_t);
2306	    } else {
2307		size = sizeof(struct user32_access_t);
2308	    }
2309
2310	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
2311	}
2312
2313	case HFS_EXT_BULKACCESS_FSCTL: {
2314	    int size;
2315
2316	    if (hfsmp->hfs_flags & HFS_STANDARD) {
2317		return EINVAL;
2318	    }
2319
2320	    if (is64bit) {
2321		size = sizeof(struct user64_ext_access_t);
2322	    } else {
2323		size = sizeof(struct user32_ext_access_t);
2324	    }
2325
2326	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
2327	}
2328
2329	case HFS_SET_XATTREXTENTS_STATE: {
2330		int state;
2331
2332		if (ap->a_data == NULL) {
2333			return (EINVAL);
2334		}
2335
2336		state = *(int *)ap->a_data;
2337
2338		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2339			return (EROFS);
2340		}
2341
2342		/* Super-user can enable or disable extent-based extended
2343		 * attribute support on a volume.
2344		 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2345		 * are enabled by default, so any change is transient and only lasts
2346		 * until the volume is remounted.
2347		 */
2348		if (!kauth_cred_issuser(kauth_cred_get())) {
2349			return (EPERM);
2350		}
2351		if (state == 0 || state == 1)
2352			return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2353		else
2354			return (EINVAL);
2355	}
2356
2357	case F_SETSTATICCONTENT: {
2358		int error;
2359		int enable_static = 0;
2360		struct cnode *cp = NULL;
2361		/*
2362		 * lock the cnode, decorate the cnode flag, and bail out.
2363		 * VFS should have already authenticated the caller for us.
2364		 */
2365
2366		if (ap->a_data) {
2367			/*
2368			 * Note that even though ap->a_data is of type caddr_t,
2369			 * the fcntl layer at the syscall handler will pass in NULL
2370			 * or 1 depending on what the argument supplied to the fcntl
2371			 * was.  So it is in fact correct to check the ap->a_data
2372			 * argument for zero or non-zero value when deciding whether or not
2373			 * to enable the static bit in the cnode.
2374			 */
2375			enable_static = 1;
2376		}
2377		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2378			return EROFS;
2379		}
2380		cp = VTOC(vp);
2381
2382		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2383		if (error == 0) {
2384			if (enable_static) {
2385				cp->c_flag |= C_SSD_STATIC;
2386			}
2387			else {
2388				cp->c_flag &= ~C_SSD_STATIC;
2389			}
2390			hfs_unlock (cp);
2391		}
2392		return error;
2393	}
2394
2395	case F_SET_GREEDY_MODE: {
2396		int error;
2397		int enable_greedy_mode = 0;
2398		struct cnode *cp = NULL;
2399		/*
2400		 * lock the cnode, decorate the cnode flag, and bail out.
2401		 * VFS should have already authenticated the caller for us.
2402		 */
2403
2404		if (ap->a_data) {
2405			/*
2406			 * Note that even though ap->a_data is of type caddr_t,
2407			 * the fcntl layer at the syscall handler will pass in NULL
2408			 * or 1 depending on what the argument supplied to the fcntl
2409			 * was.  So it is in fact correct to check the ap->a_data
2410			 * argument for zero or non-zero value when deciding whether or not
2411			 * to enable the greedy mode bit in the cnode.
2412			 */
2413			enable_greedy_mode = 1;
2414		}
2415		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2416			return EROFS;
2417		}
2418		cp = VTOC(vp);
2419
2420		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2421		if (error == 0) {
2422			if (enable_greedy_mode) {
2423				cp->c_flag |= C_SSD_GREEDY_MODE;
2424			}
2425			else {
2426				cp->c_flag &= ~C_SSD_GREEDY_MODE;
2427			}
2428			hfs_unlock (cp);
2429		}
2430		return error;
2431	}
2432
2433	case F_MAKECOMPRESSED: {
2434		int error = 0;
2435		uint32_t gen_counter;
2436		struct cnode *cp = NULL;
2437		int reset_decmp = 0;
2438
2439		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2440			return EROFS;
2441		}
2442
2443		/*
2444		 * acquire & lock the cnode.
2445		 * VFS should have already authenticated the caller for us.
2446		 */
2447
2448		if (ap->a_data) {
2449			/*
2450			 * Cast the pointer into a uint32_t so we can extract the
2451			 * supplied generation counter.
2452			 */
2453			gen_counter = *((uint32_t*)ap->a_data);
2454		}
2455		else {
2456			return EINVAL;
2457		}
2458
2459#if HFS_COMPRESSION
2460		cp = VTOC(vp);
2461		/* Grab truncate lock first; we may truncate the file */
2462		hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2463
2464		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2465		if (error) {
2466			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2467			return error;
2468		}
2469
2470		/* Are there any other usecounts/FDs? */
2471		if (vnode_isinuse(vp, 1)) {
2472			hfs_unlock(cp);
2473			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2474			return EBUSY;
2475		}
2476
2477
2478		/* now we have the cnode locked down; Validate arguments */
2479		if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2480			/* EINVAL if you are trying to manipulate an IMMUTABLE file */
2481			hfs_unlock(cp);
2482			hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2483			return EINVAL;
2484		}
2485
2486		if ((hfs_get_gencount (cp)) == gen_counter) {
2487			/*
2488			 * OK, the gen_counter matched.  Go for it:
2489			 * Toggle state bits, truncate file, and suppress mtime update
2490			 */
2491			reset_decmp = 1;
2492			cp->c_bsdflags |= UF_COMPRESSED;
2493
2494			error = hfs_truncate(vp, 0, IO_NDELAY, 0, (HFS_TRUNCATE_SKIPTIMES), ap->a_context);
2495		}
2496		else {
2497			error = ESTALE;
2498		}
2499
2500		/* Unlock the cnode before calling into decmpfs; it may need to get an EA */
2501		hfs_unlock(cp);
2502
2503		/*
2504		 * Reset the decmp state while still holding the truncate lock. We need to
2505		 * serialize here against a listxattr on this node which may occur at any
2506		 * time.
2507		 *
2508		 * Even if '0/skiplock' is passed as the 2nd argument to hfs_file_is_compressed,
2509		 * that call may still need to get the com.apple.decmpfs EA. If the
2510		 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2511		 * generic (through VFS) and can't be told that we're already holding
2512		 * the cnode lock. If we don't serialize, then we risk listxattr stopping
2513		 * and trying to fill in the hfs_file_is_compressed info during the callback
2514		 * operation, which will result in a deadlock against the b-tree node.
2515		 *
2516		 * So, to serialize against listxattr (which will grab buf_t meta references on
2517		 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2518		 * decmpfs payload.
2519		 */
2520		if ((reset_decmp) && (error == 0)) {
2521			decmpfs_cnode *dp = VTOCMP (vp);
2522			if (dp != NULL) {
2523				decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2524			}
2525
2526			/* Initialize the decmpfs node as needed */
2527			(void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2528		}
2529
2530		hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2531
2532#endif
2533		return error;
2534	}
2535
2536	case F_SETBACKINGSTORE: {
2537
2538		int error = 0;
2539
2540		/*
2541		 * See the comment in F_SETSTATICCONTENT re: using
2542		 * a null check for a_data.
2543		 */
2544		if (ap->a_data) {
2545			error = hfs_set_backingstore (vp, 1);
2546		}
2547		else {
2548			error = hfs_set_backingstore (vp, 0);
2549		}
2550
2551		return error;
2552	}
2553
2554	case F_GETPATH_MTMINFO: {
2555		int error = 0;
2556
2557		int *data = (int*) ap->a_data;
2558
2559		/* Ask if this is a backingstore vnode */
2560		error = hfs_is_backingstore (vp, data);
2561
2562		return error;
2563	}
2564
2565	case F_FULLFSYNC: {
2566		int error;
2567
2568		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2569			return (EROFS);
2570		}
2571		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2572		if (error == 0) {
2573			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
2574			hfs_unlock(VTOC(vp));
2575		}
2576
2577		return error;
2578	}
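
	/*
	 * Illustrative userland sketch for the case above: F_FULLFSYNC is the
	 * public fcntl(2) command that asks for data to be pushed all the way
	 * to the media, which this case implements by calling hfs_fsync() with
	 * its full-sync flag set (the TRUE argument above).  Roughly:
	 *
	 *     if (fcntl(fd, F_FULLFSYNC, 0) == -1) {
	 *         // not supported on this filesystem/device; fall back
	 *         (void) fsync(fd);
	 *     }
	 */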
2579
2580	case F_CHKCLEAN: {
2581		register struct cnode *cp;
2582		int error;
2583
2584		if (!vnode_isreg(vp))
2585			return EINVAL;
2586
2587		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2588		if (error == 0) {
2589			cp = VTOC(vp);
2590			/*
2591			 * Used by a regression test to determine whether
2592			 * all the dirty pages (via write) have been cleaned
2593			 * after a call to 'fsync'.
2594			 */
2595			error = is_file_clean(vp, VTOF(vp)->ff_size);
2596			hfs_unlock(cp);
2597		}
2598		return (error);
2599	}
2600
2601	case F_RDADVISE: {
2602		register struct radvisory *ra;
2603		struct filefork *fp;
2604		int error;
2605
2606		if (!vnode_isreg(vp))
2607			return EINVAL;
2608
2609		ra = (struct radvisory *)(ap->a_data);
2610		fp = VTOF(vp);
2611
2612		/* Protect against a size change. */
2613		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2614
2615#if HFS_COMPRESSION
2616		if (compressed && (uncompressed_size == -1)) {
2617			/* fetching the uncompressed size failed above, so return the error */
2618			error = decmpfs_error;
2619		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2620				   (!compressed && (ra->ra_offset >= fp->ff_size))) {
2621			error = EFBIG;
2622		}
2623#else /* HFS_COMPRESSION */
2624		if (ra->ra_offset >= fp->ff_size) {
2625			error = EFBIG;
2626		}
2627#endif /* HFS_COMPRESSION */
2628		else {
2629			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2630		}
2631
2632		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2633		return (error);
2634	}
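
	/*
	 * Illustrative userland sketch for the case above: F_RDADVISE takes a
	 * struct radvisory naming the byte range to pre-read; the offset is
	 * bounds-checked above against the (possibly uncompressed) file size
	 * before advisory_read() is issued.  Roughly:
	 *
	 *     struct radvisory ra;
	 *     ra.ra_offset = 0;          // start of the range to read ahead
	 *     ra.ra_count  = 1 << 20;    // ask for 1 MiB
	 *     (void) fcntl(fd, F_RDADVISE, &ra);
	 */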
2635
2636	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
2637	{
2638		if (is64bit) {
2639			*(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2640		}
2641		else {
2642			*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2643		}
2644		return 0;
2645	}
2646
2647	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2648	    *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2649	    break;
2650
2651	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2652	    *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2653	    break;
2654
2655	case HFS_FSCTL_GET_VERY_LOW_DISK:
2656	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2657	    break;
2658
2659	case HFS_FSCTL_SET_VERY_LOW_DISK:
2660	    if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2661		return EINVAL;
2662	    }
2663
2664	    hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2665	    break;
2666
2667	case HFS_FSCTL_GET_LOW_DISK:
2668	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2669	    break;
2670
2671	case HFS_FSCTL_SET_LOW_DISK:
2672	    if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2673		|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2674
2675		return EINVAL;
2676	    }
2677
2678	    hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2679	    break;
2680
2681	case HFS_FSCTL_GET_DESIRED_DISK:
2682	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2683	    break;
2684
2685	case HFS_FSCTL_SET_DESIRED_DISK:
2686	    if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2687		return EINVAL;
2688	    }
2689
2690	    hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2691	    break;
2692
2693	case HFS_VOLUME_STATUS:
2694	    *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2695	    break;
2696
2697	case HFS_SET_BOOT_INFO:
2698		if (!vnode_isvroot(vp))
2699			return(EINVAL);
2700		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2701			return(EACCES);	/* must be superuser or owner of filesystem */
2702		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2703			return (EROFS);
2704		}
2705		hfs_lock_mount (hfsmp);
2706		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2707		hfs_unlock_mount (hfsmp);
2708		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2709		break;
2710
2711	case HFS_GET_BOOT_INFO:
2712		if (!vnode_isvroot(vp))
2713			return(EINVAL);
2714		hfs_lock_mount (hfsmp);
2715		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2716		hfs_unlock_mount(hfsmp);
2717		break;
2718
2719	case HFS_MARK_BOOT_CORRUPT:
2720		/* Mark the boot volume corrupt by setting
2721		 * kHFSVolumeInconsistentBit in the volume header.  This will
2722		 * force fsck_hfs on next mount.
2723		 */
2724		if (!kauth_cred_issuser(kauth_cred_get())) {
2725			return EACCES;
2726		}
2727
2728		/* Allowed only on the root vnode of the boot volume */
2729		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2730		    !vnode_isvroot(vp)) {
2731			return EINVAL;
2732		}
2733		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2734			return (EROFS);
2735		}
2736		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2737		hfs_mark_volume_inconsistent(hfsmp);
2738		break;
2739
2740	case HFS_FSCTL_GET_JOURNAL_INFO:
2741		jip = (struct hfs_journal_info*)ap->a_data;
2742
2743		if (vp == NULLVP)
2744		        return EINVAL;
2745
2746	    if (hfsmp->jnl == NULL) {
2747			jnl_start = 0;
2748			jnl_size  = 0;
2749	    } else {
2750			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
2751			jnl_size  = (off_t)hfsmp->jnl_size;
2752	    }
2753
2754		jip->jstart = jnl_start;
2755		jip->jsize = jnl_size;
2756		break;
2757
2758	case HFS_SET_ALWAYS_ZEROFILL: {
2759	    struct cnode *cp = VTOC(vp);
2760
2761	    if (*(int *)ap->a_data) {
2762		cp->c_flag |= C_ALWAYS_ZEROFILL;
2763	    } else {
2764		cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2765	    }
2766	    break;
2767	}
2768
2769	case HFS_DISABLE_METAZONE: {
2770		/* Only root can disable metadata zone */
2771		if (!kauth_cred_issuser(kauth_cred_get())) {
2772			return EACCES;
2773		}
2774		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2775			return (EROFS);
2776		}
2777
2778		/* Disable metadata zone now */
2779		(void) hfs_metadatazone_init(hfsmp, true);
2780		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2781		break;
2782	}
2783
2784	default:
2785		return (ENOTTY);
2786	}
2787
2788	return 0;
2789}
2790
2791/*
2792 * select
2793 */
2794int
2795hfs_vnop_select(__unused struct vnop_select_args *ap)
2796/*
2797	struct vnop_select_args {
2798		vnode_t a_vp;
2799		int  a_which;
2800		int  a_fflags;
2801		void *a_wql;
2802		vfs_context_t a_context;
2803	};
2804*/
2805{
2806	/*
2807	 * We should really check to see if I/O is possible.
2808	 */
2809	return (1);
2810}
2811
2812/*
2813 * Converts a logical block number to a physical block, and optionally returns
2814 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2815 * The physical block number is based on the device block size, which is currently 512 bytes.
2816 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
2817 */
2818int
2819hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2820{
2821	struct filefork *fp = VTOF(vp);
2822	struct hfsmount *hfsmp = VTOHFS(vp);
2823	int  retval = E_NONE;
2824	u_int32_t  logBlockSize;
2825	size_t  bytesContAvail = 0;
2826	off_t  blockposition;
2827	int lockExtBtree;
2828	int lockflags = 0;
2829
2830	/*
2831	 * Check for underlying vnode requests and ensure that logical
2832	 * to physical mapping is requested.
2833	 */
2834	if (vpp != NULL)
2835		*vpp = hfsmp->hfs_devvp;
2836	if (bnp == NULL)
2837		return (0);
2838
2839	logBlockSize = GetLogicalBlockSize(vp);
2840	blockposition = (off_t)bn * logBlockSize;
2841
2842	lockExtBtree = overflow_extents(fp);
2843
2844	if (lockExtBtree)
2845		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2846
2847	retval = MacToVFSError(
2848                            MapFileBlockC (HFSTOVCB(hfsmp),
2849                                            (FCB*)fp,
2850                                            MAXPHYSIO,
2851                                            blockposition,
2852                                            bnp,
2853                                            &bytesContAvail));
2854
2855	if (lockExtBtree)
2856		hfs_systemfile_unlock(hfsmp, lockflags);
2857
2858	if (retval == E_NONE) {
2859		/* Figure out how many read ahead blocks there are */
2860		if (runp != NULL) {
2861			if (can_cluster(logBlockSize)) {
2862				/* Make sure this result never goes negative: */
2863				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2864			} else {
2865				*runp = 0;
2866			}
2867		}
2868	}
2869	return (retval);
2870}
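
/*
 * Worked example for the run calculation above (a sketch, assuming a 4096-byte
 * logical block size): if MapFileBlockC reports bytesContAvail == 32768, then
 *
 *     *runp = (32768 / 4096) - 1 = 7
 *
 * i.e. seven more logical blocks can be read contiguously after 'bn' without
 * another mapping call.  When bytesContAvail is smaller than one logical block,
 * the clamp above keeps *runp at 0 instead of letting it go negative.
 */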
2871
2872/*
2873 * Convert logical block number to file offset.
2874 */
2875int
2876hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2877/*
2878	struct vnop_blktooff_args {
2879		vnode_t a_vp;
2880		daddr64_t a_lblkno;
2881		off_t *a_offset;
2882	};
2883*/
2884{
2885	if (ap->a_vp == NULL)
2886		return (EINVAL);
2887	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2888
2889	return(0);
2890}
2891
2892/*
2893 * Convert file offset to logical block number.
2894 */
2895int
2896hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2897/*
2898	struct vnop_offtoblk_args {
2899		vnode_t a_vp;
2900		off_t a_offset;
2901		daddr64_t *a_lblkno;
2902	};
2903*/
2904{
2905	if (ap->a_vp == NULL)
2906		return (EINVAL);
2907	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2908
2909	return(0);
2910}
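
/*
 * Worked example for the two conversions above (a sketch, assuming
 * GetLogicalBlockSize() returns 4096 for this vnode):
 *
 *     blktooff:  a_lblkno = 3      ->  *a_offset = 3 * 4096     = 12288
 *     offtoblk:  a_offset = 12289  ->  *a_lblkno = 12289 / 4096 = 3   (truncating)
 *
 * The two are exact inverses only for block-aligned offsets; any offset inside
 * a block maps back to the block that contains it.
 */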
2911
2912/*
2913 * Map file offset to physical block number.
2914 *
2915 * If this function is called for write operation, and if the file
2916 * had virtual blocks allocated (delayed allocation), real blocks
2917 * are allocated by calling ExtendFileC().
2918 *
2919 * If this function is called for read operation, and if the file
2920 * had virtual blocks allocated (delayed allocation), no change
2921 * to the size of file is done, and if required, rangelist is
2922 * searched for mapping.
2923 *
2924 * System file cnodes are expected to be locked (shared or exclusive).
2925 */
2926int
2927hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2928/*
2929	struct vnop_blockmap_args {
2930		vnode_t a_vp;
2931		off_t a_foffset;
2932		size_t a_size;
2933		daddr64_t *a_bpn;
2934		size_t *a_run;
2935		void *a_poff;
2936		int a_flags;
2937		vfs_context_t a_context;
2938	};
2939*/
2940{
2941	struct vnode *vp = ap->a_vp;
2942	struct cnode *cp;
2943	struct filefork *fp;
2944	struct hfsmount *hfsmp;
2945	size_t bytesContAvail = 0;
2946	int retval = E_NONE;
2947	int syslocks = 0;
2948	int lockflags = 0;
2949	struct rl_entry *invalid_range;
2950	enum rl_overlaptype overlaptype;
2951	int started_tr = 0;
2952	int tooklock = 0;
2953
2954#if HFS_COMPRESSION
2955	if (VNODE_IS_RSRC(vp)) {
2956		/* allow blockmaps to the resource fork */
2957	} else {
2958		if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
2959			int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
2960			switch(state) {
2961				case FILE_IS_COMPRESSED:
2962					return ENOTSUP;
2963				case FILE_IS_CONVERTING:
2964					/* if FILE_IS_CONVERTING, we allow blockmap */
2965					break;
2966				default:
2967					printf("invalid state %d for compressed file\n", state);
2968					/* fall through */
2969			}
2970		}
2971	}
2972#endif /* HFS_COMPRESSION */
2973
2974	/* Do not allow blockmap operation on a directory */
2975	if (vnode_isdir(vp)) {
2976		return (ENOTSUP);
2977	}
2978
2979	/*
2980	 * Check for underlying vnode requests and ensure that logical
2981	 * to physical mapping is requested.
2982	 */
2983	if (ap->a_bpn == NULL)
2984		return (0);
2985
2986	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2987		if (VTOC(vp)->c_lockowner != current_thread()) {
2988			hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
2989			tooklock = 1;
2990		}
2991	}
2992	hfsmp = VTOHFS(vp);
2993	cp = VTOC(vp);
2994	fp = VTOF(vp);
2995
2996retry:
2997	/* Check virtual blocks only when performing write operation */
2998	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2999		if (hfs_start_transaction(hfsmp) != 0) {
3000			retval = EINVAL;
3001			goto exit;
3002		} else {
3003			started_tr = 1;
3004		}
3005		syslocks = SFL_EXTENTS | SFL_BITMAP;
3006
3007	} else if (overflow_extents(fp)) {
3008		syslocks = SFL_EXTENTS;
3009	}
3010
3011	if (syslocks)
3012		lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3013
3014	/*
3015	 * Check for any delayed allocations.
3016	 */
3017	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3018		int64_t actbytes;
3019		u_int32_t loanedBlocks;
3020
3021		//
3022		// Make sure we have a transaction.  It's possible
3023		// that we came in and fp->ff_unallocblocks was zero
3024		// but during the time we blocked acquiring the extents
3025		// btree, ff_unallocblocks became non-zero and so we
3026		// will need to start a transaction.
3027		//
3028		if (started_tr == 0) {
3029			if (syslocks) {
3030				hfs_systemfile_unlock(hfsmp, lockflags);
3031				syslocks = 0;
3032			}
3033			goto retry;
3034		}
3035
3036		/*
3037		 * Note: ExtendFileC will release any blocks on loan and
3038		 * acquire real blocks.  So we ask to extend by zero bytes
3039		 * since ExtendFileC will account for the virtual blocks.
3040		 */
3041
3042		loanedBlocks = fp->ff_unallocblocks;
3043		retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3044				     kEFAllMask | kEFNoClumpMask, &actbytes);
3045
3046		if (retval) {
3047			fp->ff_unallocblocks = loanedBlocks;
3048			cp->c_blocks += loanedBlocks;
3049			fp->ff_blocks += loanedBlocks;
3050
3051			hfs_lock_mount (hfsmp);
3052			hfsmp->loanedBlocks += loanedBlocks;
3053			hfs_unlock_mount (hfsmp);
3054
3055			hfs_systemfile_unlock(hfsmp, lockflags);
3056			cp->c_flag |= C_MODIFIED;
3057			if (started_tr) {
3058				(void) hfs_update(vp, TRUE);
3059				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3060
3061				hfs_end_transaction(hfsmp);
3062				started_tr = 0;
3063			}
3064			goto exit;
3065		}
3066	}
3067
3068	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
3069	                       ap->a_bpn, &bytesContAvail);
3070	if (syslocks) {
3071		hfs_systemfile_unlock(hfsmp, lockflags);
3072		syslocks = 0;
3073	}
3074
3075	if (started_tr) {
3076		(void) hfs_update(vp, TRUE);
3077		(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3078		hfs_end_transaction(hfsmp);
3079		started_tr = 0;
3080	}
3081	if (retval) {
3082		/* On write, always return the error because virtual blocks, if any,
3083		 * should have been allocated in ExtendFileC().  We do not
3084		 * allocate virtual blocks on read, so return the error
3085		 * only if no virtual blocks are allocated.  Otherwise we search
3086		 * the rangelist for zero-fills.
3087		 */
3088		if ((MacToVFSError(retval) != ERANGE) ||
3089		    (ap->a_flags & VNODE_WRITE) ||
3090		    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3091			goto exit;
3092		}
3093
3094		/* Validate that the start offset is within the logical file size */
3095		if (ap->a_foffset >= fp->ff_size) {
3096			goto exit;
3097		}
3098
3099		/*
3100		 * At this point, we have encountered a failure during
3101		 * MapFileBlockC that resulted in ERANGE, and we are not servicing
3102		 * a write, and there are borrowed blocks.
3103		 *
3104		 * However, the cluster layer will not call blockmap for
3105		 * blocks that are borrowed and in-cache.  We have to assume that
3106		 * because we observed ERANGE being emitted from MapFileBlockC, this
3107		 * extent range is not valid on-disk.  So we treat this as a
3108		 * mapping that needs to be zero-filled prior to reading.
3109		 *
3110		 * Note that under certain circumstances (such as non-contiguous
3111		 * userland VM mappings in the calling process), cluster_io
3112		 * may be forced to split a large I/O driven by hfs_vnop_write
3113		 * into multiple sub-I/Os that necessitate a RMW cycle.  If this is
3114		 * the case here, then we have already removed the invalid range list
3115		 * mapping prior to getting to this blockmap call, so we should not
3116		 * search the invalid rangelist for this byte range.
3117		 */
3118
3119		bytesContAvail = fp->ff_size - ap->a_foffset;
3120		/*
3121		 * Clip the contiguous available bytes to, at most, the allowable
3122		 * maximum or the amount requested.
3123		 */
3124
3125		if (bytesContAvail > ap->a_size) {
3126			bytesContAvail = ap->a_size;
3127		}
3128
3129		*ap->a_bpn = (daddr64_t) -1;
3130		retval = 0;
3131
3132		goto exit;
3133	}
3134
3135	/* MapFileBlockC() found a valid extent in the filefork.  Search the
3136	 * mapping information further for invalid file ranges.
3137	 */
3138	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3139	                      ap->a_foffset + (off_t)bytesContAvail - 1,
3140	                      &invalid_range);
3141	if (overlaptype != RL_NOOVERLAP) {
3142		switch(overlaptype) {
3143		case RL_MATCHINGOVERLAP:
3144		case RL_OVERLAPCONTAINSRANGE:
3145		case RL_OVERLAPSTARTSBEFORE:
3146			/* There's no valid block for this byte offset */
3147			*ap->a_bpn = (daddr64_t)-1;
3148			/* There's no point limiting the amount to be returned
3149			 * if the invalid range that was hit extends all the way
3150			 * to the EOF (i.e. there's no valid bytes between the
3151			 * end of this range and the file's EOF):
3152			 */
3153			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3154			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3155				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3156			}
3157			break;
3158
3159		case RL_OVERLAPISCONTAINED:
3160		case RL_OVERLAPENDSAFTER:
3161			/* The range of interest hits an invalid block before the end: */
3162			if (invalid_range->rl_start == ap->a_foffset) {
3163				/* There's actually no valid information to be had starting here: */
3164				*ap->a_bpn = (daddr64_t)-1;
3165				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3166				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3167					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3168				}
3169			} else {
3170				bytesContAvail = invalid_range->rl_start - ap->a_foffset;
3171			}
3172			break;
3173
3174		case RL_NOOVERLAP:
3175			break;
3176		} /* end switch */
3177		if (bytesContAvail > ap->a_size)
3178			bytesContAvail = ap->a_size;
3179	}
3180
3181exit:
3182	if (retval == 0) {
3183		if (ap->a_run)
3184			*ap->a_run = bytesContAvail;
3185
3186		if (ap->a_poff)
3187			*(int *)ap->a_poff = 0;
3188	}
3189
3190	if (tooklock)
3191		hfs_unlock(cp);
3192
3193	return (MacToVFSError(retval));
3194}
3195
3196/*
3197 * Prepare and issue the I/O.
3198 * buf_strategy knows how to deal
3199 * with requests that require
3200 * fragmented I/Os.
3201 */
3202int
3203hfs_vnop_strategy(struct vnop_strategy_args *ap)
3204{
3205	buf_t	bp = ap->a_bp;
3206	vnode_t	vp = buf_vnode(bp);
3207	int error = 0;
3208
3209	/* Mark buffer as containing static data if cnode flag set */
3210	if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3211		buf_markstatic(bp);
3212	}
3213
3214	/* Mark buffer for greedy mode if cnode flag set */
3215	if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3216		bufattr_markgreedymode((bufattr_t)(&bp->b_attr));
3217	}
3218
3219#if CONFIG_PROTECT
3220	cnode_t *cp = NULL;
3221
3222	if ((cp = cp_get_protected_cnode(vp)) != NULL) {
3223		/*
3224		 * We rely upon the truncate lock to protect the
3225		 * CP cache key from getting tossed prior to our IO finishing here.
3226		 * Nearly all cluster io calls to manipulate file payload from HFS
3227		 * take the truncate lock before calling into the cluster
3228		 * layer to ensure the file size does not change, or that they
3229		 * have exclusive right to change the EOF of the file.
3230		 * That same guarantee protects us here since the code that
3231		 * deals with CP lock events must now take the truncate lock
3232		 * before doing anything.
3233		 *
3234		 * There is one exception here:
3235		 * the VM swapfile IO, because HFS will
3236		 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
3237		 * swapfile code only, without holding the truncate lock.  This is because
3238		 * individual swapfiles are maintained at fixed-length sizes by the VM code.
3239		 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
3240		 * create our own UPL and thus take the truncate lock before calling
3241		 * into the cluster layer.  In that case, however, we are not concerned
3242		 * with the CP blob being wiped out in the middle of the IO
3243		 * because there isn't anything to toss; the VM swapfile key stays
3244		 * in-core as long as the file is open.
3245		 *
3246		 * NB:
3247		 * For filesystem resize, we may not have access to the underlying
3248		 * file's cache key for whatever reason (device may be locked).  However,
3249		 * we do not need it since we are going to use the temporary HFS-wide resize key
3250		 * which is generated once we start relocating file content.  If this file's I/O
3251		 * should be done using the resize key, it will have been supplied already, so
3252		 * do not attach the file's cp blob to the buffer.
3253		 */
3254		if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
3255			buf_setcpaddr(bp, cp->c_cpentry);
3256		}
3257	}
3258#endif /* CONFIG_PROTECT */
3259
3260	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3261
3262	return error;
3263}
3264
3265static int
3266hfs_minorupdate(struct vnode *vp) {
3267	struct cnode *cp = VTOC(vp);
3268	cp->c_flag &= ~C_MODIFIED;
3269	cp->c_touch_acctime = 0;
3270	cp->c_touch_chgtime = 0;
3271	cp->c_touch_modtime = 0;
3272
3273	return 0;
3274}
3275
3276int
3277do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3278{
3279	register struct cnode *cp = VTOC(vp);
3280	struct filefork *fp = VTOF(vp);
3281	struct proc *p = vfs_context_proc(context);
3282	kauth_cred_t cred = vfs_context_ucred(context);
3283	int retval;
3284	off_t bytesToAdd;
3285	off_t actualBytesAdded;
3286	off_t filebytes;
3287	u_int32_t fileblocks;
3288	int blksize;
3289	struct hfsmount *hfsmp;
3290	int lockflags;
3291	int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
3292	int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3293
3294	blksize = VTOVCB(vp)->blockSize;
3295	fileblocks = fp->ff_blocks;
3296	filebytes = (off_t)fileblocks * (off_t)blksize;
3297
3298	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
3299		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3300
3301	if (length < 0)
3302		return (EINVAL);
3303
3304	/* This should only happen with a corrupt filesystem */
3305	if ((off_t)fp->ff_size < 0)
3306		return (EINVAL);
3307
3308	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3309		return (EFBIG);
3310
3311	hfsmp = VTOHFS(vp);
3312
3313	retval = E_NONE;
3314
3315	/* Files that are changing size are not hot file candidates. */
3316	if (hfsmp->hfc_stage == HFC_RECORDING) {
3317		fp->ff_bytesread = 0;
3318	}
3319
3320	/*
3321	 * We cannot just check if fp->ff_size == length (as an optimization)
3322	 * since there may be extra physical blocks that also need truncation.
3323	 */
3324#if QUOTA
3325	if ((retval = hfs_getinoquota(cp)))
3326		return(retval);
3327#endif /* QUOTA */
3328
3329	/*
3330	 * Lengthen the size of the file. We must ensure that the
3331	 * last byte of the file is allocated. Since the smallest
3332	 * value of ff_size is 0, length will be at least 1.
3333	 */
3334	if (length > (off_t)fp->ff_size) {
3335#if QUOTA
3336		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3337				   cred, 0);
3338		if (retval)
3339			goto Err_Exit;
3340#endif /* QUOTA */
3341		/*
3342		 * If we don't have enough physical space then
3343		 * we need to extend the physical size.
3344		 */
3345		if (length > filebytes) {
3346			int eflags;
3347			u_int32_t blockHint = 0;
3348
3349			/* All or nothing and don't round up to clumpsize. */
3350			eflags = kEFAllMask | kEFNoClumpMask;
3351
3352			if (cred && suser(cred, NULL) != 0)
3353				eflags |= kEFReserveMask;  /* keep a reserve */
3354
3355			/*
3356			 * Allocate Journal and Quota files in metadata zone.
3357			 */
3358			if (filebytes == 0 &&
3359			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3360			    hfs_virtualmetafile(cp)) {
3361				eflags |= kEFMetadataMask;
3362				blockHint = hfsmp->hfs_metazone_start;
3363			}
3364			if (hfs_start_transaction(hfsmp) != 0) {
3365			    retval = EINVAL;
3366			    goto Err_Exit;
3367			}
3368
3369			/* Protect extents b-tree and allocation bitmap */
3370			lockflags = SFL_BITMAP;
3371			if (overflow_extents(fp))
3372				lockflags |= SFL_EXTENTS;
3373			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3374
3375			while ((length > filebytes) && (retval == E_NONE)) {
3376				bytesToAdd = length - filebytes;
3377				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3378                                                    (FCB*)fp,
3379                                                    bytesToAdd,
3380                                                    blockHint,
3381                                                    eflags,
3382                                                    &actualBytesAdded));
3383
3384				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3385				if (actualBytesAdded == 0 && retval == E_NONE) {
3386					if (length > filebytes)
3387						length = filebytes;
3388					break;
3389				}
3390			} /* endwhile */
3391
3392			hfs_systemfile_unlock(hfsmp, lockflags);
3393
3394			if (hfsmp->jnl) {
3395				if (skipupdate) {
3396					(void) hfs_minorupdate(vp);
3397				}
3398				else {
3399					(void) hfs_update(vp, TRUE);
3400					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3401				}
3402			}
3403
3404			hfs_end_transaction(hfsmp);
3405
3406			if (retval)
3407				goto Err_Exit;
3408
3409			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
3410				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3411		}
3412
3413		if (!(flags & IO_NOZEROFILL)) {
3414			if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3415				struct rl_entry *invalid_range;
3416				off_t zero_limit;
3417
3418				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
3419				if (length < zero_limit) zero_limit = length;
3420
3421				if (length > (off_t)fp->ff_size) {
3422					struct timeval tv;
3423
3424		   			/* Extending the file: time to fill out the current last page with zeroes? */
3425		   			if ((fp->ff_size & PAGE_MASK_64) &&
3426					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
3427					    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
3428
3429						/* There's some valid data at the start of the (current) last page
3430						   of the file, so zero out the remainder of that page to ensure the
3431						   entire page contains valid data.  Since there is no invalid range
3432						   possible past the (current) eof, there's no need to remove anything
3433						   from the invalid range list before calling cluster_write():	*/
3434						hfs_unlock(cp);
3435						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
3436								fp->ff_size, (off_t)0,
3437								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
3438						hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3439						if (retval) goto Err_Exit;
3440
3441						/* Merely invalidate the remaining area, if necessary: */
3442						if (length > zero_limit) {
3443							microuptime(&tv);
3444							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
3445							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3446						}
3447		   			} else {
3448					/* The page containing the (current) eof is invalid: just add the
3449					   remainder of the page to the invalid list, along with the area
3450					   being newly allocated:
3451					 */
3452					microuptime(&tv);
3453					rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3454					cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3455					}
3456				}
3457			} else {
3458					panic("hfs_truncate: invoked on non-UBC object?!");
3459			}
3460		}
3461		if (suppress_times == 0) {
3462			cp->c_touch_modtime = TRUE;
3463		}
3464		fp->ff_size = length;
3465
3466	} else { /* Shorten the size of the file */
3467
3468		if ((off_t)fp->ff_size > length) {
3469			/* Any space previously marked as invalid is now irrelevant: */
3470			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3471		}
3472
3473		/*
3474		 * Account for any unmapped blocks. Note that the new
3475		 * file length can still end up with unmapped blocks.
3476		 */
3477		if (fp->ff_unallocblocks > 0) {
3478			u_int32_t finalblks;
3479			u_int32_t loanedBlocks;
3480
3481			hfs_lock_mount(hfsmp);
3482			loanedBlocks = fp->ff_unallocblocks;
3483			cp->c_blocks -= loanedBlocks;
3484			fp->ff_blocks -= loanedBlocks;
3485			fp->ff_unallocblocks = 0;
3486
3487			hfsmp->loanedBlocks -= loanedBlocks;
3488
3489			finalblks = (length + blksize - 1) / blksize;
3490			if (finalblks > fp->ff_blocks) {
3491				/* calculate required unmapped blocks */
3492				loanedBlocks = finalblks - fp->ff_blocks;
3493				hfsmp->loanedBlocks += loanedBlocks;
3494
3495				fp->ff_unallocblocks = loanedBlocks;
3496				cp->c_blocks += loanedBlocks;
3497				fp->ff_blocks += loanedBlocks;
3498			}
3499			hfs_unlock_mount (hfsmp);
3500		}
3501
3502		/*
3503		 * For a TBE process the deallocation of the file blocks is
3504		 * delayed until the file is closed.  And hfs_close calls
3505		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
3506		 * isn't set, we make sure this isn't a TBE process.
3507		 */
3508		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
3509#if QUOTA
3510		  off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3511#endif /* QUOTA */
3512		  if (hfs_start_transaction(hfsmp) != 0) {
3513		      retval = EINVAL;
3514		      goto Err_Exit;
3515		  }
3516
3517			if (fp->ff_unallocblocks == 0) {
3518				/* Protect extents b-tree and allocation bitmap */
3519				lockflags = SFL_BITMAP;
3520				if (overflow_extents(fp))
3521					lockflags |= SFL_EXTENTS;
3522				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3523
3524				retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3525													 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3526
3527				hfs_systemfile_unlock(hfsmp, lockflags);
3528			}
3529			if (hfsmp->jnl) {
3530				if (retval == 0) {
3531					fp->ff_size = length;
3532				}
3533				if (skipupdate) {
3534					(void) hfs_minorupdate(vp);
3535				}
3536				else {
3537					(void) hfs_update(vp, TRUE);
3538					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3539				}
3540			}
3541			hfs_end_transaction(hfsmp);
3542
3543			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3544			if (retval)
3545				goto Err_Exit;
3546#if QUOTA
3547			/* These are bytesreleased */
3548			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3549#endif /* QUOTA */
3550		}
3551		/*
3552		 * Only set update flag if the logical length changes & we aren't
3553		 * suppressing modtime updates.
3554		 */
3555		if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3556			cp->c_touch_modtime = TRUE;
3557		}
3558		fp->ff_size = length;
3559	}
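	/* If the caller is not the superuser, clear the setuid and setgid bits as a precaution against tampering */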
3560	if (cp->c_mode & (S_ISUID | S_ISGID)) {
3561		if (!vfs_context_issuser(context)) {
3562			cp->c_mode &= ~(S_ISUID | S_ISGID);
3563			skipupdate = 0;
3564		}
3565	}
3566	if (skipupdate) {
3567		retval = hfs_minorupdate(vp);
3568	}
3569	else {
3570		cp->c_touch_chgtime = TRUE;	/* status changed */
3571		if (suppress_times == 0) {
3572			cp->c_touch_modtime = TRUE;	/* file data was modified */
3573
3574			/*
3575			 * If we are not suppressing the modtime update, then
3576			 * update the gen count as well.
3577			 */
3578			if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3579				hfs_incr_gencount(cp);
3580			}
3581		}
3582
3583		retval = hfs_update(vp, MNT_WAIT);
3584	}
3585	if (retval) {
3586		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
3587		     -1, -1, -1, retval, 0);
3588	}
3589
3590Err_Exit:
3591
3592	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
3593		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3594
3595	return (retval);
3596}
3597
3598/*
3599 * Preparation which must be done prior to deleting the catalog record
3600 * of a file or directory.  In order to keep the on-disk state as safe as possible,
3601 * we remove the catalog entry before releasing the bitmap blocks and the
3602 * overflow extent records.  However, some work must be done prior to deleting
3603 * the catalog record.
3604 *
3605 * When calling this function, the cnode must exist both in memory and on-disk.
3606 * If there are both resource fork and data fork vnodes, this function should
3607 * be called on both.
3608 */
3609
3610int
3611hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3612
3613	struct filefork *fp = VTOF(vp);
3614	struct cnode *cp = VTOC(vp);
3615#if QUOTA
3616	int retval = 0;
3617#endif /* QUOTA */
3618
3619	/* Cannot truncate an HFS directory! */
3620	if (vnode_isdir(vp)) {
3621		return (EISDIR);
3622	}
3623
3624	/*
3625	 * See the comment below in hfs_truncate for why we need to call
3626	 * setsize here.  Essentially we want to avoid pending IO if we
3627	 * already know that the blocks are going to be released here.
3628	 * This function is only called when removing all storage for a file, so
3629	 * we can take a shortcut and call ubc_setsize(0) immediately.
3630	 */
3631	ubc_setsize(vp, 0);
3632
3633	/* This should only happen with a corrupt filesystem */
3634	if ((off_t)fp->ff_size < 0)
3635		return (EINVAL);
3636
3637	/*
3638	 * We cannot just check if fp->ff_size == length (as an optimization)
3639	 * since there may be extra physical blocks that also need truncation.
3640	 */
3641#if QUOTA
3642	if ((retval = hfs_getinoquota(cp))) {
3643		return(retval);
3644	}
3645#endif /* QUOTA */
3646
3647	/* Wipe out any invalid ranges which have yet to be backed by disk */
3648	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3649
3650	/*
3651	 * Account for any unmapped blocks. Since we're deleting the
3652	 * entire file, we don't have to worry about just shrinking
3653	 * to a smaller number of borrowed blocks.
3654	 */
3655	if (fp->ff_unallocblocks > 0) {
3656		u_int32_t loanedBlocks;
3657
3658		hfs_lock_mount (hfsmp);
3659		loanedBlocks = fp->ff_unallocblocks;
3660		cp->c_blocks -= loanedBlocks;
3661		fp->ff_blocks -= loanedBlocks;
3662		fp->ff_unallocblocks = 0;
3663
3664		hfsmp->loanedBlocks -= loanedBlocks;
3665
3666		hfs_unlock_mount (hfsmp);
3667	}
3668
3669	return 0;
3670}
3671
3672
3673/*
3674 * Special wrapper around calling TruncateFileC.  This function is usable
3675 * even when the catalog record does not exist any longer, making it ideal
3676 * for use when deleting a file.  The simplification here is that we know
3677 * that we are releasing all blocks.
3678 *
3679 * Note that this function may be called when there is no vnode backing
3680 * the file fork in question.  We may call this from hfs_vnop_inactive
3681 * to clear out resource fork data (and may not want to clear out the data
3682 * fork yet).  As a result, we pointer-check both sets of inputs before
3683 * doing anything with them.
3684 *
3685 * The caller is responsible for saving off a copy of the filefork(s)
3686 * embedded within the cnode prior to calling this function.  The pointers
3687 * supplied as arguments must be valid even if the cnode is no longer valid.
3688 */
3689
3690int
3691hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3692					 struct filefork *rsrcfork, u_int32_t fileid) {
3693
3694	off_t filebytes;
3695	u_int32_t fileblocks;
3696	int blksize = 0;
3697	int error = 0;
3698	int lockflags;
3699
3700	blksize = hfsmp->blockSize;
3701
3702	/* Data Fork */
3703	if ((datafork != NULL) && (datafork->ff_blocks > 0)) {
3704		fileblocks = datafork->ff_blocks;
3705		filebytes = (off_t)fileblocks * (off_t)blksize;
3706
3707		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3708
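		/* Release the fork's blocks in chunks of at most HFS_BIGFILE_SIZE so each journal transaction stays bounded */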
3709		while (filebytes > 0) {
3710			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
3711				filebytes -= HFS_BIGFILE_SIZE;
3712			} else {
3713				filebytes = 0;
3714			}
3715
3716			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
3717			if (hfs_start_transaction(hfsmp) != 0) {
3718				error = EINVAL;
3719				break;
3720			}
3721
3722			if (datafork->ff_unallocblocks == 0) {
3723				/* Protect extents b-tree and allocation bitmap */
3724				lockflags = SFL_BITMAP;
3725				if (overflow_extents(datafork))
3726					lockflags |= SFL_EXTENTS;
3727				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3728
3729				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3730
3731				hfs_systemfile_unlock(hfsmp, lockflags);
3732			}
3733			if (error == 0) {
3734				datafork->ff_size = filebytes;
3735			}
3736			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3737
3738			/* Finish the transaction and start over if necessary */
3739			hfs_end_transaction(hfsmp);
3740
3741			if (error) {
3742				break;
3743			}
3744		}
3745	}
3746
3747	/* Resource fork */
3748	if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
3749		fileblocks = rsrcfork->ff_blocks;
3750		filebytes = (off_t)fileblocks * (off_t)blksize;
3751
3752		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3753
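		/* As with the data fork, release at most HFS_BIGFILE_SIZE per transaction */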
3754		while (filebytes > 0) {
3755			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
3756				filebytes -= HFS_BIGFILE_SIZE;
3757			} else {
3758				filebytes = 0;
3759			}
3760
3761			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
3762			if (hfs_start_transaction(hfsmp) != 0) {
3763				error = EINVAL;
3764				break;
3765			}
3766
3767			if (rsrcfork->ff_unallocblocks == 0) {
3768				/* Protect extents b-tree and allocation bitmap */
3769				lockflags = SFL_BITMAP;
3770				if (overflow_extents(rsrcfork))
3771					lockflags |= SFL_EXTENTS;
3772				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3773
3774				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3775
3776				hfs_systemfile_unlock(hfsmp, lockflags);
3777			}
3778			if (error == 0) {
3779				rsrcfork->ff_size = filebytes;
3780			}
3781			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3782
3783			/* Finish the transaction and start over if necessary */
3784			hfs_end_transaction(hfsmp);
3785
3786			if (error) {
3787				break;
3788			}
3789		}
3790	}
3791
3792	return error;
3793}
3794
3795
3796/*
3797 * Truncate a cnode to at most length size, freeing (or adding) the
3798 * disk blocks.
3799 */
3800int
3801hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
3802             int truncateflags, vfs_context_t context)
3803{
3804	struct filefork *fp = VTOF(vp);
3805	off_t filebytes;
3806	u_int32_t fileblocks;
3807	int blksize, error = 0;
3808	struct cnode *cp = VTOC(vp);
3809
3810	/* Cannot truncate an HFS directory! */
3811	if (vnode_isdir(vp)) {
3812		return (EISDIR);
3813	}
3814	/* A swap file cannot change size. */
3815	if (vnode_isswap(vp) && (length != 0)) {
3816		return (EPERM);
3817	}
3818
3819	blksize = VTOVCB(vp)->blockSize;
3820	fileblocks = fp->ff_blocks;
3821	filebytes = (off_t)fileblocks * (off_t)blksize;
3822
3823	//
3824	// Have to do this here so that we don't wind up with
3825	// i/o pending for blocks that are about to be released
3826	// if we truncate the file.
3827	//
3828	// If skipsetsize is set, then the caller is responsible
3829	// for the ubc_setsize.
3830	//
3831	// Even if skipsetsize is set, if the length is zero we
3832	// want to call ubc_setsize() because as of SnowLeopard
3833	// it will no longer cause any page-ins and it will drop
3834	// any dirty pages so that we don't do any i/o that we
3835	// don't have to.  This also prevents a race where i/o
3836	// for truncated blocks may overwrite later data if the
3837	// blocks get reallocated to a different file.
3838	//
3839	if (!skipsetsize || length == 0)
3840		ubc_setsize(vp, length);
3841
3842	// have to loop truncating or growing files that are
3843	// really big because otherwise transactions can get
3844	// enormous and consume too many kernel resources.
3845
3846	if (length < filebytes) {
3847		while (filebytes > length) {
3848			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
3849		    		filebytes -= HFS_BIGFILE_SIZE;
3850			} else {
3851		    		filebytes = length;
3852			}
3853			cp->c_flag |= C_FORCEUPDATE;
3854			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3855			if (error)
3856				break;
3857		}
3858	} else if (length > filebytes) {
3859		while (filebytes < length) {
3860			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
3861				filebytes += HFS_BIGFILE_SIZE;
3862			} else {
3863				filebytes = length;
3864			}
3865			cp->c_flag |= C_FORCEUPDATE;
3866			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3867			if (error)
3868				break;
3869		}
3870	} else /* Same logical size */ {
3871
3872		error = do_hfs_truncate(vp, length, flags, truncateflags, context);
3873	}
3874	/* Files that are changing size are not hot file candidates. */
3875	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
3876		fp->ff_bytesread = 0;
3877	}
3878
3879	return (error);
3880}
3881
3882
3883
3884/*
3885 * Preallocate file storage space.
3886 */
3887int
3888hfs_vnop_allocate(struct vnop_allocate_args /* {
3889		vnode_t a_vp;
3890		off_t a_length;
3891		u_int32_t  a_flags;
3892		off_t *a_bytesallocated;
3893		off_t a_offset;
3894		vfs_context_t a_context;
3895	} */ *ap)
3896{
3897	struct vnode *vp = ap->a_vp;
3898	struct cnode *cp;
3899	struct filefork *fp;
3900	ExtendedVCB *vcb;
3901	off_t length = ap->a_length;
3902	off_t startingPEOF;
3903	off_t moreBytesRequested;
3904	off_t actualBytesAdded;
3905	off_t filebytes;
3906	u_int32_t fileblocks;
3907	int retval, retval2;
3908	u_int32_t blockHint;
3909	u_int32_t extendFlags;   /* For call to ExtendFileC */
3910	struct hfsmount *hfsmp;
3911	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
3912	int lockflags;
3913	time_t orig_ctime;
3914
3915	*(ap->a_bytesallocated) = 0;
3916
3917	if (!vnode_isreg(vp))
3918		return (EISDIR);
3919	if (length < (off_t)0)
3920		return (EINVAL);
3921
3922	cp = VTOC(vp);
3923
3924	orig_ctime = VTOC(vp)->c_ctime;
3925
3926	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
3927
3928	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3929
3930	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
3931		goto Err_Exit;
3932	}
3933
3934	fp = VTOF(vp);
3935	hfsmp = VTOHFS(vp);
3936	vcb = VTOVCB(vp);
3937
3938	fileblocks = fp->ff_blocks;
3939	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
3940
3941	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
3942		retval = EINVAL;
3943		goto Err_Exit;
3944	}
3945
3946	/* Fill in the flags word for the call to Extend the file */
3947
3948	extendFlags = kEFNoClumpMask;
3949	if (ap->a_flags & ALLOCATECONTIG)
3950		extendFlags |= kEFContigMask;
3951	if (ap->a_flags & ALLOCATEALL)
3952		extendFlags |= kEFAllMask;
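	/* Non-superusers are not allowed to dip into the volume's free-space reserve */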
3953	if (cred && suser(cred, NULL) != 0)
3954		extendFlags |= kEFReserveMask;
3955	if (hfs_virtualmetafile(cp))
3956		extendFlags |= kEFMetadataMask;
3957
3958	retval = E_NONE;
3959	blockHint = 0;
3960	startingPEOF = filebytes;
3961
3962	if (ap->a_flags & ALLOCATEFROMPEOF)
3963		length += filebytes;
3964	else if (ap->a_flags & ALLOCATEFROMVOL)
3965		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
3966
3967	/* If no changes are necessary, then we're done */
3968	if (filebytes == length)
3969		goto Std_Exit;
3970
3971	/*
3972	 * Lengthen the size of the file. We must ensure that the
3973	 * last byte of the file is allocated. Since the smallest
3974	 * value of filebytes is 0, length will be at least 1.
3975	 */
3976	if (length > filebytes) {
3977		off_t total_bytes_added = 0, orig_request_size;
3978
3979		orig_request_size = moreBytesRequested = length - filebytes;
3980
3981#if QUOTA
3982		retval = hfs_chkdq(cp,
3983				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
3984				cred, 0);
3985		if (retval)
3986			goto Err_Exit;
3987
3988#endif /* QUOTA */
3989		/*
3990		 * Metadata zone checks.
3991		 */
3992		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
3993			/*
3994			 * Allocate Journal and Quota files in metadata zone.
3995			 */
3996			if (hfs_virtualmetafile(cp)) {
3997				blockHint = hfsmp->hfs_metazone_start;
3998			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
3999				   (blockHint <= hfsmp->hfs_metazone_end)) {
4000				/*
4001				 * Move blockHint outside metadata zone.
4002				 */
4003				blockHint = hfsmp->hfs_metazone_end + 1;
4004			}
4005		}
4006
4007
4008		while ((length > filebytes) && (retval == E_NONE)) {
4009		    off_t bytesRequested;
4010
4011		    if (hfs_start_transaction(hfsmp) != 0) {
4012			retval = EINVAL;
4013			goto Err_Exit;
4014		    }
4015
4016		    /* Protect extents b-tree and allocation bitmap */
4017		    lockflags = SFL_BITMAP;
4018		    if (overflow_extents(fp))
4019			lockflags |= SFL_EXTENTS;
4020		    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4021
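		    /* Cap each ExtendFileC request at HFS_BIGFILE_SIZE so a single transaction stays bounded */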
4022		    if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4023			bytesRequested = HFS_BIGFILE_SIZE;
4024		    } else {
4025			bytesRequested = moreBytesRequested;
4026		    }
4027
4028		    if (extendFlags & kEFContigMask) {
4029			    // if we're on a sparse device, this will force it to do a
4030			    // full scan to find the space needed.
4031			    hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4032		    }
4033
4034		    retval = MacToVFSError(ExtendFileC(vcb,
4035						(FCB*)fp,
4036						bytesRequested,
4037						blockHint,
4038						extendFlags,
4039						&actualBytesAdded));
4040
4041		    if (retval == E_NONE) {
4042			*(ap->a_bytesallocated) += actualBytesAdded;
4043			total_bytes_added += actualBytesAdded;
4044			moreBytesRequested -= actualBytesAdded;
4045			if (blockHint != 0) {
4046			    blockHint += actualBytesAdded / vcb->blockSize;
4047			}
4048		    }
4049		    filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4050
4051		    hfs_systemfile_unlock(hfsmp, lockflags);
4052
4053		    if (hfsmp->jnl) {
4054			(void) hfs_update(vp, TRUE);
4055			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4056		    }
4057
4058		    hfs_end_transaction(hfsmp);
4059		}
4060
4061
4062		/*
4063		 * If we get an error and no changes were made, then exit;
4064		 * otherwise we must do the hfs_update to reflect the changes.
4065		 */
4066		if (retval && (startingPEOF == filebytes))
4067			goto Err_Exit;
4068
4069		/*
4070		 * Adjust actualBytesAdded to be allocation block aligned, not
4071		 * clump size aligned.
4072		 * NOTE: What we report here does not take effect on disk
4073		 * until the file is closed, when the file is truncated back to
4074		 * allocation block size.
4075		 */
4076		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4077			*(ap->a_bytesallocated) =
4078				roundup(orig_request_size, (off_t)vcb->blockSize);
4079
4080	} else { /* Shorten the size of the file */
4081
4082		if (fp->ff_size > length) {
4083			/*
4084			 * Any buffers that are past the truncation point need to be
4085			 * invalidated (to maintain buffer cache consistency).
4086			 */
4087		}
4088
4089		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
4090		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4091
4092		/*
4093		 * If we get an error and no changes were made, then exit;
4094		 * otherwise we must do the hfs_update to reflect the changes.
4095		 */
4096		if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4097#if QUOTA
4098		/* These are bytes released */
4099		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4100#endif /* QUOTA */
4101
4102		if (fp->ff_size > filebytes) {
4103			fp->ff_size = filebytes;
4104
4105			hfs_unlock(cp);
4106			ubc_setsize(vp, fp->ff_size);
4107			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4108		}
4109	}
4110
4111Std_Exit:
4112	cp->c_touch_chgtime = TRUE;
4113	cp->c_touch_modtime = TRUE;
4114	retval2 = hfs_update(vp, MNT_WAIT);
4115
4116	if (retval == 0)
4117		retval = retval2;
4118Err_Exit:
4119	hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4120	hfs_unlock(cp);
4121	return (retval);
4122}
4123
4124
4125/*
4126 * Pagein for HFS filesystem
4127 */
4128int
4129hfs_vnop_pagein(struct vnop_pagein_args *ap)
4130/*
4131	struct vnop_pagein_args {
4132	   	vnode_t a_vp,
4133	   	upl_t 	      a_pl,
4134		vm_offset_t   a_pl_offset,
4135		off_t         a_f_offset,
4136		size_t        a_size,
4137		int           a_flags
4138		vfs_context_t a_context;
4139	};
4140*/
4141{
4142	vnode_t 	vp;
4143	struct cnode	*cp;
4144	struct filefork *fp;
4145	int		error = 0;
4146	upl_t 		upl;
4147	upl_page_info_t	*pl;
4148	off_t		f_offset;
4149	int		offset;
4150	int		isize;
4151	int		pg_index;
4152	boolean_t	truncate_lock_held = FALSE;
4153	boolean_t 	file_converted = FALSE;
4154	kern_return_t	kret;
4155
4156	vp = ap->a_vp;
4157	cp = VTOC(vp);
4158	fp = VTOF(vp);
4159
4160#if CONFIG_PROTECT
4161	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4162		/*
4163		 * If we errored here, then this means that one of two things occurred:
4164		 * 1. there was a problem with the decryption of the key.
4165		 * 2. the device is locked and we are not allowed to access this particular file.
4166		 *
4167		 * Either way, this means that we need to shut down this upl now.  As long as
4168		 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
4169		 * then we create a upl and immediately abort it.
4170		 */
4171		if (ap->a_pl == NULL) {
4172			/* create the upl */
4173			ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4174					UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4175			/* mark the range as needed so it doesn't immediately get discarded upon abort */
4176			ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4177
4178			/* Abort the range */
4179			ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4180		}
4181
4182
4183		return error;
4184	}
4185#endif /* CONFIG_PROTECT */
4186
4187	if (ap->a_pl != NULL) {
4188		/*
4189		 * this can only happen for swap files now that
4190		 * we're asking for V2 paging behavior...
4191		 * so don't need to worry about decompression, or
4192		 * keeping track of blocks read or taking the truncate lock
4193		 */
4194		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4195				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4196		goto pagein_done;
4197	}
4198
4199retry_pagein:
4200	/*
4201	 * take truncate lock (shared/recursive) to guard against
4202	 * zero-fill thru fsync interfering, but only for v2
4203	 *
4204	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4205	 * lock shared and we are allowed to recurse 1 level if this thread already
4206	 * owns the lock exclusively... this can legally occur
4207	 * if we are doing a shrinking ftruncate against a file
4208	 * that is mapped private, and the pages being truncated
4209	 * do not currently exist in the cache... in that case
4210	 * we will have to page-in the missing pages in order
4211	 * to provide them to the private mapping... we must
4212	 * also call hfs_unlock_truncate with a positive been_recursed
4213	 * arg to indicate that if we have recursed, there is no need to drop
4214	 * the lock.  Allowing this simple recursion is necessary
4215	 * in order to avoid a certain deadlock... since the ftruncate
4216	 * already holds the truncate lock exclusively, if we try
4217	 * to acquire it shared to protect the pagein path, we will
4218	 * hang this thread
4219	 *
4220	 * NOTE: The if () block below is a workaround in order to prevent a
4221	 * VM deadlock. See rdar://7853471.
4222	 *
4223	 * If we are in a forced unmount, then launchd will still have the
4224	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
4225	 * take the truncate lock here to service a page fault, then our
4226	 * thread could deadlock with the forced-unmount.  The forced unmount
4227	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4228	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
4229	 * thread will think it needs to copy all of the data out of the file
4230	 * and into a VM copy object.  If we hold the cnode lock here, then that
4231	 * VM operation will not be able to proceed, because we'll set a busy page
4232	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
4233	 * call ubc_setsize" because doing that would just shift the problem to the
4234	 * ubc_msync done before the vnode is reclaimed.
4235	 *
4236	 * So, if a forced unmount on this volume is in flight AND the cnode is
4237	 * marked C_DELETED, then just go ahead and do the page in without taking
4238	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
4239	 * that is not going to be available on the next mount, this seems like an
4240	 * OK solution from a correctness point of view, even though it is hacky.
4241	 */
4242	if (vfs_isforce(vp->v_mount)) {
4243		if (cp->c_flag & C_DELETED) {
4244			/* If we don't get it, then just go ahead and operate without the lock */
4245			truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4246		}
4247	}
4248	else {
4249		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4250		truncate_lock_held = TRUE;
4251	}
4252
4253	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4254
4255	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4256		error = EINVAL;
4257		goto pagein_done;
4258	}
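	/* Mark the range as needed so the pages don't get discarded before we can fill them */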
4259	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4260
4261	isize = ap->a_size;
4262
4263	/*
4264	 * Scan from the back to find the last page in the UPL, so that we
4265	 * aren't looking at a UPL that may have already been freed by the
4266	 * preceding aborts/completions.
4267	 */
4268	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4269		if (upl_page_present(pl, --pg_index))
4270			break;
4271		if (pg_index == 0) {
4272			/*
4273			 * no absent pages were found in the range specified
4274			 * just abort the UPL to get rid of it and then we're done
4275			 */
4276			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4277			goto pagein_done;
4278		}
4279	}
4280	/*
4281	 * initialize the offset variables before we touch the UPL.
4282	 * f_offset is the position into the file, in bytes
4283	 * offset is the position into the UPL, in bytes
4284	 * pg_index is the pg# of the UPL we're operating on
4285	 * isize is the offset into the UPL of the last page that is present.
4286	 */
4287	isize = ((pg_index + 1) * PAGE_SIZE);
4288	pg_index = 0;
4289	offset = 0;
4290	f_offset = ap->a_f_offset;
4291
4292	while (isize) {
4293		int  xsize;
4294		int  num_of_pages;
4295
4296		if ( !upl_page_present(pl, pg_index)) {
4297			/*
4298			 * we asked for RET_ONLY_ABSENT, so it's possible
4299			 * to get back empty slots in the UPL.
4300			 * just skip over them
4301			 */
4302			f_offset += PAGE_SIZE;
4303			offset   += PAGE_SIZE;
4304			isize    -= PAGE_SIZE;
4305			pg_index++;
4306
4307			continue;
4308		}
4309		/*
4310		 * We know that we have at least one absent page.
4311		 * Now checking to see how many in a row we have
4312		 */
4313		num_of_pages = 1;
4314		xsize = isize - PAGE_SIZE;
4315
4316		while (xsize) {
4317			if ( !upl_page_present(pl, pg_index + num_of_pages))
4318				break;
4319			num_of_pages++;
4320			xsize -= PAGE_SIZE;
4321		}
4322		xsize = num_of_pages * PAGE_SIZE;
4323
4324#if HFS_COMPRESSION
4325		if (VNODE_IS_RSRC(vp)) {
4326			/* allow pageins of the resource fork */
4327		} else {
4328			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4329
4330			if (compressed) {
4331				if (truncate_lock_held) {
4332					/*
4333					 * can't hold the truncate lock when calling into the decmpfs layer
4334					 * since it calls back into this layer... even though we're only
4335					 * holding the lock in shared mode, and the re-entrant path only
4336					 * takes the lock shared, we can deadlock if some other thread
4337					 * tries to grab the lock exclusively in between.
4338					 */
4339					hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4340					truncate_lock_held = FALSE;
4341				}
4342				ap->a_pl = upl;
4343				ap->a_pl_offset = offset;
4344				ap->a_f_offset = f_offset;
4345				ap->a_size = xsize;
4346
4347				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4348				/*
4349				 * note that decmpfs_pagein_compressed can change the state of
4350				 * 'compressed'... it will set it to 0 if the file is no longer
4351				 * compressed once the compression lock is successfully taken
4352				 * i.e. we would block on that lock while the file is being inflated
4353				 */
4354				if (compressed) {
4355					if (error == 0) {
4356						/* successful page-in, update the access time */
4357						VTOC(vp)->c_touch_acctime = TRUE;
4358
4359						/* compressed files are not hot file candidates */
4360						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4361							fp->ff_bytesread = 0;
4362						}
4363					} else if (error == EAGAIN) {
4364						/*
4365						 * EAGAIN indicates someone else already holds the compression lock...
4366						 * to avoid deadlocking, we'll abort this range of pages with an
4367						 * indication that the pagein needs to be redriven
4368						 */
4369			        		ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4370					}
4371					goto pagein_next_range;
4372				}
4373				else {
4374					/*
4375					 * Set file_converted only if the file became decompressed while we were
4376					 * paging in.  If it were still compressed, we would re-start the loop using the goto
4377					 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
4378					 * condition below, since we could have avoided taking the truncate lock to prevent
4379					 * a deadlock in the force unmount case.
4380					 */
4381					file_converted = TRUE;
4382				}
4383			}
4384			if (file_converted == TRUE) {
4385				/*
4386				 * the file was converted back to a regular file after we first saw it as compressed
4387				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4388				 * reset a_size so that we consider what remains of the original request
4389				 * and null out a_pl and a_pl_offset.
4390				 *
4391				 * We should only be able to get into this block if the decmpfs_pagein_compressed
4392				 * successfully decompressed the range in question for this file.
4393				 */
4394				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4395
4396				ap->a_size = isize;
4397				ap->a_pl = NULL;
4398				ap->a_pl_offset = 0;
4399
4400				/* Reset file_converted back to false so that we don't infinite-loop. */
4401				file_converted = FALSE;
4402				goto retry_pagein;
4403			}
4404		}
4405#endif
4406		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4407
4408		/*
4409		 * Keep track of blocks read.
4410		 */
4411		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4412			int bytesread;
4413			int took_cnode_lock = 0;
4414
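			/* If the file fits in a single page and we read from offset 0, count the whole file as read */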
4415			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4416				bytesread = fp->ff_size;
4417			else
4418				bytesread = xsize;
4419
4420			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4421			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4422				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4423				took_cnode_lock = 1;
4424			}
4425			/*
4426			 * If this file hasn't been seen since the start of
4427			 * the current sampling period then start over.
4428			 */
4429			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4430				struct timeval tv;
4431
4432				fp->ff_bytesread = bytesread;
4433				microtime(&tv);
4434				cp->c_atime = tv.tv_sec;
4435			} else {
4436				fp->ff_bytesread += bytesread;
4437			}
4438			cp->c_touch_acctime = TRUE;
4439			if (took_cnode_lock)
4440				hfs_unlock(cp);
4441		}
4442pagein_next_range:
4443		f_offset += xsize;
4444		offset   += xsize;
4445		isize    -= xsize;
4446		pg_index += num_of_pages;
4447
4448		error = 0;
4449	}
4450
4451pagein_done:
4452	if (truncate_lock_held == TRUE) {
4453		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4454		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4455	}
4456
4457	return (error);
4458}
4459
4460/*
4461 * Pageout for HFS filesystem.
4462 */
4463int
4464hfs_vnop_pageout(struct vnop_pageout_args *ap)
4465/*
4466	struct vnop_pageout_args {
4467	   vnode_t a_vp,
4468	   upl_t         a_pl,
4469	   vm_offset_t   a_pl_offset,
4470	   off_t         a_f_offset,
4471	   size_t        a_size,
4472	   int           a_flags
4473	   vfs_context_t a_context;
4474	};
4475*/
4476{
4477	vnode_t vp = ap->a_vp;
4478	struct cnode *cp;
4479	struct filefork *fp;
4480	int retval = 0;
4481	off_t filesize;
4482	upl_t 		upl;
4483	upl_page_info_t* pl;
4484	vm_offset_t	a_pl_offset;
4485	int		a_flags;
4486	int is_pageoutv2 = 0;
4487	kern_return_t kret;
4488
4489	cp = VTOC(vp);
4490	fp = VTOF(vp);
4491
4492	/*
4493	 * Figure out where the file ends, for pageout purposes.  If
4494	 * ff_new_size > ff_size, then we're in the middle of extending the
4495	 * file via a write, so it is safe (and necessary) that we be able
4496	 * to pageout up to that point.
4497	 */
4498	filesize = fp->ff_size;
4499	if (fp->ff_new_size > filesize)
4500		filesize = fp->ff_new_size;
4501
4502	a_flags = ap->a_flags;
4503	a_pl_offset = ap->a_pl_offset;
4504
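	/* Pageout means file contents are changing, so bump the generation count for files and symlinks */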
4505	if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
4506		hfs_incr_gencount (cp);
4507	}
4508
4509	/*
4510	 * we can tell if we're getting the new or old behavior from the UPL
4511	 */
4512	if ((upl = ap->a_pl) == NULL) {
4513		int request_flags;
4514
4515		is_pageoutv2 = 1;
4516		/*
4517		 * we're in control of any UPL we commit
4518		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4519		 */
4520		a_flags &= ~UPL_NOCOMMIT;
4521		a_pl_offset = 0;
4522
4523		/*
4524		 * For V2 semantics, we want to take the cnode truncate lock
4525		 * shared to guard against the file size changing via zero-filling.
4526		 *
4527		 * However, we have to be careful because we may be invoked
4528		 * via the ubc_msync path to write out dirty mmap'd pages
4529		 * in response to a lock event on a content-protected
4530		 * filesystem (e.g. to write out class A files).
4531		 * As a result, we want to take the truncate lock 'SHARED' with
4532		 * the mini-recursion locktype so that we don't deadlock/panic
4533		 * because we may be already holding the truncate lock exclusive to force any other
4534		 * IOs to have blocked behind us.
4535		 */
4536		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4537
4538		if (a_flags & UPL_MSYNC) {
4539			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4540		}
4541		else {
4542			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4543		}
4544
4545		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4546
4547		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4548			retval = EINVAL;
4549			goto pageout_done;
4550		}
4551	}
4552	/*
4553	 * from this point forward upl points at the UPL we're working with
4554	 * it was either passed in or we successfully created it
4555	 */
4556
4557	/*
4558	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4559	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
4560	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
4561	 * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
4562	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4563	 * lock in HFS so that we don't lock invert ourselves.
4564	 *
4565	 * Note that we can still get into this function on behalf of the default pager with
4566	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
4567	 * since fsync and other writing threads will grab the locks, then mark the
4568	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
4569	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
4570	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4571	 * by the paging/VM system.
4572	 */
4573
4574	if (is_pageoutv2) {
4575		off_t f_offset;
4576		int offset;
4577		int isize;
4578		int pg_index;
4579		int error;
4580		int error_ret = 0;
4581
4582		isize = ap->a_size;
4583		f_offset = ap->a_f_offset;
4584
4585		/*
4586		 * Scan from the back to find the last page in the UPL, so that we
4587		 * aren't looking at a UPL that may have already been freed by the
4588		 * preceding aborts/completions.
4589		 */
4590		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4591			if (upl_page_present(pl, --pg_index))
4592				break;
4593			if (pg_index == 0) {
4594				ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4595				goto pageout_done;
4596			}
4597		}
4598
4599		/*
4600		 * initialize the offset variables before we touch the UPL.
4601		 * a_f_offset is the position into the file, in bytes
4602		 * offset is the position into the UPL, in bytes
4603		 * pg_index is the pg# of the UPL we're operating on.
4604		 * isize is the offset into the UPL of the last non-clean page.
4605		 */
4606		isize = ((pg_index + 1) * PAGE_SIZE);
4607
4608		offset = 0;
4609		pg_index = 0;
4610
4611		while (isize) {
4612			int  xsize;
4613			int  num_of_pages;
4614
4615			if ( !upl_page_present(pl, pg_index)) {
4616				/*
4617				 * we asked for RET_ONLY_DIRTY, so it's possible
4618				 * to get back empty slots in the UPL.
4619				 * just skip over them
4620				 */
4621				f_offset += PAGE_SIZE;
4622				offset   += PAGE_SIZE;
4623				isize    -= PAGE_SIZE;
4624				pg_index++;
4625
4626				continue;
4627			}
4628			if ( !upl_dirty_page(pl, pg_index)) {
4629				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4630			}
4631
4632			/*
4633			 * We know that we have at least one dirty page.
4634			 * Now checking to see how many in a row we have
4635			 */
4636			num_of_pages = 1;
4637			xsize = isize - PAGE_SIZE;
4638
4639			while (xsize) {
4640				if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4641					break;
4642				num_of_pages++;
4643				xsize -= PAGE_SIZE;
4644			}
4645			xsize = num_of_pages * PAGE_SIZE;
4646
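			/* For non-swap files, clear any invalid ranges covered by this pageout before issuing the I/O */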
4647			if (!vnode_isswap(vp)) {
4648				off_t end_of_range;
4649				int tooklock;
4650
4651				tooklock = 0;
4652
4653				if (cp->c_lockowner != current_thread()) {
4654					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4655						/*
4656						 * we're in the v2 path, so we are the
4657						 * owner of the UPL... we may have already
4658						 * processed some of the UPL, so abort it
4659						 * from the current working offset to the
4660						 * end of the UPL
4661						 */
4662						ubc_upl_abort_range(upl,
4663								    offset,
4664								    ap->a_size - offset,
4665								    UPL_ABORT_FREE_ON_EMPTY);
4666						goto pageout_done;
4667					}
4668					tooklock = 1;
4669				}
4670				end_of_range = f_offset + xsize - 1;
4671
4672				if (end_of_range >= filesize) {
4673					end_of_range = (off_t)(filesize - 1);
4674				}
4675				if (f_offset < filesize) {
4676					rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
4677					cp->c_flag |= C_MODIFIED;  /* leof is dirty */
4678				}
4679				if (tooklock) {
4680					hfs_unlock(cp);
4681				}
4682			}
4683			if ((error = cluster_pageout(vp, upl, offset, f_offset,
4684							xsize, filesize, a_flags))) {
4685				if (error_ret == 0)
4686					error_ret = error;
4687			}
4688			f_offset += xsize;
4689			offset   += xsize;
4690			isize    -= xsize;
4691			pg_index += num_of_pages;
4692		}
4693		/* capture errnos bubbled out of cluster_pageout if they occurred */
4694		if (error_ret != 0) {
4695			retval = error_ret;
4696		}
4697	} /* end block for v2 pageout behavior */
4698	else {
4699		if (!vnode_isswap(vp)) {
4700			off_t end_of_range;
4701			int tooklock = 0;
4702
4703			if (cp->c_lockowner != current_thread()) {
4704				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4705					if (!(a_flags & UPL_NOCOMMIT)) {
4706						ubc_upl_abort_range(upl,
4707								    a_pl_offset,
4708								    ap->a_size,
4709								    UPL_ABORT_FREE_ON_EMPTY);
4710					}
4711					goto pageout_done;
4712				}
4713				tooklock = 1;
4714			}
4715			end_of_range = ap->a_f_offset + ap->a_size - 1;
4716
4717			if (end_of_range >= filesize) {
4718				end_of_range = (off_t)(filesize - 1);
4719			}
4720			if (ap->a_f_offset < filesize) {
4721				rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
4722				cp->c_flag |= C_MODIFIED;  /* leof is dirty */
4723			}
4724
4725			if (tooklock) {
4726				hfs_unlock(cp);
4727			}
4728		}
4729		/*
4730		 * just call cluster_pageout for old pre-v2 behavior
4731		 */
4732		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4733				ap->a_size, filesize, a_flags);
4734	}
4735
4736	/*
4737	 * If data was written, update the modification time of the file.
4738	 * If setuid or setgid bits are set and this process is not the
4739	 * superuser then clear the setuid and setgid bits as a precaution
4740	 * against tampering.
4741	 */
4742	if (retval == 0) {
4743		cp->c_touch_modtime = TRUE;
4744		cp->c_touch_chgtime = TRUE;
4745		if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4746		    (vfs_context_suser(ap->a_context) != 0)) {
4747			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4748			cp->c_mode &= ~(S_ISUID | S_ISGID);
4749			hfs_unlock(cp);
4750		}
4751	}
4752
4753pageout_done:
4754	if (is_pageoutv2) {
4755		/*
4756		 * Release the truncate lock.  Note that because
4757		 * we may have taken the lock recursively by
4758		 * being invoked via ubc_msync due to lockdown,
4759		 * we should release it recursively, too.
4760		 */
4761		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4762	}
4763	return (retval);
4764}
4765
4766/*
4767 * Intercept B-Tree node writes to unswap them if necessary.
4768 */
4769int
4770hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4771{
4772	int retval = 0;
4773	register struct buf *bp = ap->a_bp;
4774	register struct vnode *vp = buf_vnode(bp);
4775	BlockDescriptor block;
4776
4777	/* Trap B-Tree writes */
4778	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4779	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4780	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4781	    (vp == VTOHFS(vp)->hfc_filevp)) {
4782
4783		/*
4784		 * Swap and validate the node if it is in native byte order.
4785		 * This is always true on big endian, so we always validate
4786		 * before writing here.  On little endian, the node typically has
4787		 * been swapped and validated when it was written to the journal,
4788		 * so we won't do anything here.
4789		 */
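		/*
		 * The last u_int16_t in the node is the offset of record 0, which is
		 * always 14 (0x000e == sizeof(BTNodeDescriptor)) when read in host
		 * byte order; if it matches, the node is still in native order.
		 */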
4790		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4791			/* Prepare the block pointer */
4792			block.blockHeader = bp;
4793			block.buffer = (char *)buf_dataptr(bp);
4794			block.blockNum = buf_lblkno(bp);
4795			/* not found in cache ==> came from disk */
4796			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4797			block.blockSize = buf_count(bp);
4798
4799			/* Endian un-swap B-Tree node */
4800			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4801			if (retval)
4802				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4803		}
4804	}
4805
4806	/* This buffer shouldn't be locked anymore but if it is clear it */
4807	if ((buf_flags(bp) & B_LOCKED)) {
4808	        // XXXdbg
4809	        if (VTOHFS(vp)->jnl) {
4810		        panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4811		}
4812		buf_clearflags(bp, B_LOCKED);
4813	}
4814	retval = vn_bwrite (ap);
4815
4816	return (retval);
4817}
4818
4819/*
4820 * Relocate a file to a new location on disk
4821 *  cnode must be locked on entry
4822 *
4823 * Relocation occurs by cloning the file's data from its
4824 * current set of blocks to a new set of blocks. During
4825 * the relocation all of the blocks (old and new) are
4826 * owned by the file.
4827 *
4828 * -----------------
4829 * |///////////////|
4830 * -----------------
4831 * 0               N (file offset)
4832 *
4833 * -----------------     -----------------
4834 * |///////////////|     |               |     STEP 1 (acquire new blocks)
4835 * -----------------     -----------------
4836 * 0               N     N+1             2N
4837 *
4838 * -----------------     -----------------
4839 * |///////////////|     |///////////////|     STEP 2 (clone data)
4840 * -----------------     -----------------
4841 * 0               N     N+1             2N
4842 *
4843 *                       -----------------
4844 *                       |///////////////|     STEP 3 (head truncate blocks)
4845 *                       -----------------
4846 *                       0               N
4847 *
4848 * During steps 2 and 3 page-outs to file offsets less
4849 * than or equal to N are suspended.
4850 *
4851 * During step 3 page-ins to the file get suspended.
4852 */
4853int
4854hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
4855	struct  proc *p)
4856{
4857	struct  cnode *cp;
4858	struct  filefork *fp;
4859	struct  hfsmount *hfsmp;
4860	u_int32_t  headblks;
4861	u_int32_t  datablks;
4862	u_int32_t  blksize;
4863	u_int32_t  growsize;
4864	u_int32_t  nextallocsave;
4865	daddr64_t  sector_a,  sector_b;
4866	int eflags;
4867	off_t  newbytes;
4868	int  retval;
4869	int lockflags = 0;
4870	int took_trunc_lock = 0;
4871	int started_tr = 0;
4872	enum vtype vnodetype;
4873
4874	vnodetype = vnode_vtype(vp);
4875	if (vnodetype != VREG) {
4876		/* Not allowed to move symlinks. */
4877		return (EPERM);
4878	}
4879
4880	hfsmp = VTOHFS(vp);
4881	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
4882		return (ENOSPC);
4883	}
4884
4885	cp = VTOC(vp);
4886	fp = VTOF(vp);
4887	if (fp->ff_unallocblocks)
4888		return (EINVAL);
4889
4890#if CONFIG_PROTECT
4891	/*
4892	 * <rdar://problem/9118426>
4893	 * Disable HFS file relocation on content-protected filesystems
4894	 */
4895	if (cp_fs_protected (hfsmp->hfs_mp)) {
4896		return EINVAL;
4897	}
4898#endif
4899	/* If it's an SSD, also disable HFS relocation */
4900	if (hfsmp->hfs_flags & HFS_SSD) {
4901		return EINVAL;
4902	}
4903
4904
4905	blksize = hfsmp->blockSize;
4906	if (blockHint == 0)
4907		blockHint = hfsmp->nextAllocation;
4908
4909	if (fp->ff_size > 0x7fffffff) {
4910		return (EFBIG);
4911	}
4912
4913	//
4914	// We do not believe that this call to hfs_fsync() is
4915	// necessary and it causes a journal transaction
4916	// deadlock so we are removing it.
4917	//
4918	//if (vnodetype == VREG && !vnode_issystem(vp)) {
4919	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
4920	//	if (retval)
4921	//		return (retval);
4922	//}
4923
4924	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
4925		hfs_unlock(cp);
4926		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4927		/* Force the lock since callers expect the lock to be held. */
4928		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
4929			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4930			return (retval);
4931		}
4932		/* No need to continue if file was removed. */
4933		if (cp->c_flag & C_NOEXISTS) {
4934			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4935			return (ENOENT);
4936		}
4937		took_trunc_lock = 1;
4938	}
4939	headblks = fp->ff_blocks;
4940	datablks = howmany(fp->ff_size, blksize);
4941	growsize = datablks * blksize;
4942	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
4943	if (blockHint >= hfsmp->hfs_metazone_start &&
4944	    blockHint <= hfsmp->hfs_metazone_end)
4945		eflags |= kEFMetadataMask;
4946
4947	if (hfs_start_transaction(hfsmp) != 0) {
4948		if (took_trunc_lock)
4949			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4950		return (EINVAL);
4951	}
4952	started_tr = 1;
4953	/*
4954	 * Protect the extents b-tree and the allocation bitmap
4955	 * during MapFileBlockC and ExtendFileC operations.
4956	 */
4957	lockflags = SFL_BITMAP;
4958	if (overflow_extents(fp))
4959		lockflags |= SFL_EXTENTS;
4960	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4961
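	/* Find the device sector backing the last byte of the file's current data */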
4962	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
4963	if (retval) {
4964		retval = MacToVFSError(retval);
4965		goto out;
4966	}
4967
4968	/*
4969	 * STEP 1 - acquire new allocation blocks.
4970	 */
4971	nextallocsave = hfsmp->nextAllocation;
4972	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
4973	if (eflags & kEFMetadataMask) {
4974		hfs_lock_mount(hfsmp);
4975		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
4976		MarkVCBDirty(hfsmp);
4977		hfs_unlock_mount(hfsmp);
4978	}
4979
4980	retval = MacToVFSError(retval);
4981	if (retval == 0) {
4982		cp->c_flag |= C_MODIFIED;
4983		if (newbytes < growsize) {
4984			retval = ENOSPC;
4985			goto restore;
4986		} else if (fp->ff_blocks < (headblks + datablks)) {
4987			printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
4988			retval = ENOSPC;
4989			goto restore;
4990		}
4991
4992		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
4993		if (retval) {
4994			retval = MacToVFSError(retval);
4995		} else if ((sector_a + 1) == sector_b) {
4996			retval = ENOSPC;
4997			goto restore;
4998		} else if ((eflags & kEFMetadataMask) &&
4999		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5000		              hfsmp->hfs_metazone_end)) {
5001#if 0
5002			const char * filestr;
5003			char emptystr = '\0';
5004
5005			if (cp->c_desc.cd_nameptr != NULL) {
5006				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5007			} else if (vnode_name(vp) != NULL) {
5008				filestr = vnode_name(vp);
5009			} else {
5010				filestr = &emptystr;
5011			}
5012#endif
5013			retval = ENOSPC;
5014			goto restore;
5015		}
5016	}
5017	/* Done with system locks and journal for now. */
5018	hfs_systemfile_unlock(hfsmp, lockflags);
5019	lockflags = 0;
5020	hfs_end_transaction(hfsmp);
5021	started_tr = 0;
5022
5023	if (retval) {
5024		/*
5025		 * Check to see if failure is due to excessive fragmentation.
5026		 */
5027		if ((retval == ENOSPC) &&
5028		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5029			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5030		}
5031		goto out;
5032	}
5033	/*
5034	 * STEP 2 - clone file data into the new allocation blocks.
5035	 */
5036
5037	if (vnodetype == VLNK)
5038		retval = EPERM;
5039	else if (vnode_issystem(vp))
5040		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5041	else
5042		retval = hfs_clonefile(vp, headblks, datablks, blksize);
5043
5044	/* Start transaction for step 3 or for a restore. */
5045	if (hfs_start_transaction(hfsmp) != 0) {
5046		retval = EINVAL;
5047		goto out;
5048	}
5049	started_tr = 1;
5050	if (retval)
5051		goto restore;
5052
5053	/*
5054	 * STEP 3 - switch to cloned data and remove old blocks.
5055	 */
5056	lockflags = SFL_BITMAP;
5057	if (overflow_extents(fp))
5058		lockflags |= SFL_EXTENTS;
5059	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5060
5061	retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5062
5063	hfs_systemfile_unlock(hfsmp, lockflags);
5064	lockflags = 0;
5065	if (retval)
5066		goto restore;
5067out:
5068	if (took_trunc_lock)
5069		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5070
5071	if (lockflags) {
5072		hfs_systemfile_unlock(hfsmp, lockflags);
5073		lockflags = 0;
5074	}
5075
5076	/* Push cnode's new extent data to disk. */
5077	if (retval == 0) {
5078		(void) hfs_update(vp, MNT_WAIT);
5079	}
5080	if (hfsmp->jnl) {
5081		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5082			(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5083		else
5084			(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
5085	}
5086exit:
5087	if (started_tr)
5088		hfs_end_transaction(hfsmp);
5089
5090	return (retval);
5091
5092restore:
5093	if (fp->ff_blocks == headblks) {
5094		if (took_trunc_lock)
5095			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5096		goto exit;
5097	}
5098	/*
5099	 * Give back any newly allocated space.
5100	 */
5101	if (lockflags == 0) {
5102		lockflags = SFL_BITMAP;
5103		if (overflow_extents(fp))
5104			lockflags |= SFL_EXTENTS;
5105		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5106	}
5107
5108	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5109						 FTOC(fp)->c_fileid, false);
5110
5111	hfs_systemfile_unlock(hfsmp, lockflags);
5112	lockflags = 0;
5113
5114	if (took_trunc_lock)
5115		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5116	goto exit;
5117}
5118
5119
5120/*
5121 * Clone a file's data within the file.
5122 *
5123 */
5124static int
5125hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5126{
5127	caddr_t  bufp;
5128	size_t  bufsize;
5129	size_t  copysize;
5130	size_t  iosize;
5131	size_t  offset;
5132	off_t	writebase;
5133	uio_t auio;
5134	int  error = 0;
5135
5136	writebase = blkstart * blksize;
5137	copysize = blkcnt * blksize;
5138	iosize = bufsize = MIN(copysize, 128 * 1024);
5139	offset = 0;
5140
5141	hfs_unlock(VTOC(vp));
5142
5143#if CONFIG_PROTECT
5144	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5145		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5146		return (error);
5147	}
5148#endif /* CONFIG_PROTECT */
5149
5150	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5151		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5152		return (ENOMEM);
5153	}
5154
5155	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5156
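	/* Copy the data to its new blocks in bufsize chunks, using uncached (IO_NOCACHE) reads and writes */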
5157	while (offset < copysize) {
5158		iosize = MIN(copysize - offset, iosize);
5159
5160		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5161		uio_addiov(auio, (uintptr_t)bufp, iosize);
5162
5163		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5164		if (error) {
5165			printf("hfs_clonefile: cluster_read failed - %d\n", error);
5166			break;
5167		}
5168		if (uio_resid(auio) != 0) {
5169			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5170			error = EIO;
5171			break;
5172		}
5173
5174		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5175		uio_addiov(auio, (uintptr_t)bufp, iosize);
5176
5177		error = cluster_write(vp, auio, writebase + offset,
5178		                      writebase + offset + iosize,
5179		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5180		if (error) {
5181			printf("hfs_clonefile: cluster_write failed - %d\n", error);
5182			break;
5183		}
5184		if (uio_resid(auio) != 0) {
5185			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5186			error = EIO;
5187			break;
5188		}
5189		offset += iosize;
5190	}
5191	uio_free(auio);
5192
5193	if ((blksize & PAGE_MASK)) {
5194		/*
5195		 * since the copy may not have started on a PAGE
5196		 * boundary (or may not have ended on one), we
5197		 * may have pages left in the cache since NOCACHE
5198		 * will let partially written pages linger...
5199		 * let's just flush the entire range to make sure
5200		 * we don't have any pages left that are beyond
5201		 * (or intersect) the real LEOF of this file
5202		 */
5203		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5204	} else {
5205		/*
5206		 * No need to call ubc_sync_range or hfs_invalbuf
5207		 * since the file was copied using IO_NOCACHE and
5208		 * the copy was done starting and ending on a page
5209		 * boundary in the file.
5210		 */
5211	}
5212	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5213
5214	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5215	return (error);
5216}
5217
5218/*
5219 * Clone a system (metadata) file.
5220 *
5221 */
5222static int
5223hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5224                 kauth_cred_t cred, struct proc *p)
5225{
5226	caddr_t  bufp;
5227	char * offset;
5228	size_t  bufsize;
5229	size_t  iosize;
5230	struct buf *bp = NULL;
5231	daddr64_t  blkno;
5232 	daddr64_t  blk;
5233	daddr64_t  start_blk;
5234	daddr64_t  last_blk;
5235	int  breadcnt;
5236	int  i;
5237	int  error = 0;
5238
5239
5240	iosize = GetLogicalBlockSize(vp);
5241	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5242	breadcnt = bufsize / iosize;
5243
5244	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5245		return (ENOMEM);
5246	}
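	/* Convert the allocation-block start and count into device logical blocks */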
5247	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5248	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
5249	blkno = 0;
5250
5251	while (blkno < last_blk) {
5252		/*
5253		 * Read up to a megabyte
5254		 */
5255		offset = bufp;
5256		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5257			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5258			if (error) {
5259				printf("hfs_clonesysfile: meta_bread error %d\n", error);
5260				goto out;
5261			}
5262			if (buf_count(bp) != iosize) {
5263				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5264				goto out;
5265			}
5266			bcopy((char *)buf_dataptr(bp), offset, iosize);
5267
5268			buf_markinvalid(bp);
5269			buf_brelse(bp);
5270			bp = NULL;
5271
5272			offset += iosize;
5273		}
5274
5275		/*
5276		 * Write up to a megabyte
5277		 */
5278		offset = bufp;
5279		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5280			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5281			if (bp == NULL) {
5282				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5283				error = EIO;
5284				goto out;
5285			}
5286			bcopy(offset, (char *)buf_dataptr(bp), iosize);
5287			error = (int)buf_bwrite(bp);
5288			bp = NULL;
5289			if (error)
5290				goto out;
5291			offset += iosize;
5292		}
5293	}
5294out:
5295	if (bp) {
5296		buf_brelse(bp);
5297	}
5298
5299	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5300
5301	error = hfs_fsync(vp, MNT_WAIT, 0, p);
5302
5303	return (error);
5304}
5305