/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*	@(#)hfs_readwrite.c	1.0
 *
 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
 *
 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
 *
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/buf_internal.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/uio.h>
#include <sys/vfs_context.h>
#include <sys/fsevents.h>
#include <kern/kalloc.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <sys/fsctl.h>
#include <sys/mount_internal.h>

#include <miscfs/specfs/specdev.h>

#include <sys/ubc.h>
#include <sys/ubc_internal.h>

#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>

#include <sys/kdebug.h>

#include	"hfs.h"
#include	"hfs_attrlist.h"
#include	"hfs_endian.h"
#include	"hfs_fsctl.h"
#include	"hfs_quota.h"
#include	"hfscommon/headers/FileMgrInternal.h"
#include	"hfscommon/headers/BTreesInternal.h"
#include	"hfs_cnode.h"
#include	"hfs_dbg.h"

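/*
 * A transfer is eligible for the cluster layer only if its size is an
 * exact multiple of 4 KB and no more than half of MAXPHYSIO.
 */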
#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))

enum {
	MAXHFSFILESIZE = 0x7FFFFFFF		/* this needs to go in the mount structure */
};

/* from bsd/hfs/hfs_vfsops.c */
extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

static int  hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
static int  hfs_clonefile(struct vnode *, int, int, int);
static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
static int  hfs_minorupdate(struct vnode *vp);
static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);


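/*
 * When non-zero (settable through the sysctl below), writes to uncached
 * files are followed by a DKIOCSYNCHRONIZECACHE ioctl to the underlying
 * device; see the radar 4871353 workaround in hfs_vnop_write.
 */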
int flush_cache_on_write = 0;
SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");

/*
 * Read data from a file.
 */
int
hfs_vnop_read(struct vnop_read_args *ap)
{
	/*
	   struct vnop_read_args {
	   struct vnodeop_desc *a_desc;
	   vnode_t a_vp;
	   struct uio *a_uio;
	   int a_ioflag;
	   vfs_context_t a_context;
	   };
	 */

	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	off_t filesize;
	off_t filebytes;
	off_t start_resid = uio_resid(uio);
	off_t offset = uio_offset(uio);
	int retval = 0;
	int took_truncate_lock = 0;
	int io_throttle = 0;

	/* Preflight checks */
	if (!vnode_isreg(vp)) {
		/* can only read regular files */
		if (vnode_isdir(vp))
			return (EISDIR);
		else
			return (EPERM);
	}
	if (start_resid == 0)
		return (0);		/* Nothing left to do */
	if (offset < 0)
		return (EINVAL);	/* can't read from a negative offset */

#if HFS_COMPRESSION
	if (VNODE_IS_RSRC(vp)) {
		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
			return 0;
		}
		/* otherwise read the resource fork normally */
	} else {
		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
		if (compressed) {
			retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
			if (compressed) {
				if (retval == 0) {
					/* successful read, update the access time */
					VTOC(vp)->c_touch_acctime = TRUE;

					/* compressed files are not hot file candidates */
					if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
						VTOF(vp)->ff_bytesread = 0;
					}
				}
				return retval;
			}
			/* otherwise the file was converted back to a regular file while we were reading it */
			retval = 0;
		} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
			int error;

			error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
			if (error) {
				return error;
			}

		}
	}
#endif /* HFS_COMPRESSION */

	cp = VTOC(vp);
	fp = VTOF(vp);
	hfsmp = VTOHFS(vp);

#if CONFIG_PROTECT
	if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
		goto exit;
	}
#endif

	/*
	 * If this read request originated from a syscall (as opposed to
	 * an in-kernel page fault or something), then set it up for
	 * throttle checks.  For example, large EAs may cause a VNOP_READ
	 * to occur, and we wouldn't want to throttle I/O while holding the
	 * EA B-Tree lock.
	 */
	if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
		io_throttle = IO_RETURN_ON_THROTTLE;
	}

read_again:

	/* Protect against a size change. */
	hfs_lock_truncate(cp, HFS_SHARED_LOCK);
	took_truncate_lock = 1;

	filesize = fp->ff_size;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
	if (offset > filesize) {
		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
		    (offset > (off_t)MAXHFSFILESIZE)) {
			retval = EFBIG;
		}
		goto exit;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);

	retval = cluster_read(vp, uio, filesize, ap->a_ioflag | (io_throttle));

	cp->c_touch_acctime = TRUE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);

	/*
	 * Keep track of blocks read.
	 */
	if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
		int took_cnode_lock = 0;
		off_t bytesread;

		bytesread = start_resid - uio_resid(uio);

		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			took_cnode_lock = 1;
		}
		/*
		 * If this file hasn't been seen since the start of
		 * the current sampling period then start over.
		 */
		if (cp->c_atime < hfsmp->hfc_timebase) {
			struct timeval tv;

			fp->ff_bytesread = bytesread;
			microtime(&tv);
			cp->c_atime = tv.tv_sec;
		} else {
			fp->ff_bytesread += bytesread;
		}
		if (took_cnode_lock)
			hfs_unlock(cp);
	}
exit:
	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
	}
	if (retval == EAGAIN) {
		throttle_lowpri_io(1);

		retval = 0;
		goto read_again;
	}
	return (retval);
}

/*
 * Write data to a file.
 */
int
hfs_vnop_write(struct vnop_write_args *ap)
{
	uio_t uio = ap->a_uio;
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	kauth_cred_t cred = NULL;
	off_t origFileSize;
	off_t writelimit;
	off_t bytesToAdd = 0;
	off_t actualBytesAdded;
	off_t filebytes;
	off_t offset;
	ssize_t resid;
	int eflags;
	int ioflag = ap->a_ioflag;
	int retval = 0;
	int lockflags;
	int cnode_locked = 0;
	int partialwrite = 0;
	int do_snapshot = 1;
	time_t orig_ctime=VTOC(vp)->c_ctime;
	int took_truncate_lock = 0;
	int io_return_on_throttle = 0;
	struct rl_entry *invalid_range;

#if HFS_COMPRESSION
	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
		switch(state) {
			case FILE_IS_COMPRESSED:
				return EACCES;
			case FILE_IS_CONVERTING:
				/* if FILE_IS_CONVERTING, we allow writes but do not
				   bother with snapshots or else we will deadlock.
				*/
				do_snapshot = 0;
				break;
			default:
				printf("invalid state %d for compressed file\n", state);
				/* fall through */
		}
	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
		int error;

		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
		if (error != 0) {
			return error;
		}
	}

	if (do_snapshot) {
		check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
	}

#endif

	// LP64todo - fix this! uio_resid may be 64-bit value
	resid = uio_resid(uio);
	offset = uio_offset(uio);

	if (offset < 0)
		return (EINVAL);
	if (resid == 0)
		return (E_NONE);
	if (!vnode_isreg(vp))
		return (EPERM);  /* Can only write regular files */

	cp = VTOC(vp);
	fp = VTOF(vp);
	hfsmp = VTOHFS(vp);

#if CONFIG_PROTECT
	if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
		goto exit;
	}
#endif

	eflags = kEFDeferMask;	/* defer file block allocations */
#if HFS_SPARSE_DEV
	/*
	 * When the underlying device is sparse and space
	 * is low (< 8MB), stop doing delayed allocations
	 * and begin doing synchronous I/O.
	 */
	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
	    (hfs_freeblks(hfsmp, 0) < 2048)) {
		eflags &= ~kEFDeferMask;
		ioflag |= IO_SYNC;
	}
#endif /* HFS_SPARSE_DEV */

	if ((ioflag & (IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) ==
			(IO_SINGLE_WRITER | IO_RETURN_ON_THROTTLE)) {
		io_return_on_throttle = IO_RETURN_ON_THROTTLE;
	}
again:
	/*
	 * Protect against a size change.
	 *
	 * Note: If took_truncate_lock is true, then we previously got the lock shared
	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
	 * start.
	 */
	if (ioflag & IO_APPEND || took_truncate_lock) {
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
	}
	else {
		hfs_lock_truncate(cp, HFS_SHARED_LOCK);
	}
	took_truncate_lock = 1;

	/* Update UIO */
	if (ioflag & IO_APPEND) {
		uio_setoffset(uio, fp->ff_size);
		offset = fp->ff_size;
	}
	if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
		retval = EPERM;
		goto exit;
	}

	origFileSize = fp->ff_size;
	writelimit = offset + resid;
	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;

	/*
	 * We may need an exclusive truncate lock for several reasons, all
	 * of which are because we may be writing to a (portion of a) block
	 * for the first time, and we need to make sure no readers see the
	 * prior, uninitialized contents of the block.  The cases are:
	 *
	 * 1. We have unallocated (delayed allocation) blocks.  We may be
	 *    allocating new blocks to the file and writing to them.
	 *    (A more precise check would be whether the range we're writing
	 *    to contains delayed allocation blocks.)
	 * 2. We need to extend the file.  The bytes between the old EOF
	 *    and the new EOF are not yet initialized.  This is important
	 *    even if we're not allocating new blocks to the file.  If the
	 *    old EOF and new EOF are in the same block, we still need to
	 *    protect that range of bytes until they are written for the
	 *    first time.
	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
	 *    part of the file has been allocated, but not yet written).
	 *
	 * If we had a shared lock with the above cases, we need to try to upgrade
	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
	 * lock, and will need to take the truncate lock again; the took_truncate_lock
	 * flag will still be set, causing us to try for an exclusive lock next time.
	 *
	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
	 * lock is held, since it protects the range lists.
	 */
	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
	    ((fp->ff_unallocblocks != 0) ||
	     (writelimit > origFileSize))) {
		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
			/*
			 * Lock upgrade failed and we lost our shared lock, try again.
			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
			 * set to 1 will cause us to try to get the lock exclusive.
			 */
			goto again;
		}
		else {
			/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
			cp->c_truncatelockowner = current_thread();
		}
	}

	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
		goto exit;
	}
	cnode_locked = 1;

	/*
	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
	 */
	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
	    (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
	    	/*
		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
		 * not behave like a "try_lock").  Since this condition is rare, just drop the
		 * cnode lock and try again.  Since took_truncate_lock is set, we will
		 * automatically take the truncate lock exclusive.
		 */
		hfs_unlock(cp);
		cnode_locked = 0;
		hfs_unlock_truncate(cp, 0);
		goto again;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
		     (int)offset, uio_resid(uio), (int)fp->ff_size,
		     (int)filebytes, 0);

	/* Check if we do not need to extend the file */
	if (writelimit <= filebytes) {
		goto sizeok;
	}

	cred = vfs_context_ucred(ap->a_context);
	bytesToAdd = writelimit - filebytes;

#if QUOTA
	retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
			   cred, 0);
	if (retval)
		goto exit;
#endif /* QUOTA */

	if (hfs_start_transaction(hfsmp) != 0) {
		retval = EINVAL;
		goto exit;
	}

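	/*
	 * Grow the fork one ExtendFileC call at a time, with the extents
	 * b-tree and allocation bitmap locked around each call, until the
	 * allocation covers the write range or the volume runs out of space.
	 */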
	while (writelimit > filebytes) {
		bytesToAdd = writelimit - filebytes;
		if (cred && suser(cred, NULL) != 0)
			eflags |= kEFReserveMask;

		/* Protect extents b-tree and allocation bitmap */
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

		/* Files that are changing size are not hot file candidates. */
		if (hfsmp->hfc_stage == HFC_RECORDING) {
			fp->ff_bytesread = 0;
		}
		retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
				0, eflags, &actualBytesAdded));

		hfs_systemfile_unlock(hfsmp, lockflags);

		if ((actualBytesAdded == 0) && (retval == E_NONE))
			retval = ENOSPC;
		if (retval != E_NONE)
			break;
		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
			(int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
	}
	(void) hfs_update(vp, TRUE);
	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
	(void) hfs_end_transaction(hfsmp);

	/*
	 * If we didn't grow the file enough, try a partial write.
	 * POSIX expects this behavior.
	 */
	if ((retval == ENOSPC) && (filebytes > offset)) {
		retval = 0;
		partialwrite = 1;
		uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
		resid -= bytesToAdd;
		writelimit = filebytes;
	}
sizeok:
	if (retval == E_NONE) {
		off_t filesize;
		off_t zero_off;
		off_t tail_off;
		off_t inval_start;
		off_t inval_end;
		off_t io_start;
		int lflag;

		if (writelimit > fp->ff_size)
			filesize = writelimit;
		else
			filesize = fp->ff_size;

		lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);

		if (offset <= fp->ff_size) {
			zero_off = offset & ~PAGE_MASK_64;

			/* Check whether the area between zero_off and the start
			   of the transfer is invalid and should be zero-filled
			   as part of the transfer:
			 */
			if (offset > zero_off) {
			        if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
				        lflag |= IO_HEADZEROFILL;
			}
		} else {
			off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;

			/* The bytes between fp->ff_size and uio->uio_offset must never be
			   read without being zeroed.  The current last block is filled with zeroes
			   if it holds valid data, but in all cases merely do a little bookkeeping
			   to track the area from the end of the current last page to the start of
			   the area actually written.  For the same reason only the bytes up to the
			   start of the page where this write will start are invalidated; any remainder
			   before uio->uio_offset is explicitly zeroed as part of the cluster_write.

			   Note that inval_start, the start of the page after the current EOF,
			   may be past the start of the write, in which case the zeroing
			   will be handled by the cluster_write of the actual data.
			 */
			inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
			inval_end = offset & ~PAGE_MASK_64;
			zero_off = fp->ff_size;

			if ((fp->ff_size & PAGE_MASK_64) &&
				(rl_scan(&fp->ff_invalidranges,
							eof_page_base,
							fp->ff_size - 1,
							&invalid_range) != RL_NOOVERLAP)) {
				/* The page containing the EOF is not valid, so the
				   entire page must be made inaccessible now.  If the write
				   starts on a page beyond the page containing the eof
				   (inval_end > eof_page_base), add the
				   whole page to the range to be invalidated.  Otherwise
				   (i.e. if the write starts on the same page), zero-fill
				   the entire page explicitly now:
				 */
				if (inval_end > eof_page_base) {
					inval_start = eof_page_base;
				} else {
					zero_off = eof_page_base;
				};
			};

			if (inval_start < inval_end) {
				struct timeval tv;
				/* There's some range of data that's going to be marked invalid */

				if (zero_off < inval_start) {
					/* The pages between inval_start and inval_end are going to be invalidated,
					   and the actual write will start on a page past inval_end.  Now's the last
					   chance to zero-fill the page containing the EOF:
					 */
					hfs_unlock(cp);
					cnode_locked = 0;
					retval = cluster_write(vp, (uio_t) 0,
							fp->ff_size, inval_start,
							zero_off, (off_t)0,
							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
					hfs_lock(cp, HFS_FORCE_LOCK);
					cnode_locked = 1;
					if (retval) goto ioerr_exit;
					offset = uio_offset(uio);
				};

				/* Mark the remaining area of the newly allocated space as invalid: */
				rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
				microuptime(&tv);
				cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
				zero_off = fp->ff_size = inval_end;
			};

			if (offset > zero_off) lflag |= IO_HEADZEROFILL;
		};

		/* Check to see whether the area between the end of the write and the end of
		   the page it falls in is invalid and should be zero-filled as part of the transfer:
		 */
		tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
		if (tail_off > filesize) tail_off = filesize;
		if (tail_off > writelimit) {
			if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
				lflag |= IO_TAILZEROFILL;
			};
		};

		/*
		 * if the write starts beyond the current EOF (possibly advanced in the
		 * zeroing of the last block, above), then we'll zero fill from the current EOF
		 * to where the write begins:
		 *
		 * NOTE: If (and ONLY if) the portion of the file about to be written is
		 *       before the current EOF it might be marked as invalid now and must be
		 *       made readable (removed from the invalid ranges) before cluster_write
		 *       tries to write it:
		 */
		io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
		if (io_start < fp->ff_size) {
			off_t io_end;

			io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
			rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
		};

		hfs_unlock(cp);
		cnode_locked = 0;

		/*
		 * We need to tell UBC the fork's new size BEFORE calling
		 * cluster_write, in case any of the new pages need to be
		 * paged out before cluster_write completes (which does happen
		 * in embedded systems due to extreme memory pressure).
		 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
		 * will be, so that it can pass that on to cluster_pageout, and
		 * allow those pageouts.
		 *
		 * We don't update ff_size yet since we don't want pageins to
		 * be able to see uninitialized data between the old and new
		 * EOF, until cluster_write has completed and initialized that
		 * part of the file.
		 *
		 * The vnode pager relies on the file size last given to UBC via
		 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
		 * ff_size (whichever is larger).  NOTE: ff_new_size is always
		 * zero, unless we are extending the file via write.
		 */
		if (filesize > fp->ff_size) {
			fp->ff_new_size = filesize;
			ubc_setsize(vp, filesize);
		}
		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
				tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
		if (retval) {
			fp->ff_new_size = 0;	/* no longer extending; use ff_size */

			if (retval == EAGAIN) {
				/*
				 * EAGAIN indicates that we still have I/O to do, but
				 * that we now need to be throttled
				 */
				if (resid != uio_resid(uio)) {
					/*
					 * did manage to do some I/O before returning EAGAIN
					 */
					resid = uio_resid(uio);
					offset = uio_offset(uio);

					cp->c_touch_chgtime = TRUE;
					cp->c_touch_modtime = TRUE;
				}
				if (filesize > fp->ff_size) {
					/*
					 * we called ubc_setsize before the call to
					 * cluster_write... since we only partially
					 * completed the I/O, we need to
					 * re-adjust our idea of the filesize based
					 * on our interim EOF
					 */
					ubc_setsize(vp, offset);

					fp->ff_size = offset;
				}
				goto exit;
			}
			if (filesize > origFileSize) {
				ubc_setsize(vp, origFileSize);
			}
			goto ioerr_exit;
		}

		if (filesize > origFileSize) {
			fp->ff_size = filesize;

			/* Files that are changing size are not hot file candidates. */
			if (hfsmp->hfc_stage == HFC_RECORDING) {
				fp->ff_bytesread = 0;
			}
		}
		fp->ff_new_size = 0;	/* ff_size now has the correct size */

		/* If we wrote some bytes, then touch the change and mod times */
		if (resid > uio_resid(uio)) {
			cp->c_touch_chgtime = TRUE;
			cp->c_touch_modtime = TRUE;
		}
	}
	if (partialwrite) {
		uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
		resid += bytesToAdd;
	}

	// XXXdbg - see radar 4871353 for more info
	{
	    if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
		VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
	    }
	}

ioerr_exit:
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		cred = vfs_context_ucred(ap->a_context);
		if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
			if (!cnode_locked) {
				hfs_lock(cp, HFS_FORCE_LOCK);
				cnode_locked = 1;
			}
			cp->c_mode &= ~(S_ISUID | S_ISGID);
		}
	}
	if (retval) {
		if (ioflag & IO_UNIT) {
			if (!cnode_locked) {
				hfs_lock(cp, HFS_FORCE_LOCK);
				cnode_locked = 1;
			}
			(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
			                   0, 0, ap->a_context);
			// LP64todo - fix this!  resid needs to be user_ssize_t
			uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
			uio_setresid(uio, resid);
			filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
		}
	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
		if (!cnode_locked) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			cnode_locked = 1;
		}
		retval = hfs_update(vp, TRUE);
	}
	/* Updating vcbWrCnt doesn't need to be atomic. */
	hfsmp->vcbWrCnt++;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
exit:
	if (cnode_locked)
		hfs_unlock(cp);

	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
	}
	if (retval == EAGAIN) {
		throttle_lowpri_io(1);

		retval = 0;
		goto again;
	}
	return (retval);
}

/* support for the "bulk-access" fcntl */

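/*
 * The cache below remembers up to NUM_CACHE_ENTRIES directory ids in sorted
 * order so they can be binary searched; CACHE_LEVELS bounds how many
 * ancestors of a single item are recorded per access check.
 */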
#define CACHE_LEVELS 16
#define NUM_CACHE_ENTRIES (64*16)
#define PARENT_IDS_FLAG 0x100

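/*
 * Per-call cache mapping directory cnids to the errno-style result of the
 * last access check (0 meaning "has access"), used to short-circuit
 * repeated walks up the same parent chain.
 */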
struct access_cache {
       int numcached;
       int cachehits; /* these two for statistics gathering */
       int lookups;
       unsigned int *acache;
       unsigned char *haveaccess;
};

struct access_t {
	uid_t     uid;              /* IN: effective user id */
	short     flags;            /* IN: access requested (i.e. R_OK) */
	short     num_groups;       /* IN: number of groups user belongs to */
	int       num_files;        /* IN: number of files to process */
	int       *file_ids;        /* IN: array of file ids */
	gid_t     *groups;          /* IN: array of groups */
	short     *access;          /* OUT: access info for each file (0 for 'has access') */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_access_t {
	uid_t     uid;              /* IN: effective user id */
	short     flags;            /* IN: access requested (i.e. R_OK) */
	short     num_groups;       /* IN: number of groups user belongs to */
	int       num_files;        /* IN: number of files to process */
	user32_addr_t      file_ids;        /* IN: array of file ids */
	user32_addr_t      groups;          /* IN: array of groups */
	user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
};

struct user64_access_t {
	uid_t		uid;			/* IN: effective user id */
	short		flags;			/* IN: access requested (i.e. R_OK) */
	short		num_groups;		/* IN: number of groups user belongs to */
	int		num_files;		/* IN: number of files to process */
	user64_addr_t	file_ids;		/* IN: array of file ids */
	user64_addr_t	groups;			/* IN: array of groups */
	user64_addr_t	access;			/* OUT: access info for each file (0 for 'has access') */
};


// these are the "extended" versions of the above structures
// note that it is crucial that they be sized differently from
// the regular versions
struct ext_access_t {
	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
	uint32_t   num_files;       /* IN: number of files to process */
	uint32_t   map_size;        /* IN: size of the bit map */
	uint32_t  *file_ids;        /* IN: Array of file ids */
	char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
	short     *access;          /* OUT: access info for each file (0 for 'has access') */
	uint32_t   num_parents;   /* future use */
	cnid_t      *parents;   /* future use */
} __attribute__((unavailable)); // this structure is for reference purposes only

struct user32_ext_access_t {
	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
	uint32_t   num_files;       /* IN: number of files to process */
	uint32_t   map_size;        /* IN: size of the bit map */
	user32_addr_t  file_ids;        /* IN: Array of file ids */
	user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
	user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
	uint32_t   num_parents;   /* future use */
	user32_addr_t parents;   /* future use */
};

struct user64_ext_access_t {
	uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
	uint32_t      num_files;    /* IN: number of files to process */
	uint32_t      map_size;     /* IN: size of the bit map */
	user64_addr_t   file_ids;     /* IN: array of file ids */
	user64_addr_t   bitmap;       /* OUT: hash-bitmap of interesting directory ids */
	user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
	uint32_t      num_parents;/* future use */
	user64_addr_t   parents;/* future use */
};
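
/*
 * Illustrative userspace usage (a hedged sketch, not part of this file):
 * a caller fills in the extended request and hands it to fsctl(2), which
 * dispatches to do_bulk_access_check() below.  The HFS_EXT_BULKACCESS_FSCTL
 * request name is assumed to come from hfs_fsctl.h.
 *
 *	struct ext_access_t req = { 0 };
 *	uint32_t ids[2] = { id1, id2 };		// hypothetical file ids
 *	short results[2];
 *
 *	req.flags     = R_OK;
 *	req.num_files = 2;
 *	req.file_ids  = ids;
 *	req.access    = results;
 *	if (fsctl("/Volumes/MyVol", HFS_EXT_BULKACCESS_FSCTL, &req, 0) == 0) {
 *		// results[i] == 0 where the caller has access
 *	}
 */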


/*
 * Perform a binary search for the given parent_id. Return value is
 * the index if there is a match.  If no_match_indexp is non-NULL it
 * will be assigned with the index to insert the item (even if it was
 * not found).
 */
static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
{
    int index=-1;
    unsigned int lo=0;

    do {
	unsigned int mid = ((hi - lo)/2) + lo;
	unsigned int this_id = array[mid];

	if (parent_id == this_id) {
	    hi = mid;
	    break;
	}

	if (parent_id < this_id) {
	    hi = mid;
	    continue;
	}

	if (parent_id > this_id) {
	    lo = mid + 1;
	    continue;
	}
    } while(lo < hi);

    /* check if lo and hi converged on the match */
    if (parent_id == array[hi]) {
	index = hi;
    }

    if (no_match_indexp) {
	*no_match_indexp = hi;
    }

    return index;
}


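/*
 * Find parent_id in the cache.  Returns 1 and sets *indexp to the matching
 * slot on a hit; returns 0 and sets *indexp to the insertion point on a
 * miss.
 */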
static int
lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
{
    unsigned int hi;
    int matches = 0;
    int index, no_match_index;

    if (cache->numcached == 0) {
	*indexp = 0;
	return 0; // table is empty, so insert at index=0 and report no match
    }

    if (cache->numcached > NUM_CACHE_ENTRIES) {
	/*printf("hfs: EGAD! numcached is %d... cut our losses and trim to %d\n",
	  cache->numcached, NUM_CACHE_ENTRIES);*/
	cache->numcached = NUM_CACHE_ENTRIES;
    }

    hi = cache->numcached - 1;

    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);

    /* if no existing entry found, find index for new one */
    if (index == -1) {
	index = no_match_index;
	matches = 0;
    } else {
	matches = 1;
    }

    *indexp = index;
    return matches;
}

/*
 * Add a node to the access_cache at the given index (or do a lookup first
 * to find the index if -1 is passed in). We currently do a replace rather
 * than an insert if the cache is full.
 */
static void
add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
{
    int lookup_index = -1;

    /* need to do a lookup first if -1 passed for index */
    if (index == -1) {
	if (lookup_bucket(cache, &lookup_index, nodeID)) {
	    if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
		// only update an entry if the previous access was ESRCH (i.e. a scope checking error)
		cache->haveaccess[lookup_index] = access;
	    }

	    /* mission accomplished */
	    return;
	} else {
	    index = lookup_index;
	}

    }

    /* if the cache is full, do a replace rather than an insert */
    if (cache->numcached >= NUM_CACHE_ENTRIES) {
	//printf("hfs: cache is full (%d). replace at index %d\n", cache->numcached, index);
	cache->numcached = NUM_CACHE_ENTRIES-1;

	if (index > cache->numcached) {
	    //    printf("hfs: index %d pinned to %d\n", index, cache->numcached);
	    index = cache->numcached;
	}
    }

    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
	index++;
    }

    if (index >= 0 && index < cache->numcached) {
	/* only do bcopy if we're inserting */
	bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
	bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
    }

    cache->acache[index] = nodeID;
    cache->haveaccess[index] = access;
    cache->numcached++;
}


struct cinfo {
    uid_t   uid;
    gid_t   gid;
    mode_t  mode;
    cnid_t  parentcnid;
    u_int16_t recflags;
};

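/*
 * Callback for hfs_chash_snoop: copy the in-core cnode's ownership, mode,
 * parent id and record flags into the caller's struct cinfo.
 */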
static int
snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
{
    struct cinfo *cip = (struct cinfo *)arg;

    cip->uid = attrp->ca_uid;
    cip->gid = attrp->ca_gid;
    cip->mode = attrp->ca_mode;
    cip->parentcnid = descp->cd_parentcnid;
    cip->recflags = attrp->ca_recflags;

    return (0);
}

/*
 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
 * isn't incore, then go to the catalog.
 */
static int
do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
{
    int error = 0;

    /* if this id matches the one the fsctl was called with, skip the lookup */
    if (cnid == skip_cp->c_cnid) {
	cnattrp->ca_uid = skip_cp->c_uid;
	cnattrp->ca_gid = skip_cp->c_gid;
	cnattrp->ca_mode = skip_cp->c_mode;
	cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
	keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
    } else {
	struct cinfo c_info;

	/* otherwise, check the cnode hash in case the file/dir is incore */
	if (hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info) == 0) {
	    cnattrp->ca_uid = c_info.uid;
	    cnattrp->ca_gid = c_info.gid;
	    cnattrp->ca_mode = c_info.mode;
	    cnattrp->ca_recflags = c_info.recflags;
	    keyp->hfsPlus.parentID = c_info.parentcnid;
	} else {
	    int lockflags;

	    if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
		    throttle_lowpri_io(1);

	    lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);

	    /* lookup this cnid in the catalog */
	    error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);

	    hfs_systemfile_unlock(hfsmp, lockflags);

	    cache->lookups++;
	}
    }

    return (error);
}


/*
 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
 * up to CACHE_LEVELS as we progress towards the root.
 */
static int
do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
    struct vfs_context *my_context,
    char *bitmap,
    uint32_t map_size,
    cnid_t* parents,
    uint32_t num_parents)
{
    int                     myErr = 0;
    int                     myResult;
    HFSCatalogNodeID        thisNodeID;
    unsigned int            myPerms;
    struct cat_attr         cnattr;
    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
    CatalogKey              catkey;

    int i = 0, ids_to_cache = 0;
    int parent_ids[CACHE_LEVELS];

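    /*
     * Walk from nodeID up toward the root, stopping as soon as the cache
     * answers, a permission check fails, or we run out of ancestors.
     */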
    thisNodeID = nodeID;
    while (thisNodeID >=  kRootDirID) {
	myResult = 0;   /* default to "no access" */

	/* check the cache before resorting to hitting the catalog */

	/* ASSUMPTION: access info of cached entries is "final"... i.e. no need
	 * to look any further after hitting cached dir */

	if (lookup_bucket(cache, &cache_index, thisNodeID)) {
	    cache->cachehits++;
	    myErr = cache->haveaccess[cache_index];
	    if (scope_index != -1) {
		if (myErr == ESRCH) {
		    myErr = 0;
		}
	    } else {
		scope_index = 0;   // so we'll just use the cache result
		scope_idx_start = ids_to_cache;
	    }
	    myResult = (myErr == 0) ? 1 : 0;
	    goto ExitThisRoutine;
	}


	if (parents) {
	    int tmp;
	    tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
	    if (scope_index == -1)
		scope_index = tmp;
	    if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
		scope_idx_start = ids_to_cache;
	    }
	}

	/* remember which parents we want to cache */
	if (ids_to_cache < CACHE_LEVELS) {
	    parent_ids[ids_to_cache] = thisNodeID;
	    ids_to_cache++;
	}
	// Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
	if (bitmap && map_size) {
	    bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
	}


	/* do the lookup (checks the cnode hash, then the catalog) */
	myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
	if (myErr) {
	    goto ExitThisRoutine; /* no access */
	}

	/* Root always gets access. */
	if (suser(myp_ucred, NULL) == 0) {
		thisNodeID = catkey.hfsPlus.parentID;
		myResult = 1;
		continue;
	}

	// if the thing has acl's, do the full permission check
	if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
	    struct vnode *vp;

	    /* get the vnode for this cnid */
	    myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
	    if ( myErr ) {
		myResult = 0;
		goto ExitThisRoutine;
	    }

	    thisNodeID = VTOC(vp)->c_parentcnid;

	    hfs_unlock(VTOC(vp));

	    if (vnode_vtype(vp) == VDIR) {
		myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
	    } else {
		myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
	    }

	    vnode_put(vp);
	    if (myErr) {
		myResult = 0;
		goto ExitThisRoutine;
	    }
	} else {
	    unsigned int flags;
		int mode = cnattr.ca_mode & S_IFMT;
		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);

		if (mode == S_IFDIR) {
			flags = R_OK | X_OK;
		} else {
			flags = R_OK;
		}
		if ( (myPerms & flags) != flags) {
			myResult = 0;
			myErr = EACCES;
			goto ExitThisRoutine;   /* no access */
		}

	    /* up the hierarchy we go */
	    thisNodeID = catkey.hfsPlus.parentID;
	}
    }

    /* if here, we have access to this node */
    myResult = 1;

  ExitThisRoutine:
    if (parents && myErr == 0 && scope_index == -1) {
	myErr = ESRCH;
    }

    if (myErr) {
	myResult = 0;
    }
    *err = myErr;

    /* cache the parent directory(ies) */
    for (i = 0; i < ids_to_cache; i++) {
	if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
	    add_node(cache, -1, parent_ids[i], ESRCH);
	} else {
	    add_node(cache, -1, parent_ids[i], myErr);
	}
    }

    return (myResult);
}

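/*
 * Worker for the bulk-access fcntls: normalize the user's request structure
 * (32-bit or 64-bit, plain or extended), copy in the file id list, run
 * do_access_check() over each id, and copy the per-file results (plus the
 * optional parent bitmap) back out.
 */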
static int
do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
{
    boolean_t is64bit;

    /*
     * NOTE: on entry, the vnode has an io_ref. In case this vnode
     * happens to be in our list of file_ids, we'll note it so that we
     * avoid calling hfs_chashget_nowait() on that id, as that
     * will cause a "locking against myself" panic.
     */
    Boolean check_leaf = true;

    struct user64_ext_access_t *user_access_structp;
    struct user64_ext_access_t tmp_user_access;
    struct access_cache cache;

    int error = 0, prev_parent_check_ok=1;
    unsigned int i;

    short flags;
    unsigned int num_files = 0;
    int map_size = 0;
    int num_parents = 0;
    int *file_ids=NULL;
    short *access=NULL;
    char *bitmap=NULL;
    cnid_t *parents=NULL;
    int leaf_index;

    cnid_t cnid;
    cnid_t prevParent_cnid = 0;
    unsigned int myPerms;
    short myaccess = 0;
    struct cat_attr cnattr;
    CatalogKey catkey;
    struct cnode *skip_cp = VTOC(vp);
    kauth_cred_t cred = vfs_context_ucred(context);
    proc_t p = vfs_context_proc(context);

    is64bit = proc_is64bit(p);

    /* initialize the local cache and buffers */
    cache.numcached = 0;
    cache.cachehits = 0;
    cache.lookups = 0;
    cache.acache = NULL;
    cache.haveaccess = NULL;

    /* struct copyin done during dispatch... need to copy file_id array separately */
    if (ap->a_data == NULL) {
	error = EINVAL;
	goto err_exit_bulk_access;
    }

    if (is64bit) {
	if (arg_size != sizeof(struct user64_ext_access_t)) {
	    error = EINVAL;
	    goto err_exit_bulk_access;
	}

	user_access_structp = (struct user64_ext_access_t *)ap->a_data;

    } else if (arg_size == sizeof(struct user32_access_t)) {
	struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;

	// convert an old style bulk-access struct to the new style
	tmp_user_access.flags     = accessp->flags;
	tmp_user_access.num_files = accessp->num_files;
	tmp_user_access.map_size  = 0;
	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
	tmp_user_access.bitmap    = USER_ADDR_NULL;
	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
	tmp_user_access.num_parents = 0;
	user_access_structp = &tmp_user_access;

    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
	struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;

	// up-cast from a 32-bit version of the struct
	tmp_user_access.flags     = accessp->flags;
	tmp_user_access.num_files = accessp->num_files;
	tmp_user_access.map_size  = accessp->map_size;
	tmp_user_access.num_parents  = accessp->num_parents;

	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
	tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
	tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);

	user_access_structp = &tmp_user_access;
    } else {
	error = EINVAL;
	goto err_exit_bulk_access;
    }

    map_size = user_access_structp->map_size;

    num_files = user_access_structp->num_files;

    num_parents= user_access_structp->num_parents;

    if (num_files < 1) {
	goto err_exit_bulk_access;
    }
    if (num_files > 1024) {
	error = EINVAL;
	goto err_exit_bulk_access;
    }

    if (num_parents > 1024) {
	error = EINVAL;
	goto err_exit_bulk_access;
    }

    file_ids = (int *) kalloc(sizeof(int) * num_files);
    access = (short *) kalloc(sizeof(short) * num_files);
    if (map_size) {
	bitmap = (char *) kalloc(sizeof(char) * map_size);
    }

    if (num_parents) {
	parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
    }

    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);

    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
	if (file_ids) {
	    kfree(file_ids, sizeof(int) * num_files);
	}
	if (bitmap) {
	    kfree(bitmap, sizeof(char) * map_size);
	}
	if (access) {
	    kfree(access, sizeof(short) * num_files);
	}
	if (cache.acache) {
	    kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
	}
	if (cache.haveaccess) {
	    kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
	}
	if (parents) {
	    kfree(parents, sizeof(cnid_t) * num_parents);
	}
	return ENOMEM;
    }

    // make sure the bitmap is zero'ed out...
    if (bitmap) {
	bzero(bitmap, (sizeof(char) * map_size));
    }

    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
		num_files * sizeof(int)))) {
	goto err_exit_bulk_access;
    }

    if (num_parents) {
	if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
		    num_parents * sizeof(cnid_t)))) {
	    goto err_exit_bulk_access;
	}
    }

    flags = user_access_structp->flags;
    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
	flags = R_OK;
    }

    /* check if we've been passed leaf node ids or parent ids */
    if (flags & PARENT_IDS_FLAG) {
	check_leaf = false;
    }

    /* Check access to each file_id passed in */
    for (i = 0; i < num_files; i++) {
	leaf_index=-1;
	cnid = (cnid_t) file_ids[i];

	/* root always has access */
	if ((!parents) && (!suser(cred, NULL))) {
	    access[i] = 0;
	    continue;
	}

	if (check_leaf) {
	    /* do the lookup (checks the cnode hash, then the catalog) */
	    error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
	    if (error) {
		access[i] = (short) error;
		continue;
	    }

	    if (parents) {
		// Check if the leaf matches one of the parent scopes
		leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
 		if (leaf_index >= 0 && parents[leaf_index] == cnid)
 		    prev_parent_check_ok = 0;
 		else if (leaf_index >= 0)
 		    prev_parent_check_ok = 1;
	    }

	    // if the thing has acl's, do the full permission check
	    if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
		struct vnode *cvp;
		int myErr = 0;
		/* get the vnode for this cnid */
		myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
		if ( myErr ) {
		    access[i] = myErr;
		    continue;
		}

		hfs_unlock(VTOC(cvp));

		if (vnode_vtype(cvp) == VDIR) {
		    myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
		} else {
		    myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
		}

		vnode_put(cvp);
		if (myErr) {
		    access[i] = myErr;
		    continue;
		}
	    } else {
		/* before calling CheckAccess(), check the target file for read access */
		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
		    cnattr.ca_mode, hfsmp->hfs_mp, cred, p);

		/* fail fast if no access */
		if ((myPerms & flags) == 0) {
		    access[i] = EACCES;
		    continue;
		}
	    }
	} else {
	    /* we were passed an array of parent ids */
	    catkey.hfsPlus.parentID = cnid;
	}

	/* if the last guy had the same parent and had access, we're done */
 	if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
	    cache.cachehits++;
	    access[i] = 0;
	    continue;
	}

	myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
	    skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);

	if (myaccess || (error == ESRCH && leaf_index != -1)) {
	    access[i] = 0; // have access.. no errors to report
	} else {
	    access[i] = (error != 0 ? (short) error : EACCES);
	}

	prevParent_cnid = catkey.hfsPlus.parentID;
    }

    /* copyout the access array */
    if ((error = copyout((caddr_t)access, user_access_structp->access,
		num_files * sizeof (short)))) {
	goto err_exit_bulk_access;
    }
    if (map_size && bitmap) {
	if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
		    map_size * sizeof (char)))) {
	    goto err_exit_bulk_access;
	}
    }


  err_exit_bulk_access:

    //printf("hfs: on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);

    if (file_ids)
	kfree(file_ids, sizeof(int) * num_files);
    if (parents)
	kfree(parents, sizeof(cnid_t) * num_parents);
    if (bitmap)
	kfree(bitmap, sizeof(char) * map_size);
    if (access)
	kfree(access, sizeof(short) * num_files);
    if (cache.acache)
	kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
    if (cache.haveaccess)
	kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);

    return (error);
}


/* end "bulk-access" support */


/*
 * Callback for use with freeze ioctl.
 */
static int
hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
{
	vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");

	return 0;
}

/*
 * Control filesystem operating characteristics.
 */
int
hfs_vnop_ioctl( struct vnop_ioctl_args /* {
		vnode_t a_vp;
		int  a_command;
		caddr_t  a_data;
		int  a_fflag;
		vfs_context_t a_context;
	} */ *ap)
{
	struct vnode * vp = ap->a_vp;
	struct hfsmount *hfsmp = VTOHFS(vp);
	vfs_context_t context = ap->a_context;
	kauth_cred_t cred = vfs_context_ucred(context);
	proc_t p = vfs_context_proc(context);
	struct vfsstatfs *vfsp;
	boolean_t is64bit;
	off_t jnl_start, jnl_size;
	struct hfs_journal_info *jip;
#if HFS_COMPRESSION
	int compressed = 0;
	off_t uncompressed_size = -1;
	int decmpfs_error = 0;

	if (ap->a_command == F_RDADVISE) {
		/* we need to inspect the decmpfs state of the file as early as possible */
		compressed = hfs_file_is_compressed(VTOC(vp), 0);
		if (compressed) {
			if (VNODE_IS_RSRC(vp)) {
				/* if this is the resource fork, treat it as if it were empty */
				uncompressed_size = 0;
			} else {
				decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
				if (decmpfs_error != 0) {
					/* failed to get the uncompressed size, we'll check for this later */
					uncompressed_size = -1;
				}
			}
		}
	}
#endif /* HFS_COMPRESSION */

	is64bit = proc_is64bit(p);

#if CONFIG_PROTECT
	{
		int error = 0;
		if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
			return error;
		}
	}
#endif /* CONFIG_PROTECT */

	switch (ap->a_command) {

	case HFS_GETPATH:
	{
		struct vnode *file_vp;
		cnid_t  cnid;
		int  outlen;
		char *bufptr;
		int error;

		/* Caller must be owner of file system. */
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);
		}
		/* Target vnode must be file system's root. */
		if (!vnode_isvroot(vp)) {
			return (EINVAL);
		}
		bufptr = (char *)ap->a_data;
		cnid = strtoul(bufptr, NULL, 10);

		/* We need to call hfs_vfs_vget to leverage the code that will
		 * fix the origin list for us if needed, as opposed to calling
		 * hfs_vget, since we will need the parent for build_path call.
		 */

		if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
			return (error);
		}
		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
		vnode_put(file_vp);

		return (error);
	}

	case HFS_PREV_LINK:
	case HFS_NEXT_LINK:
	{
		cnid_t linkfileid;
		cnid_t nextlinkid;
		cnid_t prevlinkid;
		int error;

		/* Caller must be owner of file system. */
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES);
		}
		/* Target vnode must be file system's root. */
		if (!vnode_isvroot(vp)) {
			return (EINVAL);
		}
		linkfileid = *(cnid_t *)ap->a_data;
		if (linkfileid < kHFSFirstUserCatalogNodeID) {
			return (EINVAL);
		}
		if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
			return (error);
		}
		if (ap->a_command == HFS_NEXT_LINK) {
			*(cnid_t *)ap->a_data = nextlinkid;
		} else {
			*(cnid_t *)ap->a_data = prevlinkid;
		}
		return (0);
	}

	case HFS_RESIZE_PROGRESS: {

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}
		if (!vnode_isvroot(vp)) {
			return (EINVAL);
		}
		/* file system must not be mounted read-only */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
	}

	case HFS_RESIZE_VOLUME: {
		u_int64_t newsize;
		u_int64_t cursize;

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}
		if (!vnode_isvroot(vp)) {
			return (EINVAL);
		}

		/* file system must not be mounted read-only */
1718		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1719			return (EROFS);
1720		}
1721		newsize = *(u_int64_t *)ap->a_data;
1722		cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1723
1724		if (newsize > cursize) {
1725			return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1726		} else if (newsize < cursize) {
1727			return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1728		} else {
1729			return (0);
1730		}
1731	}
	case HFS_CHANGE_NEXT_ALLOCATION: {
		int error = 0;		/* Assume success */
		u_int32_t location;

		if (vnode_vfsisrdonly(vp)) {
			return (EROFS);
		}
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}
		if (!vnode_isvroot(vp)) {
			return (EINVAL);
		}
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		location = *(u_int32_t *)ap->a_data;
		if ((location >= hfsmp->allocLimit) &&
			(location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
			error = EINVAL;
			goto fail_change_next_allocation;
		}
		/* Return previous value. */
		*(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
		if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
			/* For the magic location value, set nextAllocation to the
			 * next block after the metadata zone and set a flag in the
			 * mount structure to indicate that nextAllocation should
			 * not be updated again.
			 */
			if (hfsmp->hfs_metazone_end != 0) {
				HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
			}
			hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
		} else {
			hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
			HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
		}
		MarkVCBDirty(hfsmp);
fail_change_next_allocation:
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		return (error);
	}
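
	/*
	 * Hedged userland sketch for the case above: the argument is consumed
	 * and the previous nextAllocation is written back through the same
	 * pointer, so the caller can read the old value after the call.
	 *
	 *	u_int32_t blk = 12345;	// or HFS_NO_UPDATE_NEXT_ALLOCATION
	 *	if (fsctl("/Volumes/MyVol", HFS_CHANGE_NEXT_ALLOCATION, &blk, 0) == 0)
	 *		printf("previous next-allocation block: %u\n", blk);
	 */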

#if HFS_SPARSE_DEV
	case HFS_SETBACKINGSTOREINFO: {
		struct vnode * bsfs_rootvp;
		struct vnode * di_vp;
		struct hfs_backingstoreinfo *bsdata;
		int error = 0;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
			return (EALREADY);
		}
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}
		bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
		if (bsdata == NULL) {
			return (EINVAL);
		}
		if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
			return (error);
		}
		if ((error = vnode_getwithref(di_vp))) {
			file_drop(bsdata->backingfd);
			return (error);
		}

		if (vnode_mount(vp) == vnode_mount(di_vp)) {
			(void)vnode_put(di_vp);
			file_drop(bsdata->backingfd);
			return (EINVAL);
		}

		/*
		 * Obtain the backing fs root vnode and keep a reference
		 * on it.  This reference will be dropped in hfs_unmount.
		 */
		error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
		if (error) {
			(void)vnode_put(di_vp);
			file_drop(bsdata->backingfd);
			return (error);
		}
		vnode_ref(bsfs_rootvp);
		vnode_put(bsfs_rootvp);

		hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;

		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
		/* The free extent cache is managed differently for sparse devices.
		 * There is a window between when the volume is mounted and when the
		 * device is marked as sparse, so the free extent cache for this
		 * volume was initialized as for a normal volume (sorted by block
		 * count).  Reset the cache so that it will be rebuilt for a
		 * sparse device (sorted by start block).
		 */
		ResetVCBFreeExtCache(hfsmp);

		hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
		hfsmp->hfs_sparsebandblks *= 4;

		vfs_markdependency(hfsmp->hfs_mp);

		/*
		 * If the sparse image is on a sparse image file (as opposed to a sparse
		 * bundle), then we may need to limit the free space to the maximum size
		 * of a file on that volume.  So we query (using pathconf), and if we get
		 * a meaningful result, we cache the number of blocks for later use in
		 * hfs_freeblks().
		 */
		hfsmp->hfs_backingfs_maxblocks = 0;
		if (vnode_vtype(di_vp) == VREG) {
			int terr;
			int hostbits;
			terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
			if (terr == 0 && hostbits != 0 && hostbits < 64) {
				u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;

				hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
			}
		}

		(void)vnode_put(di_vp);
		file_drop(bsdata->backingfd);
		return (0);
	}
	case HFS_CLRBACKINGSTOREINFO: {
		struct vnode * tmpvp;

		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
		    hfsmp->hfs_backingfs_rootvp) {

			hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
			tmpvp = hfsmp->hfs_backingfs_rootvp;
			hfsmp->hfs_backingfs_rootvp = NULLVP;
			hfsmp->hfs_sparsebandblks = 0;
			vnode_rele(tmpvp);
		}
		return (0);
	}
#endif /* HFS_SPARSE_DEV */

	/* Change the next CNID stored in the VH */
	case HFS_CHANGE_NEXTCNID: {
		int error = 0;		/* Assume success */
		u_int32_t fileid;
		int wraparound = 0;
		int lockflags = 0;

		if (vnode_vfsisrdonly(vp)) {
			return (EROFS);
		}
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
			kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */
		}

		fileid = *(u_int32_t *)ap->a_data;

		/* Must hold the catalog lock exclusive to advance the CNID pointer */
		lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

		HFS_MOUNT_LOCK(hfsmp, TRUE);

		/* If it is less than the current next CNID, force the wraparound bit to be set */
		if (fileid < hfsmp->vcbNxtCNID) {
			wraparound = 1;
		}

		/* Return previous value. */
		*(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;

		hfsmp->vcbNxtCNID = fileid;

		if (wraparound) {
			hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
		}

		MarkVCBDirty(hfsmp);
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		hfs_systemfile_unlock (hfsmp, lockflags);

		return (error);
	}

	case F_FREEZE_FS: {
		struct mount *mp;

		mp = vnode_mount(vp);
		hfsmp = VFSTOHFS(mp);

		if (!(hfsmp->jnl))
			return (ENOTSUP);

		vfsp = vfs_statfs(mp);

		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
			!kauth_cred_issuser(cred))
			return (EACCES);

		lck_rw_lock_exclusive(&hfsmp->hfs_insync);

		// Flush things before we get started, to try to prevent
		// dirty data from being paged out while we're frozen.
		// Note: we can't do this after taking the lock as it would
		// deadlock against ourselves.
		vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

		// DO NOT call hfs_journal_flush() because that takes a
		// shared lock on the global exclusive lock!
		journal_flush(hfsmp->jnl, TRUE);

		// We don't need to iterate over all vnodes; we just need to
		// wait for writes to the system files and the device vnode.
		//
		// Now that journal flush waits for all metadata blocks to
		// be written out, waiting for btree writes is probably no
		// longer required.
		if (HFSTOVCB(hfsmp)->extentsRefNum)
		    vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->catalogRefNum)
		    vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->allocationsRefNum)
		    vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
		if (hfsmp->hfs_attribute_vp)
		    vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
		vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");

		hfsmp->hfs_freezing_proc = current_proc();

		return (0);
	}

	case F_THAW_FS: {
		vfsp = vfs_statfs(vnode_mount(vp));
		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
			!kauth_cred_issuser(cred))
			return (EACCES);

		// If we're not the one who froze the fs then we
		// can't thaw it.
		if (hfsmp->hfs_freezing_proc != current_proc()) {
		    return EPERM;
		}

		// NOTE: if you add code here, also go check the
		//       code that "thaws" the fs in hfs_vnop_close()
		//
		hfsmp->hfs_freezing_proc = NULL;
		hfs_unlock_global (hfsmp);
		lck_rw_unlock_exclusive(&hfsmp->hfs_insync);

		return (0);
	}
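
	/*
	 * Hedged userland sketch of the freeze/thaw pair above, assuming the
	 * private F_FREEZE_FS/F_THAW_FS fcntl(2) commands are reachable by
	 * the caller.  Only the process that froze the volume may thaw it,
	 * so both calls must come from the same process.
	 *
	 *	int fd = open("/Volumes/MyVol", O_RDONLY);
	 *	if (fd >= 0 && fcntl(fd, F_FREEZE_FS, 0) == 0) {
	 *		// ... take a block-level snapshot of the device ...
	 *		fcntl(fd, F_THAW_FS, 0);
	 *	}
	 *	if (fd >= 0)
	 *		close(fd);
	 */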

	case HFS_BULKACCESS_FSCTL: {
	    int size;

	    if (hfsmp->hfs_flags & HFS_STANDARD) {
		return EINVAL;
	    }

	    if (is64bit) {
		size = sizeof(struct user64_access_t);
	    } else {
		size = sizeof(struct user32_access_t);
	    }

	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
	}

	case HFS_EXT_BULKACCESS_FSCTL: {
	    int size;

	    if (hfsmp->hfs_flags & HFS_STANDARD) {
		return EINVAL;
	    }

	    if (is64bit) {
		size = sizeof(struct user64_ext_access_t);
	    } else {
		size = sizeof(struct user32_ext_access_t);
	    }

	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
	}

	case HFS_SET_XATTREXTENTS_STATE: {
		int state;

		if (ap->a_data == NULL) {
			return (EINVAL);
		}

		state = *(int *)ap->a_data;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		/* Only the superuser can enable or disable extent-based
		 * extended attribute support on a volume.
		 * Note: Starting with Mac OS X 10.7, extent-based extended
		 * attributes are enabled by default, so any change is
		 * transient and lasts only until the volume is remounted.
		 */
		if (!is_suser()) {
			return (EPERM);
		}
		if (state == 0 || state == 1)
			return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
		else
			return (EINVAL);
	}

	case F_SETSTATICCONTENT: {
		int error;
		int enable_static = 0;
		struct cnode *cp = NULL;
		/*
		 * Lock the cnode, decorate the cnode flag, and bail out.
		 * VFS should have already authenticated the caller for us.
		 */

		if (ap->a_data) {
			/*
			 * Note that even though ap->a_data is of type caddr_t,
			 * the fcntl layer at the syscall handler will pass in NULL
			 * or 1 depending on what the argument supplied to the fcntl
			 * was.  So it is in fact correct to check the ap->a_data
			 * argument for a zero or non-zero value when deciding whether
			 * or not to enable the static bit in the cnode.
			 */
			enable_static = 1;
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return EROFS;
		}
		cp = VTOC(vp);

		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK);
		if (error == 0) {
			if (enable_static) {
				cp->c_flag |= C_SSD_STATIC;
			}
			else {
				cp->c_flag &= ~C_SSD_STATIC;
			}
			hfs_unlock (cp);
		}
		return error;
	}
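
	/*
	 * Hedged userland sketch for the case above: per the comment on
	 * ap->a_data, the fcntl layer reduces the caller's argument to a
	 * NULL / non-NULL pointer, so toggling the hint looks like this
	 * (assuming the private command is visible to the caller):
	 *
	 *	fcntl(fd, F_SETSTATICCONTENT, 1);	// mark content as static
	 *	fcntl(fd, F_SETSTATICCONTENT, 0);	// clear the hint
	 */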

	case F_SETBACKINGSTORE: {

		int error = 0;

		/*
		 * See the comment in F_SETSTATICCONTENT re: using
		 * a NULL check for a_data.
		 */
		if (ap->a_data) {
			error = hfs_set_backingstore (vp, 1);
		}
		else {
			error = hfs_set_backingstore (vp, 0);
		}

		return error;
	}

	case F_GETPATH_MTMINFO: {
		int error = 0;

		int *data = (int*) ap->a_data;

		/* Ask if this is a backingstore vnode */
		error = hfs_is_backingstore (vp, data);

		return error;
	}

	case F_FULLFSYNC: {
		int error;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
		if (error == 0) {
			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
			hfs_unlock(VTOC(vp));
		}

		return error;
	}

	case F_CHKCLEAN: {
		register struct cnode *cp;
		int error;

		if (!vnode_isreg(vp))
			return EINVAL;

		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
		if (error == 0) {
			cp = VTOC(vp);
			/*
			 * Used by regression tests to determine whether all
			 * the dirty pages (via write) have been cleaned
			 * after a call to 'fsync'.
			 */
			error = is_file_clean(vp, VTOF(vp)->ff_size);
			hfs_unlock(cp);
		}
		return (error);
	}

	case F_RDADVISE: {
		register struct radvisory *ra;
		struct filefork *fp;
		int error;

		if (!vnode_isreg(vp))
			return EINVAL;

		ra = (struct radvisory *)(ap->a_data);
		fp = VTOF(vp);

		/* Protect against a size change. */
		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK);

#if HFS_COMPRESSION
		if (compressed && (uncompressed_size == -1)) {
			/* fetching the uncompressed size failed above, so return the error */
			error = decmpfs_error;
		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
				   (!compressed && (ra->ra_offset >= fp->ff_size))) {
			error = EFBIG;
		}
#else /* HFS_COMPRESSION */
		if (ra->ra_offset >= fp->ff_size) {
			error = EFBIG;
		}
#endif /* HFS_COMPRESSION */
		else {
			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
		}

		hfs_unlock_truncate(VTOC(vp), 0);
		return (error);
	}

	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
	{
		if (is64bit) {
			*(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
		}
		else {
			*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
		}
		return 0;
	}

	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
	    *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
	    break;

	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
	    *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
	    break;

	case HFS_FSCTL_GET_VERY_LOW_DISK:
	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
	    break;

	case HFS_FSCTL_SET_VERY_LOW_DISK:
	    if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
		return EINVAL;
	    }

	    hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
	    break;

	case HFS_FSCTL_GET_LOW_DISK:
	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
	    break;

	case HFS_FSCTL_SET_LOW_DISK:
	    if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
		|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {

		return EINVAL;
	    }

	    hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
	    break;

	case HFS_FSCTL_GET_DESIRED_DISK:
	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
	    break;

	case HFS_FSCTL_SET_DESIRED_DISK:
	    if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
		return EINVAL;
	    }

	    hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
	    break;
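
	/*
	 * Taken together, the three setters above enforce
	 * dangerlimit < warninglimit < desiredlevel (all in allocation
	 * blocks).  For example, 256 < 1024 < 4096 is accepted, while
	 * attempting to lower the warning limit to 256 against a danger
	 * limit of 256 fails with EINVAL.
	 */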

	case HFS_VOLUME_STATUS:
	    *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
	    break;

	case HFS_SET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return (EINVAL);
		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
			return (EACCES);	/* must be superuser or owner of filesystem */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
		break;

	case HFS_GET_BOOT_INFO:
		if (!vnode_isvroot(vp))
			return (EINVAL);
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		break;

	case HFS_MARK_BOOT_CORRUPT:
		/* Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
		if (!is_suser()) {
			return EACCES;
		}

		/* Allowed only on the root vnode of the boot volume */
		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
		    !vnode_isvroot(vp)) {
			return EINVAL;
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}
		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
		hfs_mark_volume_inconsistent(hfsmp);
		break;

	case HFS_FSCTL_GET_JOURNAL_INFO:
		jip = (struct hfs_journal_info*)ap->a_data;

		if (vp == NULLVP)
			return EINVAL;

		if (hfsmp->jnl == NULL) {
			jnl_start = 0;
			jnl_size  = 0;
		} else {
			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
			jnl_size  = (off_t)hfsmp->jnl_size;
		}

		jip->jstart = jnl_start;
		jip->jsize = jnl_size;
		break;

	case HFS_SET_ALWAYS_ZEROFILL: {
	    struct cnode *cp = VTOC(vp);

	    if (*(int *)ap->a_data) {
		cp->c_flag |= C_ALWAYS_ZEROFILL;
	    } else {
		cp->c_flag &= ~C_ALWAYS_ZEROFILL;
	    }
	    break;
	}

	case HFS_DISABLE_METAZONE: {
		/* Only root can disable the metadata zone */
		if (!is_suser()) {
			return EACCES;
		}
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
			return (EROFS);
		}

		/* Disable the metadata zone now */
		(void) hfs_metadatazone_init(hfsmp, true);
		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
		break;
	}

	default:
		return (ENOTTY);
	}

	return 0;
}

/*
 * select
 */
int
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
	struct vnop_select_args {
		vnode_t a_vp;
		int  a_which;
		int  a_fflags;
		void *a_wql;
		vfs_context_t a_context;
	};
*/
{
	/*
	 * We should really check to see if I/O is possible.
	 */
	return (1);
}

/*
 * Converts a logical block number to a physical block, and optionally returns
 * the number of remaining blocks in a run. The logical block is based on
 * hfsNode.logBlockSize. The physical block number is based on the device
 * block size, currently 512. The block run is returned in logical blocks,
 * and is the REMAINING number of blocks.
 */
int
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
{
	struct filefork *fp = VTOF(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	int  retval = E_NONE;
	u_int32_t  logBlockSize;
	size_t  bytesContAvail = 0;
	off_t  blockposition;
	int lockExtBtree;
	int lockflags = 0;

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (vpp != NULL)
		*vpp = hfsmp->hfs_devvp;
	if (bnp == NULL)
		return (0);

	logBlockSize = GetLogicalBlockSize(vp);
	blockposition = (off_t)bn * logBlockSize;

	lockExtBtree = overflow_extents(fp);

	if (lockExtBtree)
		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

	retval = MacToVFSError(
                            MapFileBlockC (HFSTOVCB(hfsmp),
                                            (FCB*)fp,
                                            MAXPHYSIO,
                                            blockposition,
                                            bnp,
                                            &bytesContAvail));

	if (lockExtBtree)
		hfs_systemfile_unlock(hfsmp, lockflags);

	if (retval == E_NONE) {
		/* Figure out how many read ahead blocks there are */
		if (runp != NULL) {
			if (can_cluster(logBlockSize)) {
				/* Make sure this result never goes negative: */
				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
			} else {
				*runp = 0;
			}
		}
	}
	return (retval);
}
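
/*
 * Worked example of the run calculation above: with a 4K logical block
 * size and MapFileBlockC reporting bytesContAvail == 32768, the caller
 * gets *runp == 7, i.e. seven more contiguous logical blocks follow the
 * block being mapped.
 */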

/*
 * Convert logical block number to file offset.
 */
int
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
		vnode_t a_vp;
		daddr64_t a_lblkno;
		off_t *a_offset;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);
	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

	return (0);
}

/*
 * Convert file offset to logical block number.
 */
int
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		vnode_t a_vp;
		off_t a_offset;
		daddr64_t *a_lblkno;
	};
*/
{
	if (ap->a_vp == NULL)
		return (EINVAL);
	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));

	return (0);
}
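
/*
 * The two conversions above are exact inverses for block-aligned offsets.
 * With a 4K logical block size, for example, blktooff maps block 5 to
 * offset 20480, and offtoblk maps any offset in [20480, 24575] back to
 * block 5, since the division truncates within a block.
 */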

/*
 * Map file offset to physical block number.
 *
 * If this function is called for a write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for a read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of the file is made, and if required, the rangelist is
 * searched for a mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 */
int
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
	struct vnop_blockmap_args {
		vnode_t a_vp;
		off_t a_foffset;
		size_t a_size;
		daddr64_t *a_bpn;
		size_t *a_run;
		void *a_poff;
		int a_flags;
		vfs_context_t a_context;
	};
*/
{
	struct vnode *vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	size_t bytesContAvail = 0;
	int retval = E_NONE;
	int syslocks = 0;
	int lockflags = 0;
	struct rl_entry *invalid_range;
	enum rl_overlaptype overlaptype;
	int started_tr = 0;
	int tooklock = 0;

#if HFS_COMPRESSION
	if (VNODE_IS_RSRC(vp)) {
		/* allow blockmaps to the resource fork */
	} else {
		if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
			int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
			switch(state) {
				case FILE_IS_COMPRESSED:
					return ENOTSUP;
				case FILE_IS_CONVERTING:
					/* if FILE_IS_CONVERTING, we allow blockmap */
					break;
				default:
					printf("invalid state %d for compressed file\n", state);
					/* fall through */
			}
		}
	}
#endif /* HFS_COMPRESSION */

	/* Do not allow blockmap operation on a directory */
	if (vnode_isdir(vp)) {
		return (ENOTSUP);
	}

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (ap->a_bpn == NULL)
		return (0);

	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
		if (VTOC(vp)->c_lockowner != current_thread()) {
			hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
			tooklock = 1;
		}
	}
	hfsmp = VTOHFS(vp);
	cp = VTOC(vp);
	fp = VTOF(vp);

retry:
	/* Check virtual blocks only when performing write operation */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		if (hfs_start_transaction(hfsmp) != 0) {
			retval = EINVAL;
			goto exit;
		} else {
			started_tr = 1;
		}
		syslocks = SFL_EXTENTS | SFL_BITMAP;

	} else if (overflow_extents(fp)) {
		syslocks = SFL_EXTENTS;
	}

	if (syslocks)
		lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);

	/*
	 * Check for any delayed allocations.
	 */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		int64_t actbytes;
		u_int32_t loanedBlocks;

		//
		// Make sure we have a transaction.  It's possible
		// that we came in and fp->ff_unallocblocks was zero
		// but during the time we blocked acquiring the extents
		// btree, ff_unallocblocks became non-zero and so we
		// will need to start a transaction.
		//
		if (started_tr == 0) {
			if (syslocks) {
				hfs_systemfile_unlock(hfsmp, lockflags);
				syslocks = 0;
			}
			goto retry;
		}

		/*
		 * Note: ExtendFileC will release any blocks on loan and
		 * acquire real blocks.  So we ask to extend by zero bytes
		 * since ExtendFileC will account for the virtual blocks.
		 */

		loanedBlocks = fp->ff_unallocblocks;
		retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
				     kEFAllMask | kEFNoClumpMask, &actbytes);

		if (retval) {
			fp->ff_unallocblocks = loanedBlocks;
			cp->c_blocks += loanedBlocks;
			fp->ff_blocks += loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);
			hfsmp->loanedBlocks += loanedBlocks;
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);

			hfs_systemfile_unlock(hfsmp, lockflags);
			cp->c_flag |= C_MODIFIED;
			if (started_tr) {
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

				hfs_end_transaction(hfsmp);
				started_tr = 0;
			}
			goto exit;
		}
	}

	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
	                       ap->a_bpn, &bytesContAvail);
	if (syslocks) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		syslocks = 0;
	}

	if (started_tr) {
		(void) hfs_update(vp, TRUE);
		(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
		hfs_end_transaction(hfsmp);
		started_tr = 0;
	}
	if (retval) {
		/* On write, always return the error because virtual blocks, if any,
		 * should have been allocated in ExtendFileC().  We do not
		 * allocate virtual blocks on read, so we return the error
		 * only if no virtual blocks are allocated.  Otherwise we search
		 * the rangelist for zero-fills.
		 */
		if ((MacToVFSError(retval) != ERANGE) ||
		    (ap->a_flags & VNODE_WRITE) ||
		    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
			goto exit;
		}

		/* Validate that the start offset is within the logical file size */
		if (ap->a_foffset >= fp->ff_size) {
			goto exit;
		}

		/*
		 * At this point, we have encountered a failure during
		 * MapFileBlockC that resulted in ERANGE, and we are not servicing
		 * a write, and there are borrowed blocks.
		 *
		 * However, the cluster layer will not call blockmap for
		 * blocks that are borrowed and in-cache.  We have to assume that
		 * because we observed ERANGE being emitted from MapFileBlockC, this
		 * extent range is not valid on-disk.  So we treat this as a
		 * mapping that needs to be zero-filled prior to reading.
		 *
		 * Note that under certain circumstances (such as non-contiguous
		 * userland VM mappings in the calling process), cluster_io
		 * may be forced to split a large I/O driven by hfs_vnop_write
		 * into multiple sub-I/Os that necessitate a RMW cycle.  If this is
		 * the case here, then we have already removed the invalid range list
		 * mapping prior to getting to this blockmap call, so we should not
		 * search the invalid rangelist for this byte range.
		 */

		bytesContAvail = fp->ff_size - ap->a_foffset;
		/*
		 * Clip the contiguous available bytes to, at most, the allowable
		 * maximum or the amount requested.
		 */

		if (bytesContAvail > ap->a_size) {
			bytesContAvail = ap->a_size;
		}

		*ap->a_bpn = (daddr64_t) -1;
		retval = 0;

		goto exit;
	}

	/* MapFileBlockC() found a valid extent in the filefork.  Search the
	 * mapping information further for invalid file ranges.
	 */
	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
	                      ap->a_foffset + (off_t)bytesContAvail - 1,
	                      &invalid_range);
	if (overlaptype != RL_NOOVERLAP) {
		switch(overlaptype) {
		case RL_MATCHINGOVERLAP:
		case RL_OVERLAPCONTAINSRANGE:
		case RL_OVERLAPSTARTSBEFORE:
			/* There's no valid block for this byte offset */
			*ap->a_bpn = (daddr64_t)-1;
			/* There's no point limiting the amount to be returned
			 * if the invalid range that was hit extends all the way
			 * to the EOF (i.e. there's no valid bytes between the
			 * end of this range and the file's EOF):
			 */
			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
			}
			break;

		case RL_OVERLAPISCONTAINED:
		case RL_OVERLAPENDSAFTER:
			/* The range of interest hits an invalid block before the end: */
			if (invalid_range->rl_start == ap->a_foffset) {
				/* There's actually no valid information to be had starting here: */
				*ap->a_bpn = (daddr64_t)-1;
				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
				}
			} else {
				bytesContAvail = invalid_range->rl_start - ap->a_foffset;
			}
			break;

		case RL_NOOVERLAP:
			break;
		} /* end switch */
		if (bytesContAvail > ap->a_size)
			bytesContAvail = ap->a_size;
	}

exit:
	if (retval == 0) {
		if (ap->a_run)
			*ap->a_run = bytesContAvail;

		if (ap->a_poff)
			*(int *)ap->a_poff = 0;
	}

	if (tooklock)
		hfs_unlock(cp);

	return (MacToVFSError(retval));
}
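
/*
 * Note on the convention above: a *a_bpn of (daddr64_t)-1 together with a
 * zero return tells the caller that the byte range has no valid on-disk
 * backing yet (delayed allocation or an invalid range), so a read of that
 * range is expected to be satisfied with zeroes rather than device I/O.
 */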

/*
 * Prepare and issue the I/O.
 * buf_strategy knows how to deal with
 * requests that require fragmented I/Os.
 */
int
hfs_vnop_strategy(struct vnop_strategy_args *ap)
{
	buf_t	bp = ap->a_bp;
	vnode_t	vp = buf_vnode(bp);
	int error = 0;

	/* Mark buffer as containing static data if cnode flag set */
	if (VTOC(vp)->c_flag & C_SSD_STATIC) {
		buf_markstatic(bp);
	}

#if CONFIG_PROTECT
	cnode_t *cp = NULL;

	if ((cp = cp_get_protected_cnode(vp)) != NULL) {
		/*
		 * We rely upon the truncate lock to protect the
		 * CP cache key from getting tossed prior to our IO finishing here.
		 * Nearly all cluster io calls to manipulate file payload from HFS
		 * take the truncate lock before calling into the cluster
		 * layer to ensure the file size does not change, or that they
		 * have exclusive right to change the EOF of the file.
		 * That same guarantee protects us here since the code that
		 * deals with CP lock events must now take the truncate lock
		 * before doing anything.
		 *
		 * There is one exception here:
		 * 1) VM swapfile IO, because HFS will
		 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
		 * swapfile code only without holding the truncate lock.  This is because
		 * individual swapfiles are maintained at fixed-length sizes by the VM code.
		 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
		 * create our own UPL and thus take the truncate lock before calling
		 * into the cluster layer.  In that case, however, we are not concerned
		 * with the CP blob being wiped out in the middle of the IO
		 * because there isn't anything to toss; the VM swapfile key stays
		 * in-core as long as the file is open.
		 *
		 * NB:
		 * For filesystem resize, we may not have access to the underlying
		 * file's cache key for whatever reason (device may be locked).  However,
		 * we do not need it since we are going to use the temporary HFS-wide resize key
		 * which is generated once we start relocating file content.  If this file's I/O
		 * should be done using the resize key, it will have been supplied already, so
		 * do not attach the file's cp blob to the buffer.
		 */
		if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
			buf_setcpaddr(bp, cp->c_cpentry);
		}
	}
#endif /* CONFIG_PROTECT */

	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);

	return error;
}

static int
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);
	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;

	return 0;
}

int
do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context)
{
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	int retval;
	off_t bytesToAdd;
	off_t actualBytesAdded;
	off_t filebytes;
	u_int32_t fileblocks;
	int blksize;
	struct hfsmount *hfsmp;
	int lockflags;

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

	if (length < 0)
		return (EINVAL);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
		return (EFBIG);

	hfsmp = VTOHFS(vp);

	retval = E_NONE;

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp)))
		return (retval);
#endif /* QUOTA */

	/*
	 * Lengthen the size of the file. We must ensure that the
	 * last byte of the file is allocated. Since the smallest
	 * value of ff_size is 0, length will be at least 1.
	 */
	if (length > (off_t)fp->ff_size) {
#if QUOTA
		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
				   cred, 0);
		if (retval)
			goto Err_Exit;
#endif /* QUOTA */
		/*
		 * If we don't have enough physical space then
		 * we need to extend the physical size.
		 */
		if (length > filebytes) {
			int eflags;
			u_int32_t blockHint = 0;

			/* All or nothing and don't round up to clumpsize. */
			eflags = kEFAllMask | kEFNoClumpMask;

			if (cred && suser(cred, NULL) != 0)
				eflags |= kEFReserveMask;  /* keep a reserve */

			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (filebytes == 0 &&
			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
			    hfs_virtualmetafile(cp)) {
				eflags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			}
			if (hfs_start_transaction(hfsmp) != 0) {
			    retval = EINVAL;
			    goto Err_Exit;
			}

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			while ((length > filebytes) && (retval == E_NONE)) {
				bytesToAdd = length - filebytes;
				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
                                                    (FCB*)fp,
                                                    bytesToAdd,
                                                    blockHint,
                                                    eflags,
                                                    &actualBytesAdded));

				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
				if (actualBytesAdded == 0 && retval == E_NONE) {
					if (length > filebytes)
						length = filebytes;
					break;
				}
			} /* endwhile */

			hfs_systemfile_unlock(hfsmp, lockflags);

			if (hfsmp->jnl) {
				if (skipupdate) {
					(void) hfs_minorupdate(vp);
				}
				else {
					(void) hfs_update(vp, TRUE);
					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
				}
			}

			hfs_end_transaction(hfsmp);

			if (retval)
				goto Err_Exit;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
		}

		if (!(flags & IO_NOZEROFILL)) {
			if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
				struct rl_entry *invalid_range;
				off_t zero_limit;

				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
				if (length < zero_limit) zero_limit = length;

				if (length > (off_t)fp->ff_size) {
					struct timeval tv;

					/* Extending the file: time to fill out the current last page w. zeroes? */
					if ((fp->ff_size & PAGE_MASK_64) &&
					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
					    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {

						/* There's some valid data at the start of the (current) last page
						   of the file, so zero out the remainder of that page to ensure the
						   entire page contains valid data.  Since there is no invalid range
						   possible past the (current) eof, there's no need to remove anything
						   from the invalid range list before calling cluster_write():	*/
						hfs_unlock(cp);
						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
								fp->ff_size, (off_t)0,
								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
						hfs_lock(cp, HFS_FORCE_LOCK);
						if (retval) goto Err_Exit;

						/* Merely invalidate the remaining area, if necessary: */
						if (length > zero_limit) {
							microuptime(&tv);
							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
						}
					} else {
						/* The page containing the (current) eof is invalid: just add the
						   remainder of the page to the invalid list, along with the area
						   being newly allocated:
						 */
						microuptime(&tv);
						rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
						cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
					}
				}
			} else {
				panic("hfs_truncate: invoked on non-UBC object?!");
			}
		}
		cp->c_touch_modtime = TRUE;
		fp->ff_size = length;

	} else { /* Shorten the size of the file */

		if ((off_t)fp->ff_size > length) {
			/* Any space previously marked as invalid is now irrelevant: */
			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
		}

		/*
		 * Account for any unmapped blocks. Note that the new
		 * file length can still end up with unmapped blocks.
		 */
		if (fp->ff_unallocblocks > 0) {
			u_int32_t finalblks;
			u_int32_t loanedBlocks;

			HFS_MOUNT_LOCK(hfsmp, TRUE);

			loanedBlocks = fp->ff_unallocblocks;
			cp->c_blocks -= loanedBlocks;
			fp->ff_blocks -= loanedBlocks;
			fp->ff_unallocblocks = 0;

			hfsmp->loanedBlocks -= loanedBlocks;

			finalblks = (length + blksize - 1) / blksize;
			if (finalblks > fp->ff_blocks) {
				/* calculate required unmapped blocks */
				loanedBlocks = finalblks - fp->ff_blocks;
				hfsmp->loanedBlocks += loanedBlocks;

				fp->ff_unallocblocks = loanedBlocks;
				cp->c_blocks += loanedBlocks;
				fp->ff_blocks += loanedBlocks;
			}
			HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		}

		/*
		 * For a TBE process, deallocation of the file blocks is
		 * delayed until the file is closed, and hfs_close calls
		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
		 * isn't set, we make sure this isn't a TBE process.
		 */
		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
#if QUOTA
			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
#endif /* QUOTA */
			if (hfs_start_transaction(hfsmp) != 0) {
				retval = EINVAL;
				goto Err_Exit;
			}

			if (fp->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(fp))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
													 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (hfsmp->jnl) {
				if (retval == 0) {
					fp->ff_size = length;
				}
				if (skipupdate) {
					(void) hfs_minorupdate(vp);
				}
				else {
					(void) hfs_update(vp, TRUE);
					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
				}
			}
			hfs_end_transaction(hfsmp);

			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
			if (retval)
				goto Err_Exit;
#if QUOTA
			/* These are bytes released */
			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
#endif /* QUOTA */
		}
		/* Only set the update flag if the logical length changes */
		if ((off_t)fp->ff_size != length)
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;
	}
	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		if (!vfs_context_issuser(context)) {
			cp->c_mode &= ~(S_ISUID | S_ISGID);
			skipupdate = 0;
		}
	}
	if (skipupdate) {
		retval = hfs_minorupdate(vp);
	}
	else {
		cp->c_touch_chgtime = TRUE;	/* status changed */
		cp->c_touch_modtime = TRUE;	/* file data was modified */
		retval = hfs_update(vp, MNT_WAIT);
	}
	if (retval) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
		     -1, -1, -1, retval, 0);
	}

Err_Exit:

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);

	return (retval);
}

/*
 * Preparation which must be done prior to deleting the catalog record
 * of a file or directory.  In order to make the on-disk state as safe as
 * possible, we remove the catalog entry before releasing the bitmap blocks
 * and the overflow extent records.  However, some work must be done prior
 * to deleting the catalog record.
 *
 * When calling this function, the cnode must exist both in memory and on-disk.
 * If there are both resource fork and data fork vnodes, this function should
 * be called on both.
 */

int
hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {

	struct filefork *fp = VTOF(vp);
	struct cnode *cp = VTOC(vp);
#if QUOTA
	int retval = 0;
#endif /* QUOTA */

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}

	/*
	 * See the comment below in hfs_truncate for why we need to call
	 * ubc_setsize here.  Essentially we want to avoid pending IO if we
	 * already know that the blocks are going to be released here.
	 * This function is only called when totally removing all storage for
	 * a file, so we can take a shortcut and immediately call
	 * ubc_setsize(vp, 0).
	 */
	ubc_setsize(vp, 0);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)
		return (EINVAL);

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
#if QUOTA
	if ((retval = hfs_getinoquota(cp))) {
		return (retval);
	}
#endif /* QUOTA */

	/* Wipe out any invalid ranges which have yet to be backed by disk */
	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);

	/*
	 * Account for any unmapped blocks. Since we're deleting the
	 * entire file, we don't have to worry about just shrinking
	 * to a smaller number of borrowed blocks.
	 */
	if (fp->ff_unallocblocks > 0) {
		u_int32_t loanedBlocks;

		HFS_MOUNT_LOCK(hfsmp, TRUE);

		loanedBlocks = fp->ff_unallocblocks;
		cp->c_blocks -= loanedBlocks;
		fp->ff_blocks -= loanedBlocks;
		fp->ff_unallocblocks = 0;

		hfsmp->loanedBlocks -= loanedBlocks;

		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	return 0;
}


/*
 * Special wrapper around calling TruncateFileC.  This function is usable
 * even when the catalog record does not exist any longer, making it ideal
 * for use when deleting a file.  The simplification here is that we know
 * that we are releasing all blocks.
 *
 * Note that this function may be called when there is no vnode backing
 * the file fork in question.  We may call this from hfs_vnop_inactive
 * to clear out resource fork data (and may not want to clear out the data
 * fork yet).  As a result, we pointer-check both sets of inputs before
 * doing anything with them.
 *
 * The caller is responsible for saving off a copy of the filefork(s)
 * embedded within the cnode prior to calling this function.  The pointers
 * supplied as arguments must be valid even if the cnode is no longer valid.
 */

int
hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
					 struct filefork *rsrcfork, u_int32_t fileid) {

	off_t filebytes;
	u_int32_t fileblocks;
	int blksize = 0;
	int error = 0;
	int lockflags;

	blksize = hfsmp->blockSize;

	/* Data Fork */
	if ((datafork != NULL) && (datafork->ff_blocks > 0)) {
		fileblocks = datafork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = 0;
			}

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {
				error = EINVAL;
				break;
			}

			if (datafork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(datafork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (error == 0) {
				datafork->ff_size = filebytes;
			}
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);

			if (error) {
				break;
			}
		}
	}

	/* Resource fork */
	if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
		fileblocks = rsrcfork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
				filebytes -= HFS_BIGFILE_SIZE;
			} else {
				filebytes = 0;
			}

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {
				error = EINVAL;
				break;
			}

			if (rsrcfork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(rsrcfork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			}
			if (error == 0) {
				rsrcfork->ff_size = filebytes;
			}
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);

			if (error) {
				break;
			}
		}
	}

	return error;
}
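
/*
 * Hedged sketch of the intended calling pattern, derived from the comments
 * above (the real callers live elsewhere, e.g. in the file-deletion path,
 * and the exact cnode field names here are illustrative):
 *
 *	struct filefork dfork = *VTOF(vp);	// save copies first
 *	struct filefork rfork = *VTOC(vp)->c_rsrcfork;
 *	hfs_prepare_release_storage(hfsmp, vp);	// and on the rsrc vnode, if any
 *	// ... delete the catalog record ...
 *	hfs_release_storage(hfsmp, &dfork, &rfork, fileid);
 */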


/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 * disk blocks.
 */
int
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
             int skipupdate, vfs_context_t context)
{
	struct filefork *fp = VTOF(vp);
	off_t filebytes;
	u_int32_t fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
		return (EISDIR);
	}
	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {
		return (EPERM);
	}

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	// Even if skipsetsize is set, if the length is zero we
	// want to call ubc_setsize() because as of SnowLeopard
	// it will no longer cause any page-ins and it will drop
	// any dirty pages so that we don't do any i/o that we
	// don't have to.  This also prevents a race where i/o
	// for truncated blocks may overwrite later data if the
	// blocks get reallocated to a different file.
	//
	if (!skipsetsize || length == 0)
		ubc_setsize(vp, length);

	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.

	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
		    		filebytes -= HFS_BIGFILE_SIZE;
			} else {
		    		filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			} else {
				filebytes = length;
			}
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, skipupdate, context);
			if (error)
				break;
		}
	} else /* Same logical size */ {

		error = do_hfs_truncate(vp, length, flags, skipupdate, context);
	}
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
	}

	return (error);
}
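
/*
 * Worked example of the chunking above: with overflow extents present,
 * shrinking a file of 3 * HFS_BIGFILE_SIZE bytes to zero issues three
 * do_hfs_truncate calls (to 2x, to 1x, and finally to 0), so that no
 * single journal transaction grows unboundedly.
 */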
3425
3426
3427
3428/*
3429 * Preallocate file storage space.
3430 */
3431int
3432hfs_vnop_allocate(struct vnop_allocate_args /* {
3433		vnode_t a_vp;
3434		off_t a_length;
3435		u_int32_t  a_flags;
3436		off_t *a_bytesallocated;
3437		off_t a_offset;
3438		vfs_context_t a_context;
3439	} */ *ap)
3440{
3441	struct vnode *vp = ap->a_vp;
3442	struct cnode *cp;
3443	struct filefork *fp;
3444	ExtendedVCB *vcb;
3445	off_t length = ap->a_length;
3446	off_t startingPEOF;
3447	off_t moreBytesRequested;
3448	off_t actualBytesAdded;
3449	off_t filebytes;
3450	u_int32_t fileblocks;
3451	int retval, retval2;
3452	u_int32_t blockHint;
3453	u_int32_t extendFlags;   /* For call to ExtendFileC */
3454	struct hfsmount *hfsmp;
3455	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
3456	int lockflags;
3457	time_t orig_ctime;
3458
3459	*(ap->a_bytesallocated) = 0;
3460
3461	if (!vnode_isreg(vp))
3462		return (EISDIR);
3463	if (length < (off_t)0)
3464		return (EINVAL);
3465
3466	cp = VTOC(vp);
3467
3468	orig_ctime = VTOC(vp)->c_ctime;
3469
3470	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
3471
3472	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
3473
3474	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
3475		goto Err_Exit;
3476	}
3477
3478	fp = VTOF(vp);
3479	hfsmp = VTOHFS(vp);
3480	vcb = VTOVCB(vp);
3481
3482	fileblocks = fp->ff_blocks;
3483	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
3484
3485	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
3486		retval = EINVAL;
3487		goto Err_Exit;
3488	}
3489
3490	/* Fill in the flags word for the call to Extend the file */
3491
3492	extendFlags = kEFNoClumpMask;
3493	if (ap->a_flags & ALLOCATECONTIG)
3494		extendFlags |= kEFContigMask;
3495	if (ap->a_flags & ALLOCATEALL)
3496		extendFlags |= kEFAllMask;
3497	if (cred && suser(cred, NULL) != 0)
3498		extendFlags |= kEFReserveMask;
3499	if (hfs_virtualmetafile(cp))
3500		extendFlags |= kEFMetadataMask;
3501
3502	retval = E_NONE;
3503	blockHint = 0;
3504	startingPEOF = filebytes;
3505
3506	if (ap->a_flags & ALLOCATEFROMPEOF)
3507		length += filebytes;
3508	else if (ap->a_flags & ALLOCATEFROMVOL)
3509		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
3510
3511	/* If no changes are necesary, then we're done */
3512	if (filebytes == length)
3513		goto Std_Exit;
3514
3515	/*
3516	 * Lengthen the size of the file. We must ensure that the
3517	 * last byte of the file is allocated. Since the smallest
3518	 * value of filebytes is 0, length will be at least 1.
3519	 */
3520	if (length > filebytes) {
3521		off_t total_bytes_added = 0, orig_request_size;
3522
3523		orig_request_size = moreBytesRequested = length - filebytes;
3524
3525#if QUOTA
3526		retval = hfs_chkdq(cp,
3527				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
3528				cred, 0);
3529		if (retval)
3530			goto Err_Exit;
3531
3532#endif /* QUOTA */
3533		/*
3534		 * Metadata zone checks.
3535		 */
3536		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
3537			/*
3538			 * Allocate Journal and Quota files in metadata zone.
3539			 */
3540			if (hfs_virtualmetafile(cp)) {
3541				blockHint = hfsmp->hfs_metazone_start;
3542			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
3543				   (blockHint <= hfsmp->hfs_metazone_end)) {
3544				/*
3545				 * Move blockHint outside metadata zone.
3546				 */
3547				blockHint = hfsmp->hfs_metazone_end + 1;
3548			}
3549		}
3550
3551
		while ((length > filebytes) && (retval == E_NONE)) {
		    off_t bytesRequested;

		    if (hfs_start_transaction(hfsmp) != 0) {
			retval = EINVAL;
			goto Err_Exit;
		    }

		    /* Protect extents b-tree and allocation bitmap */
		    lockflags = SFL_BITMAP;
		    if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

		    if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
			bytesRequested = HFS_BIGFILE_SIZE;
		    } else {
			bytesRequested = moreBytesRequested;
		    }

		    if (extendFlags & kEFContigMask) {
			    // if we're on a sparse device, this will force it to do a
			    // full scan to find the space needed.
			    hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
		    }

		    retval = MacToVFSError(ExtendFileC(vcb,
						(FCB*)fp,
						bytesRequested,
						blockHint,
						extendFlags,
						&actualBytesAdded));

		    if (retval == E_NONE) {
			*(ap->a_bytesallocated) += actualBytesAdded;
			total_bytes_added += actualBytesAdded;
			moreBytesRequested -= actualBytesAdded;
			if (blockHint != 0) {
			    blockHint += actualBytesAdded / vcb->blockSize;
			}
		    }
		    filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		    hfs_systemfile_unlock(hfsmp, lockflags);

		    if (hfsmp->jnl) {
			(void) hfs_update(vp, TRUE);
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
		    }

		    hfs_end_transaction(hfsmp);
		}


		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes))
			goto Err_Exit;

		/*
		 * Adjust actualBytesAdded to be allocation block aligned, not
		 * clump size aligned.
		 * NOTE: So what we are reporting does not affect reality
		 * until the file is closed, when we truncate the file to allocation
		 * block size.
		 */
		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
			*(ap->a_bytesallocated) =
				roundup(orig_request_size, (off_t)vcb->blockSize);

	} else { /* Shorten the size of the file */

		if (fp->ff_size > length) {
			/*
			 * Any buffers that are past the truncation point need to be
			 * invalidated (to maintain buffer cache consistency).
			 */
		}

		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes)) goto Err_Exit;
#if QUOTA
		/* These are bytes released */
		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED, 0);
#endif /* QUOTA */

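		/*
		 * If the logical EOF now lies beyond the (shrunken) allocation,
		 * pull it back.  ubc_setsize() can block in the VM layer, so
		 * the cnode lock is dropped around the call, presumably to
		 * avoid a lock-order issue with the paging path, and retaken
		 * afterwards.
		 */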
		if (fp->ff_size > filebytes) {
			fp->ff_size = filebytes;

			hfs_unlock(cp);
			ubc_setsize(vp, fp->ff_size);
			hfs_lock(cp, HFS_FORCE_LOCK);
		}
	}

Std_Exit:
	cp->c_touch_chgtime = TRUE;
	cp->c_touch_modtime = TRUE;
	retval2 = hfs_update(vp, MNT_WAIT);

	if (retval == 0)
		retval = retval2;
Err_Exit:
	hfs_unlock_truncate(cp, 0);
	hfs_unlock(cp);
	return (retval);
}


/*
 * Pagein for HFS filesystem
 */
int
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
	   	vnode_t a_vp,
	   	upl_t 	      a_pl,
		vm_offset_t   a_pl_offset,
		off_t         a_f_offset,
		size_t        a_size,
		int           a_flags
		vfs_context_t a_context;
	};
*/
{
	vnode_t 	vp;
	struct cnode	*cp;
	struct filefork *fp;
	int		error = 0;
	upl_t 		upl;
	upl_page_info_t	*pl;
	off_t		f_offset;
	int		offset;
	int		isize;
	int		pg_index;
	boolean_t	truncate_lock_held = FALSE;
	boolean_t 	file_converted = FALSE;
	kern_return_t	kret;

	vp = ap->a_vp;
	cp = VTOC(vp);
	fp = VTOF(vp);

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
		return error;
	}
#endif /* CONFIG_PROTECT */

	if (ap->a_pl != NULL) {
		/*
		 * this can only happen for swap files now that
		 * we're asking for V2 paging behavior...
		 * so we don't need to worry about decompression, or
		 * keeping track of blocks read or taking the truncate lock
		 */
		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
		goto pagein_done;
	}

retry_pagein:
	/*
	 * take truncate lock (shared/recursive) to guard against
	 * zero-fill thru fsync interfering, but only for v2
	 *
	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
	 * lock shared and we are allowed to recurse 1 level if this thread already
	 * owns the lock exclusively... this can legally occur
	 * if we are doing a shrinking ftruncate against a file
	 * that is mapped private, and the pages being truncated
	 * do not currently exist in the cache... in that case
	 * we will have to page-in the missing pages in order
	 * to provide them to the private mapping... we must
	 * also call hfs_unlock_truncate with a positive been_recursed
	 * arg to indicate that if we have recursed, there is no need to drop
	 * the lock.  Allowing this simple recursion is necessary
	 * in order to avoid a certain deadlock... since the ftruncate
	 * already holds the truncate lock exclusively, if we try
	 * to acquire it shared to protect the pagein path, we will
	 * hang this thread
	 *
	 * NOTE: The if () block below is a workaround in order to prevent a
	 * VM deadlock. See rdar://7853471.
	 *
	 * If we are in a forced unmount, then launchd will still have the
	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
	 * take the truncate lock here to service a page fault, then our
	 * thread could deadlock with the forced-unmount.  The forced unmount
	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
	 * thread will think it needs to copy all of the data out of the file
	 * and into a VM copy object.  If we hold the cnode lock here, then that
	 * VM operation will not be able to proceed, because we'll set a busy page
	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
	 * call ubc_setsize" because doing that would just shift the problem to the
	 * ubc_msync done before the vnode is reclaimed.
	 *
	 * So, if a forced unmount on this volume is in flight AND the cnode is
	 * marked C_DELETED, then just go ahead and do the page in without taking
	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
	 * that is not going to be available on the next mount, this seems like an
	 * OK solution from a correctness point of view, even though it is hacky.
	 */
	if (vfs_isforce(vp->v_mount)) {
		if (cp->c_flag & C_DELETED) {
			/* If we don't get it, then just go ahead and operate without the lock */
			truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK);
		}
	}
	else {
		hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
		truncate_lock_held = TRUE;
	}

	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);

	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
		error = EINVAL;
		goto pagein_done;
	}
	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);

	isize = ap->a_size;

	/*
	 * Scan from the back to find the last page in the UPL, so that we
	 * aren't looking at a UPL that may have already been freed by the
	 * preceding aborts/completions.
	 */
	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
		if (upl_page_present(pl, --pg_index))
			break;
		if (pg_index == 0) {
			/*
			 * no absent pages were found in the range specified
			 * just abort the UPL to get rid of it and then we're done
			 */
			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
			goto pagein_done;
		}
	}
	/*
	 * initialize the offset variables before we touch the UPL.
	 * f_offset is the position into the file, in bytes
	 * offset is the position into the UPL, in bytes
	 * pg_index is the pg# of the UPL we're operating on
	 * isize is the offset into the UPL of the last page that is present.
	 */
	isize = ((pg_index + 1) * PAGE_SIZE);
	pg_index = 0;
	offset = 0;
	f_offset = ap->a_f_offset;

	while (isize) {
		int  xsize;
		int  num_of_pages;

		if ( !upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_ABSENT, so it's possible
			 * to get back empty slots in the UPL.
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		/*
		 * We know that we have at least one absent page.
		 * Now check to see how many in a row we have
		 */
		num_of_pages = 1;
		xsize = isize - PAGE_SIZE;

		while (xsize) {
			if ( !upl_page_present(pl, pg_index + num_of_pages))
				break;
			num_of_pages++;
			xsize -= PAGE_SIZE;
		}
		xsize = num_of_pages * PAGE_SIZE;

#if HFS_COMPRESSION
		if (VNODE_IS_RSRC(vp)) {
			/* allow pageins of the resource fork */
		} else {
			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

			if (compressed) {
				if (truncate_lock_held) {
					/*
					 * can't hold the truncate lock when calling into the decmpfs layer
					 * since it calls back into this layer... even though we're only
					 * holding the lock in shared mode, and the re-entrant path only
					 * takes the lock shared, we can deadlock if some other thread
					 * tries to grab the lock exclusively in between.
					 */
					hfs_unlock_truncate(cp, 1);
					truncate_lock_held = FALSE;
				}
				ap->a_pl = upl;
				ap->a_pl_offset = offset;
				ap->a_f_offset = f_offset;
				ap->a_size = xsize;

				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
				/*
				 * note that decmpfs_pagein_compressed can change the state of
				 * 'compressed'... it will set it to 0 if the file is no longer
				 * compressed once the compression lock is successfully taken
				 * i.e. we would block on that lock while the file is being inflated
				 */
				if (compressed) {
					if (error == 0) {
						/* successful page-in, update the access time */
						VTOC(vp)->c_touch_acctime = TRUE;

						/* compressed files are not hot file candidates */
						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
							fp->ff_bytesread = 0;
						}
					} else if (error == EAGAIN) {
						/*
						 * EAGAIN indicates someone else already holds the compression lock...
						 * to avoid deadlocking, we'll abort this range of pages with an
						 * indication that the pagein needs to be redriven
						 */
						ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
					}
					goto pagein_next_range;
				}
				else {
					/*
					 * Set file_converted only if the file became decompressed while we were
					 * paging in.  If it were still compressed, we would re-start the loop using the goto
					 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
					 * condition below, since we could have avoided taking the truncate lock to prevent
					 * a deadlock in the force unmount case.
					 */
					file_converted = TRUE;
				}
			}
			if (file_converted == TRUE) {
				/*
				 * the file was converted back to a regular file after we first saw it as compressed
				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
				 * reset a_size so that we consider what remains of the original request
				 * and null out a_upl and a_pl_offset.
				 *
				 * We should only be able to get into this block if decmpfs_pagein_compressed
				 * successfully decompressed the range in question for this file.
				 */
				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);

				ap->a_size = isize;
				ap->a_pl = NULL;
				ap->a_pl_offset = 0;

				/* Reset file_converted back to false so that we don't infinite-loop. */
				file_converted = FALSE;
				goto retry_pagein;
			}
		}
#endif
		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);

		/*
		 * Keep track of blocks read.
		 */
		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
			int bytesread;
			int took_cnode_lock = 0;

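			/*
			 * For a file smaller than one page, only its actual
			 * size counts toward the hot-file read total;
			 * otherwise count the full extent of this pagein.
			 */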
			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
				bytesread = fp->ff_size;
			else
				bytesread = xsize;

			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
				hfs_lock(cp, HFS_FORCE_LOCK);
				took_cnode_lock = 1;
			}
			/*
			 * If this file hasn't been seen since the start of
			 * the current sampling period then start over.
			 */
			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
				struct timeval tv;

				fp->ff_bytesread = bytesread;
				microtime(&tv);
				cp->c_atime = tv.tv_sec;
			} else {
				fp->ff_bytesread += bytesread;
			}
			cp->c_touch_acctime = TRUE;
			if (took_cnode_lock)
				hfs_unlock(cp);
		}
pagein_next_range:
		f_offset += xsize;
		offset   += xsize;
		isize    -= xsize;
		pg_index += num_of_pages;

		error = 0;
	}

pagein_done:
	if (truncate_lock_held == TRUE) {
		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
		hfs_unlock_truncate(cp, 1);
	}

	return (error);
}

/*
 * Pageout for HFS filesystem.
 */
int
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
	struct vnop_pageout_args {
	   vnode_t a_vp,
	   upl_t         a_pl,
	   vm_offset_t   a_pl_offset,
	   off_t         a_f_offset,
	   size_t        a_size,
	   int           a_flags
	   vfs_context_t a_context;
	};
*/
{
	vnode_t vp = ap->a_vp;
	struct cnode *cp;
	struct filefork *fp;
	int retval = 0;
	off_t filesize;
	upl_t 		upl;
	upl_page_info_t* pl;
	vm_offset_t	a_pl_offset;
	int		a_flags;
	int is_pageoutv2 = 0;
	kern_return_t kret;

	cp = VTOC(vp);
	fp = VTOF(vp);

	/*
	 * Figure out where the file ends, for pageout purposes.  If
	 * ff_new_size > ff_size, then we're in the middle of extending the
	 * file via a write, so it is safe (and necessary) that we be able
	 * to pageout up to that point.
	 */
	filesize = fp->ff_size;
	if (fp->ff_new_size > filesize)
		filesize = fp->ff_new_size;

	a_flags = ap->a_flags;
	a_pl_offset = ap->a_pl_offset;

	/*
	 * we can tell if we're getting the new or old behavior from the UPL
	 */
	if ((upl = ap->a_pl) == NULL) {
		int request_flags;

		is_pageoutv2 = 1;
		/*
		 * we're in control of any UPL we commit
		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
		 */
		a_flags &= ~UPL_NOCOMMIT;
		a_pl_offset = 0;

		/*
		 * For V2 semantics, we want to take the cnode truncate lock
		 * shared to guard against the file size changing via zero-filling.
		 *
		 * However, we have to be careful because we may be invoked
		 * via the ubc_msync path to write out dirty mmap'd pages
		 * in response to a lock event on a content-protected
		 * filesystem (e.g. to write out class A files).
		 * As a result, we want to take the truncate lock 'SHARED' with
		 * the mini-recursion locktype so that we don't deadlock/panic
		 * because we may be already holding the truncate lock exclusive to force any other
		 * IOs to have blocked behind us.
		 */
		hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);

		if (a_flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		}
		else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);

		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
			retval = EINVAL;
			goto pageout_done;
		}
	}
	/*
	 * from this point forward upl points at the UPL we're working with
	 * it was either passed in or we successfully created it
	 */

	/*
	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
	 * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
	 * lock in HFS so that we don't lock invert ourselves.
	 *
	 * Note that we can still get into this function on behalf of the default pager with
	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
	 * since fsync and other writing threads will grab the locks, then mark the
	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
	 * by the paging/VM system.
	 */

	if (is_pageoutv2) {
		off_t f_offset;
		int offset;
		int isize;
		int pg_index;
		int error;
		int error_ret = 0;

		isize = ap->a_size;
		f_offset = ap->a_f_offset;

		/*
		 * Scan from the back to find the last page in the UPL, so that we
		 * aren't looking at a UPL that may have already been freed by the
		 * preceding aborts/completions.
		 */
		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
			if (upl_page_present(pl, --pg_index))
				break;
			if (pg_index == 0) {
				ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
				goto pageout_done;
			}
		}

		/*
		 * initialize the offset variables before we touch the UPL.
		 * a_f_offset is the position into the file, in bytes
		 * offset is the position into the UPL, in bytes
		 * pg_index is the pg# of the UPL we're operating on.
		 * isize is the offset into the UPL of the last non-clean page.
		 */
		isize = ((pg_index + 1) * PAGE_SIZE);

		offset = 0;
		pg_index = 0;

		while (isize) {
			int  xsize;
			int  num_of_pages;

			if ( !upl_page_present(pl, pg_index)) {
				/*
				 * we asked for RET_ONLY_DIRTY, so it's possible
				 * to get back empty slots in the UPL.
				 * just skip over them
				 */
				f_offset += PAGE_SIZE;
				offset   += PAGE_SIZE;
				isize    -= PAGE_SIZE;
				pg_index++;

				continue;
			}
			if ( !upl_dirty_page(pl, pg_index)) {
				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
			}

			/*
			 * We know that we have at least one dirty page.
			 * Now check to see how many in a row we have
			 */
			num_of_pages = 1;
			xsize = isize - PAGE_SIZE;

			while (xsize) {
				if ( !upl_dirty_page(pl, pg_index + num_of_pages))
					break;
				num_of_pages++;
				xsize -= PAGE_SIZE;
			}
			xsize = num_of_pages * PAGE_SIZE;

			if (!vnode_isswap(vp)) {
				off_t end_of_range;
				int tooklock;

				tooklock = 0;

				if (cp->c_lockowner != current_thread()) {
					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
						/*
						 * we're in the v2 path, so we are the
						 * owner of the UPL... we may have already
						 * processed some of the UPL, so abort it
						 * from the current working offset to the
						 * end of the UPL
						 */
						ubc_upl_abort_range(upl,
								    offset,
								    ap->a_size - offset,
								    UPL_ABORT_FREE_ON_EMPTY);
						goto pageout_done;
					}
					tooklock = 1;
				}
				end_of_range = f_offset + xsize - 1;

				if (end_of_range >= filesize) {
					end_of_range = (off_t)(filesize - 1);
				}
				if (f_offset < filesize) {
					rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
					cp->c_flag |= C_MODIFIED;  /* leof is dirty */
				}
				if (tooklock) {
					hfs_unlock(cp);
				}
			}
			if ((error = cluster_pageout(vp, upl, offset, f_offset,
							xsize, filesize, a_flags))) {
				if (error_ret == 0)
					error_ret = error;
			}
			f_offset += xsize;
			offset   += xsize;
			isize    -= xsize;
			pg_index += num_of_pages;
		}
		/* capture errnos bubbled out of cluster_pageout if they occurred */
		if (error_ret != 0) {
			retval = error_ret;
		}
	} /* end block for v2 pageout behavior */
	else {
		if (!vnode_isswap(vp)) {
			off_t end_of_range;
			int tooklock = 0;

			if (cp->c_lockowner != current_thread()) {
				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
					if (!(a_flags & UPL_NOCOMMIT)) {
						ubc_upl_abort_range(upl,
								    a_pl_offset,
								    ap->a_size,
								    UPL_ABORT_FREE_ON_EMPTY);
					}
					goto pageout_done;
				}
				tooklock = 1;
			}
			end_of_range = ap->a_f_offset + ap->a_size - 1;

			if (end_of_range >= filesize) {
				end_of_range = (off_t)(filesize - 1);
			}
			if (ap->a_f_offset < filesize) {
				rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
				cp->c_flag |= C_MODIFIED;  /* leof is dirty */
			}

			if (tooklock) {
				hfs_unlock(cp);
			}
		}
		/*
		 * just call cluster_pageout for old pre-v2 behavior
		 */
		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
				ap->a_size, filesize, a_flags);
	}

	/*
	 * If data was written, update the modification time of the file.
	 * If setuid or setgid bits are set and this process is not the
	 * superuser then clear the setuid and setgid bits as a precaution
	 * against tampering.
	 */
	if (retval == 0) {
		cp->c_touch_modtime = TRUE;
		cp->c_touch_chgtime = TRUE;
		if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
		    (vfs_context_suser(ap->a_context) != 0)) {
			hfs_lock(cp, HFS_FORCE_LOCK);
			cp->c_mode &= ~(S_ISUID | S_ISGID);
			hfs_unlock(cp);
		}
	}

pageout_done:
	if (is_pageoutv2) {
		/*
		 * Release the truncate lock.  Note that because
		 * we may have taken the lock recursively by
		 * being invoked via ubc_msync due to lockdown,
		 * we should release it recursively, too.
		 */
		hfs_unlock_truncate(cp, 1);
	}
	return (retval);
}

/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
	int retval = 0;
	register struct buf *bp = ap->a_bp;
	register struct vnode *vp = buf_vnode(bp);
	BlockDescriptor block;

	/* Trap B-Tree writes */
	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
	    (vp == VTOHFS(vp)->hfc_filevp)) {

		/*
		 * Swap and validate the node if it is in native byte order.
		 * This is always true on big endian, so we always validate
		 * before writing here.  On little endian, the node typically has
		 * been swapped and validated when it was written to the journal,
		 * so we won't do anything here.
		 */
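		/*
		 * The last two bytes of a B-tree node hold the offset of the
		 * node's first record, which always equals
		 * sizeof(BTNodeDescriptor), i.e. 14 (0x000E).  Reading 0x000E
		 * here in host byte order therefore means the node has not
		 * been swapped to big endian yet.
		 */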
		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
			/* Prepare the block pointer */
			block.blockHeader = bp;
			block.buffer = (char *)buf_dataptr(bp);
			block.blockNum = buf_lblkno(bp);
			/* not found in cache ==> came from disk */
			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
			block.blockSize = buf_count(bp);

			/* Endian un-swap B-Tree node */
			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
			if (retval)
				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
		}
	}

	/* This buffer shouldn't be locked anymore but if it is clear it */
	if ((buf_flags(bp) & B_LOCKED)) {
		// XXXdbg
		if (VTOHFS(vp)->jnl) {
			panic("hfs: CLEARING the lock bit on bp %p\n", bp);
		}
		buf_clearflags(bp, B_LOCKED);
	}
	retval = vn_bwrite (ap);

	return (retval);
}

/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------
 * |///////////////|
 * -----------------
 * 0               N (file offset)
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 * 0               N     N+1             2N
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 * 0               N     N+1             2N
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *                       0               N
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
int
hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
	struct  proc *p)
{
	struct  cnode *cp;
	struct  filefork *fp;
	struct  hfsmount *hfsmp;
	u_int32_t  headblks;
	u_int32_t  datablks;
	u_int32_t  blksize;
	u_int32_t  growsize;
	u_int32_t  nextallocsave;
	daddr64_t  sector_a,  sector_b;
	int eflags;
	off_t  newbytes;
	int  retval;
	int lockflags = 0;
	int took_trunc_lock = 0;
	int started_tr = 0;
	enum vtype vnodetype;

	vnodetype = vnode_vtype(vp);
	if (vnodetype != VREG && vnodetype != VLNK) {
		return (EPERM);
	}

	hfsmp = VTOHFS(vp);
	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
		return (ENOSPC);
	}

	cp = VTOC(vp);
	fp = VTOF(vp);
	if (fp->ff_unallocblocks)
		return (EINVAL);

#if CONFIG_PROTECT
	/*
	 * <rdar://problem/9118426>
	 * Disable HFS file relocation on content-protected filesystems
	 */
	if (cp_fs_protected (hfsmp->hfs_mp)) {
		return EINVAL;
	}
#endif
	/* If it's an SSD, also disable HFS relocation */
	if (hfsmp->hfs_flags & HFS_SSD) {
		return EINVAL;
	}


	blksize = hfsmp->blockSize;
	if (blockHint == 0)
		blockHint = hfsmp->nextAllocation;

	if ((fp->ff_size > 0x7fffffff) ||
	    ((fp->ff_size > blksize) && vnodetype == VLNK)) {
		return (EFBIG);
	}

	//
	// We do not believe that this call to hfs_fsync() is
	// necessary and it causes a journal transaction
	// deadlock so we are removing it.
	//
	//if (vnodetype == VREG && !vnode_issystem(vp)) {
	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
	//	if (retval)
	//		return (retval);
	//}

	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
		hfs_unlock(cp);
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
		/* Force lock since callers expect the lock to be held. */
		if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
			hfs_unlock_truncate(cp, 0);
			return (retval);
		}
		/* No need to continue if file was removed. */
		if (cp->c_flag & C_NOEXISTS) {
			hfs_unlock_truncate(cp, 0);
			return (ENOENT);
		}
		took_trunc_lock = 1;
	}
	headblks = fp->ff_blocks;
	datablks = howmany(fp->ff_size, blksize);
	growsize = datablks * blksize;
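	/*
	 * Ask for a single, all-or-nothing contiguous allocation so the
	 * relocated copy lands in one extent; if the hint falls inside the
	 * metadata zone, keep the copy in the metadata zone as well.
	 */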
	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
	if (blockHint >= hfsmp->hfs_metazone_start &&
	    blockHint <= hfsmp->hfs_metazone_end)
		eflags |= kEFMetadataMask;

	if (hfs_start_transaction(hfsmp) != 0) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, 0);
		return (EINVAL);
	}
	started_tr = 1;
	/*
	 * Protect the extents b-tree and the allocation bitmap
	 * during MapFileBlockC and ExtendFileC operations.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

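	/*
	 * Record the physical sector backing the last byte of the current
	 * allocation.  It is compared against the first sector of the new
	 * allocation below: if the two are adjacent, the "new" space merely
	 * extends the old extent and relocation would accomplish nothing.
	 */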
	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
	if (retval) {
		retval = MacToVFSError(retval);
		goto out;
	}

	/*
	 * STEP 1 - acquire new allocation blocks.
	 */
	nextallocsave = hfsmp->nextAllocation;
	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
	if (eflags & kEFMetadataMask) {
		HFS_MOUNT_LOCK(hfsmp, TRUE);
		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
		MarkVCBDirty(hfsmp);
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	}

	retval = MacToVFSError(retval);
	if (retval == 0) {
		cp->c_flag |= C_MODIFIED;
		if (newbytes < growsize) {
			retval = ENOSPC;
			goto restore;
		} else if (fp->ff_blocks < (headblks + datablks)) {
			printf("hfs_relocate: allocation failed\n");
			retval = ENOSPC;
			goto restore;
		}

		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
		if (retval) {
			retval = MacToVFSError(retval);
		} else if ((sector_a + 1) == sector_b) {
			retval = ENOSPC;
			goto restore;
		} else if ((eflags & kEFMetadataMask) &&
		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
		              hfsmp->hfs_metazone_end)) {
#if 0
			const char * filestr;
			char emptystr = '\0';

			if (cp->c_desc.cd_nameptr != NULL) {
				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
			} else if (vnode_name(vp) != NULL) {
				filestr = vnode_name(vp);
			} else {
				filestr = &emptystr;
			}
#endif
			retval = ENOSPC;
			goto restore;
		}
	}
	/* Done with system locks and journal for now. */
	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	hfs_end_transaction(hfsmp);
	started_tr = 0;

	if (retval) {
		/*
		 * Check to see if failure is due to excessive fragmentation.
		 */
		if ((retval == ENOSPC) &&
		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
		}
		goto out;
	}
	/*
	 * STEP 2 - clone file data into the new allocation blocks.
	 */

	if (vnodetype == VLNK)
		retval = hfs_clonelink(vp, blksize, cred, p);
	else if (vnode_issystem(vp))
		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
	else
		retval = hfs_clonefile(vp, headblks, datablks, blksize);

	/* Start transaction for step 3 or for a restore. */
	if (hfs_start_transaction(hfsmp) != 0) {
		retval = EINVAL;
		goto out;
	}
	started_tr = 1;
	if (retval)
		goto restore;

	/*
	 * STEP 3 - switch to cloned data and remove old blocks.
	 */
	lockflags = SFL_BITMAP;
	if (overflow_extents(fp))
		lockflags |= SFL_EXTENTS;
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

	retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;
	if (retval)
		goto restore;
out:
	if (took_trunc_lock)
		hfs_unlock_truncate(cp, 0);

	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		lockflags = 0;
	}

	/* Push cnode's new extent data to disk. */
	if (retval == 0) {
		(void) hfs_update(vp, MNT_WAIT);
	}
	if (hfsmp->jnl) {
		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
			(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
		else
			(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
	}
exit:
	if (started_tr)
		hfs_end_transaction(hfsmp);

	return (retval);

restore:
	if (fp->ff_blocks == headblks) {
		if (took_trunc_lock)
			hfs_unlock_truncate(cp, 0);
		goto exit;
	}
	/*
	 * Give back any newly allocated space.
	 */
	if (lockflags == 0) {
		lockflags = SFL_BITMAP;
		if (overflow_extents(fp))
			lockflags |= SFL_EXTENTS;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	}

	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
						 FTOC(fp)->c_fileid, false);

	hfs_systemfile_unlock(hfsmp, lockflags);
	lockflags = 0;

	if (took_trunc_lock)
		hfs_unlock_truncate(cp, 0);
	goto exit;
}


/*
 * Clone a symlink.
 *
 */
static int
hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
{
	struct buf *head_bp = NULL;
	struct buf *tail_bp = NULL;
	int error;


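	/*
	 * A symlink's target fits in a single allocation block (hfs_relocate
	 * returns EFBIG otherwise).  Read the original copy at logical block
	 * 0 and write it into logical block 1, the first block of the newly
	 * allocated (doubled) range.
	 */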
	error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
	if (error)
		goto out;

	tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
	if (tail_bp == NULL) {
		error = EIO;
		goto out;
	}
	bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
	error = (int)buf_bwrite(tail_bp);
out:
	if (head_bp) {
		buf_markinvalid(head_bp);
		buf_brelse(head_bp);
	}
	(void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);

	return (error);
}

/*
 * Clone a file's data within the file.
 *
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
	caddr_t  bufp;
	size_t  bufsize;
	size_t  copysize;
	size_t  iosize;
	size_t  offset;
	off_t	writebase;
	uio_t auio;
	int  error = 0;

	writebase = blkstart * blksize;
	copysize = blkcnt * blksize;
	iosize = bufsize = MIN(copysize, 128 * 1024);
	offset = 0;

	hfs_unlock(VTOC(vp));

#if CONFIG_PROTECT
	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
		return (error);
	}
#endif /* CONFIG_PROTECT */

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
		return (ENOMEM);
	}

	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);

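	/*
	 * Copy loop: read up to 128 KB (bufsize) at a time starting from the
	 * beginning of the fork and write it back at writebase + offset,
	 * i.e. into the newly allocated blocks.  IO_NOCACHE | IO_SYNC keeps
	 * the clone from polluting the cache and makes each chunk durable
	 * before moving on.
	 */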
	while (offset < copysize) {
		iosize = MIN(copysize - offset, iosize);

		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
		if (error) {
			printf("hfs_clonefile: cluster_read failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
			error = EIO;
			break;
		}

		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
		uio_addiov(auio, (uintptr_t)bufp, iosize);

		error = cluster_write(vp, auio, writebase + offset,
		                      writebase + offset + iosize,
		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
		if (error) {
			printf("hfs_clonefile: cluster_write failed - %d\n", error);
			break;
		}
		if (uio_resid(auio) != 0) {
			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
			error = EIO;
			break;
		}
		offset += iosize;
	}
	uio_free(auio);

	if ((blksize & PAGE_MASK)) {
		/*
		 * since the copy may not have started on a PAGE
		 * boundary (or may not have ended on one), we
		 * may have pages left in the cache since NOCACHE
		 * will let partially written pages linger...
		 * let's just flush the entire range to make sure
		 * we don't have any pages left that are beyond
		 * (or intersect) the real LEOF of this file
		 */
		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
	} else {
		/*
		 * No need to call ubc_sync_range or hfs_invalbuf
		 * since the file was copied using IO_NOCACHE and
		 * the copy was done starting and ending on a page
		 * boundary in the file.
		 */
	}
	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
	return (error);
}

/*
 * Clone a system (metadata) file.
 *
 */
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
	caddr_t  bufp;
	char * offset;
	size_t  bufsize;
	size_t  iosize;
	struct buf *bp = NULL;
	daddr64_t  blkno;
	daddr64_t  blk;
	daddr64_t  start_blk;
	daddr64_t  last_blk;
	int  breadcnt;
	int  i;
	int  error = 0;


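	/*
	 * Copy in chunks of up to 1 MB, rounded down to a whole number of
	 * logical blocks (iosize, the device's logical block size, is
	 * assumed to be a power of two here, so the mask below rounds
	 * correctly).  Each chunk is read block-by-block into bufp and then
	 * written out at the corresponding offset in the new allocation.
	 */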
	iosize = GetLogicalBlockSize(vp);
	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
	breadcnt = bufsize / iosize;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
		return (ENOMEM);
	}
	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
	blkno = 0;

	while (blkno < last_blk) {
		/*
		 * Read up to a megabyte
		 */
		offset = bufp;
		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
			if (error) {
				printf("hfs_clonesysfile: meta_bread error %d\n", error);
				goto out;
			}
			if (buf_count(bp) != iosize) {
				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
				goto out;
			}
			bcopy((char *)buf_dataptr(bp), offset, iosize);

			buf_markinvalid(bp);
			buf_brelse(bp);
			bp = NULL;

			offset += iosize;
		}

		/*
		 * Write up to a megabyte
		 */
		offset = bufp;
		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
			if (bp == NULL) {
				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
				error = EIO;
				goto out;
			}
			bcopy(offset, (char *)buf_dataptr(bp), iosize);
			error = (int)buf_bwrite(bp);
			bp = NULL;
			if (error)
				goto out;
			offset += iosize;
		}
	}
out:
	if (bp) {
		buf_brelse(bp);
	}

	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

	error = hfs_fsync(vp, MNT_WAIT, 0, p);

	return (error);
}