1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*	@(#)hfs_readwrite.c	1.0
29 *
30 *	(c) 1998-2001 Apple Computer, Inc.  All Rights Reserved
31 *
32 *	hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/resourcevar.h>
39#include <sys/kernel.h>
40#include <sys/fcntl.h>
41#include <sys/filedesc.h>
42#include <sys/stat.h>
43#include <sys/buf.h>
44#include <sys/buf_internal.h>
45#include <sys/proc.h>
46#include <sys/kauth.h>
47#include <sys/vnode.h>
48#include <sys/vnode_internal.h>
49#include <sys/uio.h>
50#include <sys/vfs_context.h>
51#include <sys/fsevents.h>
52#include <kern/kalloc.h>
53#include <sys/disk.h>
54#include <sys/sysctl.h>
55#include <sys/fsctl.h>
56#include <sys/mount_internal.h>
57#include <sys/file_internal.h>
58
59#include <miscfs/specfs/specdev.h>
60
61#include <sys/ubc.h>
62#include <sys/ubc_internal.h>
63
64#include <vm/vm_pageout.h>
65#include <vm/vm_kern.h>
66
67#include <sys/kdebug.h>
68
#include	"hfs.h"
#include	"hfs_attrlist.h"
#include	"hfs_endian.h"
#include	"hfs_fsctl.h"
#include	"hfs_quota.h"
#include	"hfscommon/headers/FileMgrInternal.h"
#include	"hfscommon/headers/BTreesInternal.h"
#include	"hfs_cnode.h"
#include	"hfs_dbg.h"
78
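/*
 * True when a transfer of 'size' bytes can be handed to the cluster layer:
 * the size must be a multiple of 4K and no larger than MAXPHYSIO/2.
 */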
79#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
80
81enum {
82	MAXHFSFILESIZE = 0x7FFFFFFF		/* this needs to go in the mount structure */
83};
84
85/* from bsd/hfs/hfs_vfsops.c */
86extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
87
88static int  hfs_clonefile(struct vnode *, int, int, int);
89static int  hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
90static int  hfs_minorupdate(struct vnode *vp);
91static int  do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
92
93/* from bsd/hfs/hfs_vnops.c */
94extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
95
96
97
98int flush_cache_on_write = 0;
99SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
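
/*
 * The knob above is exposed read-write as kern.flush_cache_on_write, so it can
 * be toggled at runtime (e.g. `sysctl -w kern.flush_cache_on_write=1`) to force
 * a DKIOCSYNCHRONIZECACHE after writes to uncached files (see hfs_vnop_write).
 */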
100
101/*
102 * Read data from a file.
103 */
104int
105hfs_vnop_read(struct vnop_read_args *ap)
106{
107	/*
108	   struct vnop_read_args {
109	   struct vnodeop_desc *a_desc;
110	   vnode_t a_vp;
111	   struct uio *a_uio;
112	   int a_ioflag;
113	   vfs_context_t a_context;
114	   };
115	 */
116
117	uio_t uio = ap->a_uio;
118	struct vnode *vp = ap->a_vp;
119	struct cnode *cp;
120	struct filefork *fp;
121	struct hfsmount *hfsmp;
122	off_t filesize;
123	off_t filebytes;
124	off_t start_resid = uio_resid(uio);
125	off_t offset = uio_offset(uio);
126	int retval = 0;
127	int took_truncate_lock = 0;
128	int io_throttle = 0;
129	int throttled_count = 0;
130
131	/* Preflight checks */
132	if (!vnode_isreg(vp)) {
133		/* can only read regular files */
134		if (vnode_isdir(vp))
135			return (EISDIR);
136		else
137			return (EPERM);
138	}
139	if (start_resid == 0)
140		return (0);		/* Nothing left to do */
141	if (offset < 0)
		return (EINVAL);	/* can't read from a negative offset */
143
144	if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
145						(IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
		/* Don't allow unencrypted I/O requests from user space */
147		return EPERM;
148	}
149
150
151
152#if HFS_COMPRESSION
153	if (VNODE_IS_RSRC(vp)) {
154		if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
155			return 0;
156		}
157		/* otherwise read the resource fork normally */
158	} else {
159		int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
160		if (compressed) {
161			retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
162			if (compressed) {
163				if (retval == 0) {
164					/* successful read, update the access time */
165					VTOC(vp)->c_touch_acctime = TRUE;
166
167					/* compressed files are not hot file candidates */
168					if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
169						VTOF(vp)->ff_bytesread = 0;
170					}
171				}
172				return retval;
173			}
174			/* otherwise the file was converted back to a regular file while we were reading it */
175			retval = 0;
176		} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
177			int error;
178
179			error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
180			if (error) {
181				return error;
182			}
183
184		}
185	}
186#endif /* HFS_COMPRESSION */
187
188	cp = VTOC(vp);
189	fp = VTOF(vp);
190	hfsmp = VTOHFS(vp);
191
192#if CONFIG_PROTECT
193	if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
194		goto exit;
195	}
196#endif
197
198	/*
199	 * If this read request originated from a syscall (as opposed to
200	 * an in-kernel page fault or something), then set it up for
201	 * throttle checks
202	 */
203	if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
204		io_throttle = IO_RETURN_ON_THROTTLE;
205	}
206
207read_again:
208
209	/* Protect against a size change. */
210	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
211	took_truncate_lock = 1;
212
213	filesize = fp->ff_size;
214	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
215
216	/*
217	 * Check the file size. Note that per POSIX spec, we return 0 at
218	 * file EOF, so attempting a read at an offset that is too big
219	 * should just return 0 on HFS+. Since the return value was initialized
220	 * to 0 above, we just jump to exit.  HFS Standard has its own behavior.
221	 */
222	if (offset > filesize) {
223		if ((hfsmp->hfs_flags & HFS_STANDARD) &&
224		    (offset > (off_t)MAXHFSFILESIZE)) {
225			retval = EFBIG;
226		}
227		goto exit;
228	}
229
230	KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
231		(int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
232
233	retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
234
235	cp->c_touch_acctime = TRUE;
236
237	KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
238		(int)uio_offset(uio), uio_resid(uio), (int)filesize,  (int)filebytes, 0);
239
	/*
	 * Keep track of the blocks read.
	 */
243	if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
244		int took_cnode_lock = 0;
245		off_t bytesread;
246
247		bytesread = start_resid - uio_resid(uio);
248
249		/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
250		if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
251			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
252			took_cnode_lock = 1;
253		}
254		/*
255		 * If this file hasn't been seen since the start of
256		 * the current sampling period then start over.
257		 */
258		if (cp->c_atime < hfsmp->hfc_timebase) {
259			struct timeval tv;
260
261			fp->ff_bytesread = bytesread;
262			microtime(&tv);
263			cp->c_atime = tv.tv_sec;
264		} else {
265			fp->ff_bytesread += bytesread;
266		}
267		if (took_cnode_lock)
268			hfs_unlock(cp);
269	}
270exit:
271	if (took_truncate_lock) {
272		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
273	}
274	if (retval == EAGAIN) {
275		throttle_lowpri_io(1);
276		throttled_count++;
277
278		retval = 0;
279		goto read_again;
280	}
281	if (throttled_count) {
282		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
283	}
284	return (retval);
285}
286
287/*
288 * Write data to a file.
289 */
290int
291hfs_vnop_write(struct vnop_write_args *ap)
292{
293	uio_t uio = ap->a_uio;
294	struct vnode *vp = ap->a_vp;
295	struct cnode *cp;
296	struct filefork *fp;
297	struct hfsmount *hfsmp;
298	kauth_cred_t cred = NULL;
299	off_t origFileSize;
300	off_t writelimit;
301	off_t bytesToAdd = 0;
302	off_t actualBytesAdded;
303	off_t filebytes;
304	off_t offset;
305	ssize_t resid;
306	int eflags;
307	int ioflag = ap->a_ioflag;
308	int retval = 0;
309	int lockflags;
310	int cnode_locked = 0;
311	int partialwrite = 0;
312	int do_snapshot = 1;
313	time_t orig_ctime=VTOC(vp)->c_ctime;
314	int took_truncate_lock = 0;
315	int io_return_on_throttle = 0;
316	int throttled_count = 0;
317	struct rl_entry *invalid_range;
318
319#if HFS_COMPRESSION
320	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
321		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
322		switch(state) {
323			case FILE_IS_COMPRESSED:
324				return EACCES;
325			case FILE_IS_CONVERTING:
326				/* if FILE_IS_CONVERTING, we allow writes but do not
327				   bother with snapshots or else we will deadlock.
328				*/
329				do_snapshot = 0;
330				break;
331			default:
332				printf("invalid state %d for compressed file\n", state);
333				/* fall through */
334		}
335	} else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
336		int error;
337
338		error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
339		if (error != 0) {
340			return error;
341		}
342	}
343
344	if (do_snapshot) {
345		check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
346	}
347
348#endif
349
350	if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
351						(IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
		/* Don't allow unencrypted I/O requests from user space */
353		return EPERM;
354	}
355
356
357	resid = uio_resid(uio);
358	offset = uio_offset(uio);
359
360	if (offset < 0)
361		return (EINVAL);
362	if (resid == 0)
363		return (E_NONE);
364	if (!vnode_isreg(vp))
365		return (EPERM);  /* Can only write regular files */
366
367	cp = VTOC(vp);
368	fp = VTOF(vp);
369	hfsmp = VTOHFS(vp);
370
371#if CONFIG_PROTECT
372	if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
373		goto exit;
374	}
375#endif
376
377	eflags = kEFDeferMask;	/* defer file block allocations */
378#if HFS_SPARSE_DEV
379	/*
380	 * When the underlying device is sparse and space
381	 * is low (< 8MB), stop doing delayed allocations
382	 * and begin doing synchronous I/O.
383	 */
384	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
385	    (hfs_freeblks(hfsmp, 0) < 2048)) {
386		eflags &= ~kEFDeferMask;
387		ioflag |= IO_SYNC;
388	}
389#endif /* HFS_SPARSE_DEV */
390
391	if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
392			(IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
393		io_return_on_throttle = IO_RETURN_ON_THROTTLE;
394	}
395
396again:
397	/*
398	 * Protect against a size change.
399	 *
400	 * Note: If took_truncate_lock is true, then we previously got the lock shared
401	 * but needed to upgrade to exclusive.  So try getting it exclusive from the
402	 * start.
403	 */
404	if (ioflag & IO_APPEND || took_truncate_lock) {
405		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
406	}
407	else {
408		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
409	}
410	took_truncate_lock = 1;
411
412	/* Update UIO */
413	if (ioflag & IO_APPEND) {
414		uio_setoffset(uio, fp->ff_size);
415		offset = fp->ff_size;
416	}
417	if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
418		retval = EPERM;
419		goto exit;
420	}
421
422	origFileSize = fp->ff_size;
423	writelimit = offset + resid;
424	filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
425
426	/*
427	 * We may need an exclusive truncate lock for several reasons, all
428	 * of which are because we may be writing to a (portion of a) block
429	 * for the first time, and we need to make sure no readers see the
430	 * prior, uninitialized contents of the block.  The cases are:
431	 *
432	 * 1. We have unallocated (delayed allocation) blocks.  We may be
433	 *    allocating new blocks to the file and writing to them.
434	 *    (A more precise check would be whether the range we're writing
435	 *    to contains delayed allocation blocks.)
436	 * 2. We need to extend the file.  The bytes between the old EOF
437	 *    and the new EOF are not yet initialized.  This is important
438	 *    even if we're not allocating new blocks to the file.  If the
439	 *    old EOF and new EOF are in the same block, we still need to
440	 *    protect that range of bytes until they are written for the
441	 *    first time.
442	 * 3. The write overlaps some invalid ranges (delayed zero fill; that
443	 *    part of the file has been allocated, but not yet written).
444	 *
445	 * If we had a shared lock with the above cases, we need to try to upgrade
446	 * to an exclusive lock.  If the upgrade fails, we will lose the shared
447	 * lock, and will need to take the truncate lock again; the took_truncate_lock
448	 * flag will still be set, causing us to try for an exclusive lock next time.
449	 *
450	 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
451	 * lock is held, since it protects the range lists.
452	 */
453	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
454	    ((fp->ff_unallocblocks != 0) ||
455	     (writelimit > origFileSize))) {
456		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
457			/*
458			 * Lock upgrade failed and we lost our shared lock, try again.
459			 * Note: we do not set took_truncate_lock=0 here.  Leaving it
460			 * set to 1 will cause us to try to get the lock exclusive.
461			 */
462			goto again;
463		}
464		else {
465			/* Store the owner in the c_truncatelockowner field if we successfully upgrade */
466			cp->c_truncatelockowner = current_thread();
467		}
468	}
469
470	if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
471		goto exit;
472	}
473	cnode_locked = 1;
474
475	/*
476	 * Now that we have the cnode lock, see if there are delayed zero fill ranges
477	 * overlapping our write.  If so, we need the truncate lock exclusive (see above).
478	 */
479	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
480	    (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
481	    	/*
482		 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
483		 * a deadlock, rather than simply returning failure.  (That is, it apparently does
484		 * not behave like a "try_lock").  Since this condition is rare, just drop the
485		 * cnode lock and try again.  Since took_truncate_lock is set, we will
486		 * automatically take the truncate lock exclusive.
487		 */
488		hfs_unlock(cp);
489		cnode_locked = 0;
490		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
491		goto again;
492	}
493
494	KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
495		     (int)offset, uio_resid(uio), (int)fp->ff_size,
496		     (int)filebytes, 0);
497
498	/* Check if we do not need to extend the file */
499	if (writelimit <= filebytes) {
500		goto sizeok;
501	}
502
503	cred = vfs_context_ucred(ap->a_context);
504	bytesToAdd = writelimit - filebytes;
505
506#if QUOTA
507	retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
508			   cred, 0);
509	if (retval)
510		goto exit;
511#endif /* QUOTA */
512
513	if (hfs_start_transaction(hfsmp) != 0) {
514		retval = EINVAL;
515		goto exit;
516	}
517
518	while (writelimit > filebytes) {
519		bytesToAdd = writelimit - filebytes;
520		if (cred && suser(cred, NULL) != 0)
521			eflags |= kEFReserveMask;
522
523		/* Protect extents b-tree and allocation bitmap */
524		lockflags = SFL_BITMAP;
525		if (overflow_extents(fp))
526			lockflags |= SFL_EXTENTS;
527		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
528
529		/* Files that are changing size are not hot file candidates. */
530		if (hfsmp->hfc_stage == HFC_RECORDING) {
531			fp->ff_bytesread = 0;
532		}
533		retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
534				0, eflags, &actualBytesAdded));
535
536		hfs_systemfile_unlock(hfsmp, lockflags);
537
538		if ((actualBytesAdded == 0) && (retval == E_NONE))
539			retval = ENOSPC;
540		if (retval != E_NONE)
541			break;
542		filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
543		KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
544			(int)offset, uio_resid(uio), (int)fp->ff_size,  (int)filebytes, 0);
545	}
546	(void) hfs_update(vp, TRUE);
547	(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
548	(void) hfs_end_transaction(hfsmp);
549
	/*
	 * If we didn't grow the file enough, try a partial write.
	 * POSIX expects this behavior.
	 */
554	if ((retval == ENOSPC) && (filebytes > offset)) {
555		retval = 0;
556		partialwrite = 1;
557		uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
558		resid -= bytesToAdd;
559		writelimit = filebytes;
560	}
561sizeok:
562	if (retval == E_NONE) {
563		off_t filesize;
564		off_t zero_off;
565		off_t tail_off;
566		off_t inval_start;
567		off_t inval_end;
568		off_t io_start;
569		int lflag;
570
571		if (writelimit > fp->ff_size)
572			filesize = writelimit;
573		else
574			filesize = fp->ff_size;
575
576		lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
577
578		if (offset <= fp->ff_size) {
579			zero_off = offset & ~PAGE_MASK_64;
580
			/* Check whether the area between zero_off and the start
			   of the transfer is invalid and should be zero-filled
			   as part of the transfer:
			 */
585			if (offset > zero_off) {
586			        if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
587				        lflag |= IO_HEADZEROFILL;
588			}
589		} else {
590			off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
591
			/* The bytes between fp->ff_size and uio->uio_offset must never be
			   read without being zeroed.  The current last block is filled with
			   zeroes if it holds valid data, but in all cases we merely do a little
			   bookkeeping to track the area from the end of the current last page
			   to the start of the area actually written.  For the same reason, only
			   the bytes up to the start of the page where this write will start are
			   invalidated; any remainder before uio->uio_offset is explicitly zeroed
			   as part of the cluster_write.

			   Note that inval_start, the start of the page after the current EOF,
			   may be past the start of the write, in which case the zeroing
			   will be handled by the cluster_write of the actual data.
			 */
604			inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
605			inval_end = offset & ~PAGE_MASK_64;
606			zero_off = fp->ff_size;
607
608			if ((fp->ff_size & PAGE_MASK_64) &&
609				(rl_scan(&fp->ff_invalidranges,
610							eof_page_base,
611							fp->ff_size - 1,
612							&invalid_range) != RL_NOOVERLAP)) {
613				/* The page containing the EOF is not valid, so the
614				   entire page must be made inaccessible now.  If the write
615				   starts on a page beyond the page containing the eof
616				   (inval_end > eof_page_base), add the
617				   whole page to the range to be invalidated.  Otherwise
618				   (i.e. if the write starts on the same page), zero-fill
619				   the entire page explicitly now:
620				 */
621				if (inval_end > eof_page_base) {
622					inval_start = eof_page_base;
623				} else {
624					zero_off = eof_page_base;
625				};
626			};
627
628			if (inval_start < inval_end) {
629				struct timeval tv;
630				/* There's some range of data that's going to be marked invalid */
631
632				if (zero_off < inval_start) {
633					/* The pages between inval_start and inval_end are going to be invalidated,
634					   and the actual write will start on a page past inval_end.  Now's the last
635					   chance to zero-fill the page containing the EOF:
636					 */
637					hfs_unlock(cp);
638					cnode_locked = 0;
639					retval = cluster_write(vp, (uio_t) 0,
640							fp->ff_size, inval_start,
641							zero_off, (off_t)0,
642							lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
643					hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
644					cnode_locked = 1;
645					if (retval) goto ioerr_exit;
646					offset = uio_offset(uio);
647				};
648
649				/* Mark the remaining area of the newly allocated space as invalid: */
650				rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
651				microuptime(&tv);
652				cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
653				zero_off = fp->ff_size = inval_end;
654			};
655
656			if (offset > zero_off) lflag |= IO_HEADZEROFILL;
657		};
658
659		/* Check to see whether the area between the end of the write and the end of
660		   the page it falls in is invalid and should be zero-filled as part of the transfer:
661		 */
662		tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
663		if (tail_off > filesize) tail_off = filesize;
664		if (tail_off > writelimit) {
665			if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
666				lflag |= IO_TAILZEROFILL;
667			};
668		};
669
670		/*
671		 * if the write starts beyond the current EOF (possibly advanced in the
672		 * zeroing of the last block, above), then we'll zero fill from the current EOF
673		 * to where the write begins:
674		 *
675		 * NOTE: If (and ONLY if) the portion of the file about to be written is
676		 *       before the current EOF it might be marked as invalid now and must be
677		 *       made readable (removed from the invalid ranges) before cluster_write
678		 *       tries to write it:
679		 */
680		io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
681		if (io_start < fp->ff_size) {
682			off_t io_end;
683
684			io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
685			rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
686		};
687
688		hfs_unlock(cp);
689		cnode_locked = 0;
690
691		/*
692		 * We need to tell UBC the fork's new size BEFORE calling
693		 * cluster_write, in case any of the new pages need to be
694		 * paged out before cluster_write completes (which does happen
695		 * in embedded systems due to extreme memory pressure).
696		 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
697		 * will be, so that it can pass that on to cluster_pageout, and
698		 * allow those pageouts.
699		 *
700		 * We don't update ff_size yet since we don't want pageins to
701		 * be able to see uninitialized data between the old and new
702		 * EOF, until cluster_write has completed and initialized that
703		 * part of the file.
704		 *
705		 * The vnode pager relies on the file size last given to UBC via
706		 * ubc_setsize.  hfs_vnop_pageout relies on fp->ff_new_size or
707		 * ff_size (whichever is larger).  NOTE: ff_new_size is always
708		 * zero, unless we are extending the file via write.
709		 */
710		if (filesize > fp->ff_size) {
711			fp->ff_new_size = filesize;
712			ubc_setsize(vp, filesize);
713		}
714		retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
715				tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
716		if (retval) {
717			fp->ff_new_size = 0;	/* no longer extending; use ff_size */
718
719			if (retval == EAGAIN) {
720				/*
721				 * EAGAIN indicates that we still have I/O to do, but
722				 * that we now need to be throttled
723				 */
724				if (resid != uio_resid(uio)) {
725					/*
726					 * did manage to do some I/O before returning EAGAIN
727					 */
728					resid = uio_resid(uio);
729					offset = uio_offset(uio);
730
731					cp->c_touch_chgtime = TRUE;
732					cp->c_touch_modtime = TRUE;
733					hfs_incr_gencount(cp);
734				}
735				if (filesize > fp->ff_size) {
736					/*
737					 * we called ubc_setsize before the call to
738					 * cluster_write... since we only partially
739					 * completed the I/O, we need to
740					 * re-adjust our idea of the filesize based
741					 * on our interim EOF
742					 */
743					ubc_setsize(vp, offset);
744
745					fp->ff_size = offset;
746				}
747				goto exit;
748			}
749			if (filesize > origFileSize) {
750				ubc_setsize(vp, origFileSize);
751			}
752			goto ioerr_exit;
753		}
754
755		if (filesize > origFileSize) {
756			fp->ff_size = filesize;
757
758			/* Files that are changing size are not hot file candidates. */
759			if (hfsmp->hfc_stage == HFC_RECORDING) {
760				fp->ff_bytesread = 0;
761			}
762		}
763		fp->ff_new_size = 0;	/* ff_size now has the correct size */
764	}
765	if (partialwrite) {
766		uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
767		resid += bytesToAdd;
768	}
769
770	// XXXdbg - see radar 4871353 for more info
771	{
772	    if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
773		VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
774	    }
775	}
776
777ioerr_exit:
778	if (resid > uio_resid(uio)) {
779		if (!cnode_locked) {
780			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
781			cnode_locked = 1;
782		}
783
784		cp->c_touch_chgtime = TRUE;
785		cp->c_touch_modtime = TRUE;
786		hfs_incr_gencount(cp);
787
788		/*
789		 * If we successfully wrote any data, and we are not the superuser
790		 * we clear the setuid and setgid bits as a precaution against
791		 * tampering.
792		 */
793		if (cp->c_mode & (S_ISUID | S_ISGID)) {
794			cred = vfs_context_ucred(ap->a_context);
795			if (cred && suser(cred, NULL)) {
796				cp->c_mode &= ~(S_ISUID | S_ISGID);
797			}
798		}
799	}
800	if (retval) {
801		if (ioflag & IO_UNIT) {
802			(void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
803			                   0, ap->a_context);
804			uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
805			uio_setresid(uio, resid);
806			filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
807		}
808	} else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
809		retval = hfs_update(vp, TRUE);
810
811	/* Updating vcbWrCnt doesn't need to be atomic. */
812	hfsmp->vcbWrCnt++;
813
814	KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
815		(int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
816exit:
817	if (cnode_locked)
818		hfs_unlock(cp);
819
820	if (took_truncate_lock) {
821		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
822	}
823	if (retval == EAGAIN) {
824		throttle_lowpri_io(1);
825		throttled_count++;
826
827		retval = 0;
828		goto again;
829	}
830	if (throttled_count) {
831		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
832	}
833	return (retval);
834}
835
836/* support for the "bulk-access" fcntl */
837
838#define CACHE_LEVELS 16
839#define NUM_CACHE_ENTRIES (64*16)
840#define PARENT_IDS_FLAG 0x100
841
842struct access_cache {
843       int numcached;
844       int cachehits; /* these two for statistics gathering */
845       int lookups;
846       unsigned int *acache;
847       unsigned char *haveaccess;
848};
849
850struct access_t {
851	uid_t     uid;              /* IN: effective user id */
852	short     flags;            /* IN: access requested (i.e. R_OK) */
853	short     num_groups;       /* IN: number of groups user belongs to */
854	int       num_files;        /* IN: number of files to process */
855	int       *file_ids;        /* IN: array of file ids */
856	gid_t     *groups;          /* IN: array of groups */
857	short     *access;          /* OUT: access info for each file (0 for 'has access') */
858} __attribute__((unavailable)); // this structure is for reference purposes only
859
860struct user32_access_t {
861	uid_t     uid;              /* IN: effective user id */
862	short     flags;            /* IN: access requested (i.e. R_OK) */
863	short     num_groups;       /* IN: number of groups user belongs to */
864	int       num_files;        /* IN: number of files to process */
865	user32_addr_t      file_ids;        /* IN: array of file ids */
866	user32_addr_t      groups;          /* IN: array of groups */
867	user32_addr_t      access;          /* OUT: access info for each file (0 for 'has access') */
868};
869
870struct user64_access_t {
871	uid_t		uid;			/* IN: effective user id */
872	short		flags;			/* IN: access requested (i.e. R_OK) */
873	short		num_groups;		/* IN: number of groups user belongs to */
874	int		num_files;		/* IN: number of files to process */
875	user64_addr_t	file_ids;		/* IN: array of file ids */
876	user64_addr_t	groups;			/* IN: array of groups */
877	user64_addr_t	access;			/* OUT: access info for each file (0 for 'has access') */
878};
879
880
// these are the "extended" versions of the above structures
// note that it is crucial that they be a different size than
// the regular versions
884struct ext_access_t {
885	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
886	uint32_t   num_files;       /* IN: number of files to process */
887	uint32_t   map_size;        /* IN: size of the bit map */
888	uint32_t  *file_ids;        /* IN: Array of file ids */
889	char      *bitmap;          /* OUT: hash-bitmap of interesting directory ids */
890	short     *access;          /* OUT: access info for each file (0 for 'has access') */
891	uint32_t   num_parents;   /* future use */
892	cnid_t      *parents;   /* future use */
893} __attribute__((unavailable)); // this structure is for reference purposes only
894
895struct user32_ext_access_t {
896	uint32_t   flags;           /* IN: access requested (i.e. R_OK) */
897	uint32_t   num_files;       /* IN: number of files to process */
898	uint32_t   map_size;        /* IN: size of the bit map */
899	user32_addr_t  file_ids;        /* IN: Array of file ids */
900	user32_addr_t     bitmap;          /* OUT: hash-bitmap of interesting directory ids */
901	user32_addr_t access;          /* OUT: access info for each file (0 for 'has access') */
902	uint32_t   num_parents;   /* future use */
903	user32_addr_t parents;   /* future use */
904};
905
906struct user64_ext_access_t {
907	uint32_t      flags;        /* IN: access requested (i.e. R_OK) */
908	uint32_t      num_files;    /* IN: number of files to process */
909	uint32_t      map_size;     /* IN: size of the bit map */
910	user64_addr_t   file_ids;     /* IN: array of file ids */
	user64_addr_t   bitmap;       /* OUT: hash-bitmap of interesting directory ids */
912	user64_addr_t   access;       /* OUT: access info for each file (0 for 'has access') */
913	uint32_t      num_parents;/* future use */
914	user64_addr_t   parents;/* future use */
915};
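
/*
 * Illustrative only: a minimal sketch of how a 64-bit user-space caller might
 * drive this bulk-access check through fsctl(2).  The request constant is
 * assumed to be the HFS_EXT_BULKACCESS_FSCTL exported by hfs_fsctl.h, and the
 * struct mirrors the ext_access_t reference layout above (user-space pointers
 * line up with the user64_addr_t fields handled below).
 *
 *	uint32_t ids[2] = { 16, 17 };       // catalog node IDs to test
 *	short results[2];                   // one entry per file id
 *	struct ext_access_t args = {
 *		.flags       = R_OK,        // access being requested
 *		.num_files   = 2,
 *		.map_size    = 0,           // no directory bitmap wanted
 *		.file_ids    = ids,
 *		.bitmap      = NULL,
 *		.access      = results,     // 0 => has access, else an errno
 *		.num_parents = 0,
 *		.parents     = NULL,
 *	};
 *	if (fsctl("/Volumes/MyHFS", HFS_EXT_BULKACCESS_FSCTL, &args, 0) == 0) {
 *		// results[i] == 0 means the caller can reach file_ids[i]
 *	}
 */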
916
917
918/*
919 * Perform a binary search for the given parent_id. Return value is
920 * the index if there is a match.  If no_match_indexp is non-NULL it
921 * will be assigned with the index to insert the item (even if it was
922 * not found).
923 */
924static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
925{
926    int index=-1;
927    unsigned int lo=0;
928
929    do {
930	unsigned int mid = ((hi - lo)/2) + lo;
931	unsigned int this_id = array[mid];
932
933	if (parent_id == this_id) {
934	    hi = mid;
935	    break;
936	}
937
938	if (parent_id < this_id) {
939	    hi = mid;
940	    continue;
941	}
942
943	if (parent_id > this_id) {
944	    lo = mid + 1;
945	    continue;
946	}
947    } while(lo < hi);
948
949    /* check if lo and hi converged on the match */
950    if (parent_id == array[hi]) {
951	index = hi;
952    }
953
954    if (no_match_indexp) {
955	*no_match_indexp = hi;
956    }
957
958    return index;
959}
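
/*
 * Example: with acache = { 2, 5, 9 } and hi = 2, searching for 5 returns
 * index 1; searching for 7 returns -1 and sets *no_match_indexp to 2, the
 * slot where 7 would need to be inserted to keep the array sorted.
 */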
960
961
962static int
963lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
964{
965    unsigned int hi;
966    int matches = 0;
967    int index, no_match_index;
968
969    if (cache->numcached == 0) {
970	*indexp = 0;
971	return 0; // table is empty, so insert at index=0 and report no match
972    }
973
974    if (cache->numcached > NUM_CACHE_ENTRIES) {
975	cache->numcached = NUM_CACHE_ENTRIES;
976    }
977
978    hi = cache->numcached - 1;
979
980    index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
981
982    /* if no existing entry found, find index for new one */
983    if (index == -1) {
984	index = no_match_index;
985	matches = 0;
986    } else {
987	matches = 1;
988    }
989
990    *indexp = index;
991    return matches;
992}
993
994/*
995 * Add a node to the access_cache at the given index (or do a lookup first
996 * to find the index if -1 is passed in). We currently do a replace rather
997 * than an insert if the cache is full.
998 */
999static void
1000add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
1001{
1002    int lookup_index = -1;
1003
1004    /* need to do a lookup first if -1 passed for index */
1005    if (index == -1) {
1006	if (lookup_bucket(cache, &lookup_index, nodeID)) {
1007	    if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
1008		// only update an entry if the previous access was ESRCH (i.e. a scope checking error)
1009		cache->haveaccess[lookup_index] = access;
1010	    }
1011
1012	    /* mission accomplished */
1013	    return;
1014	} else {
1015	    index = lookup_index;
1016	}
1017
1018    }
1019
1020    /* if the cache is full, do a replace rather than an insert */
1021    if (cache->numcached >= NUM_CACHE_ENTRIES) {
1022	cache->numcached = NUM_CACHE_ENTRIES-1;
1023
1024	if (index > cache->numcached) {
1025	    index = cache->numcached;
1026	}
1027    }
1028
1029    if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
1030	index++;
1031    }
1032
1033    if (index >= 0 && index < cache->numcached) {
1034	/* only do bcopy if we're inserting */
1035	bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
1036	bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
1037    }
1038
1039    cache->acache[index] = nodeID;
1040    cache->haveaccess[index] = access;
1041    cache->numcached++;
1042}
1043
1044
1045struct cinfo {
1046    uid_t   uid;
1047    gid_t   gid;
1048    mode_t  mode;
1049    cnid_t  parentcnid;
1050    u_int16_t recflags;
1051};
1052
1053static int
1054snoop_callback(const cnode_t *cp, void *arg)
1055{
1056    struct cinfo *cip = arg;
1057
1058    cip->uid = cp->c_uid;
1059    cip->gid = cp->c_gid;
1060    cip->mode = cp->c_mode;
1061    cip->parentcnid = cp->c_parentcnid;
1062    cip->recflags = cp->c_attr.ca_recflags;
1063
1064    return (0);
1065}
1066
1067/*
1068 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1069 * isn't incore, then go to the catalog.
1070 */
1071static int
1072do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1073    struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1074{
1075    int error = 0;
1076
1077    /* if this id matches the one the fsctl was called with, skip the lookup */
1078    if (cnid == skip_cp->c_cnid) {
1079		cnattrp->ca_uid = skip_cp->c_uid;
1080		cnattrp->ca_gid = skip_cp->c_gid;
1081		cnattrp->ca_mode = skip_cp->c_mode;
1082		cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1083		keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1084    } else {
1085		struct cinfo c_info;
1086
		/* otherwise, check the cnode hash in case the file/dir is in core */
1088		error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1089
1090		if (error == EACCES) {
1091			// File is deleted
1092			return ENOENT;
1093		} else if (!error) {
1094			cnattrp->ca_uid = c_info.uid;
1095			cnattrp->ca_gid = c_info.gid;
1096			cnattrp->ca_mode = c_info.mode;
1097			cnattrp->ca_recflags = c_info.recflags;
1098			keyp->hfsPlus.parentID = c_info.parentcnid;
1099		} else {
1100			int lockflags;
1101
1102			if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1103				throttle_lowpri_io(1);
1104
1105			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1106
1107			/* lookup this cnid in the catalog */
1108			error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1109
1110			hfs_systemfile_unlock(hfsmp, lockflags);
1111
1112			cache->lookups++;
1113		}
1114    }
1115
1116    return (error);
1117}
1118
1119
1120/*
1121 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1122 * up to CACHE_LEVELS as we progress towards the root.
1123 */
1124static int
1125do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1126    struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1127    struct vfs_context *my_context,
1128    char *bitmap,
1129    uint32_t map_size,
1130    cnid_t* parents,
1131    uint32_t num_parents)
1132{
1133    int                     myErr = 0;
1134    int                     myResult;
1135    HFSCatalogNodeID        thisNodeID;
1136    unsigned int            myPerms;
1137    struct cat_attr         cnattr;
1138    int                     cache_index = -1, scope_index = -1, scope_idx_start = -1;
1139    CatalogKey              catkey;
1140
1141    int i = 0, ids_to_cache = 0;
1142    int parent_ids[CACHE_LEVELS];
1143
1144    thisNodeID = nodeID;
1145    while (thisNodeID >=  kRootDirID) {
1146	myResult = 0;   /* default to "no access" */
1147
1148	/* check the cache before resorting to hitting the catalog */
1149
1150	/* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1151	 * to look any further after hitting cached dir */
1152
1153	if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1154	    cache->cachehits++;
1155	    myErr = cache->haveaccess[cache_index];
1156	    if (scope_index != -1) {
1157		if (myErr == ESRCH) {
1158		    myErr = 0;
1159		}
1160	    } else {
1161		scope_index = 0;   // so we'll just use the cache result
1162		scope_idx_start = ids_to_cache;
1163	    }
1164	    myResult = (myErr == 0) ? 1 : 0;
1165	    goto ExitThisRoutine;
1166	}
1167
1168
1169	if (parents) {
1170	    int tmp;
1171	    tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1172	    if (scope_index == -1)
1173		scope_index = tmp;
1174	    if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1175		scope_idx_start = ids_to_cache;
1176	    }
1177	}
1178
1179	/* remember which parents we want to cache */
1180	if (ids_to_cache < CACHE_LEVELS) {
1181	    parent_ids[ids_to_cache] = thisNodeID;
1182	    ids_to_cache++;
1183	}
1184	// Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1185	if (bitmap && map_size) {
1186	    bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1187	}
1188
1189
1190	/* do the lookup (checks the cnode hash, then the catalog) */
1191	myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1192	if (myErr) {
1193	    goto ExitThisRoutine; /* no access */
1194	}
1195
1196	/* Root always gets access. */
1197	if (suser(myp_ucred, NULL) == 0) {
1198		thisNodeID = catkey.hfsPlus.parentID;
1199		myResult = 1;
1200		continue;
1201	}
1202
	// if the thing has ACLs, do the full permission check
1204	if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1205	    struct vnode *vp;
1206
1207	    /* get the vnode for this cnid */
1208	    myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1209	    if ( myErr ) {
1210		myResult = 0;
1211		goto ExitThisRoutine;
1212	    }
1213
1214	    thisNodeID = VTOC(vp)->c_parentcnid;
1215
1216	    hfs_unlock(VTOC(vp));
1217
1218	    if (vnode_vtype(vp) == VDIR) {
1219		myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1220	    } else {
1221		myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1222	    }
1223
1224	    vnode_put(vp);
1225	    if (myErr) {
1226		myResult = 0;
1227		goto ExitThisRoutine;
1228	    }
1229	} else {
1230	    unsigned int flags;
1231		int mode = cnattr.ca_mode & S_IFMT;
1232		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1233
1234		if (mode == S_IFDIR) {
1235			flags = R_OK | X_OK;
1236		} else {
1237			flags = R_OK;
1238		}
1239		if ( (myPerms & flags) != flags) {
1240			myResult = 0;
1241			myErr = EACCES;
1242			goto ExitThisRoutine;   /* no access */
1243		}
1244
1245	    /* up the hierarchy we go */
1246	    thisNodeID = catkey.hfsPlus.parentID;
1247	}
1248    }
1249
1250    /* if here, we have access to this node */
1251    myResult = 1;
1252
1253  ExitThisRoutine:
1254    if (parents && myErr == 0 && scope_index == -1) {
1255	myErr = ESRCH;
1256    }
1257
1258    if (myErr) {
1259	myResult = 0;
1260    }
1261    *err = myErr;
1262
1263    /* cache the parent directory(ies) */
1264    for (i = 0; i < ids_to_cache; i++) {
1265	if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1266	    add_node(cache, -1, parent_ids[i], ESRCH);
1267	} else {
1268	    add_node(cache, -1, parent_ids[i], myErr);
1269	}
1270    }
1271
1272    return (myResult);
1273}
1274
1275static int
1276do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1277    struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1278{
1279    boolean_t is64bit;
1280
    /*
     * NOTE: on entry, the vnode has an io_ref. In case this vnode
     * happens to be in our list of file_ids, we note it so that we
     * avoid calling hfs_chashget_nowait() on that id, as that
     * will cause a "locking against myself" panic.
     */
1287    Boolean check_leaf = true;
1288
1289    struct user64_ext_access_t *user_access_structp;
1290    struct user64_ext_access_t tmp_user_access;
1291    struct access_cache cache;
1292
1293    int error = 0, prev_parent_check_ok=1;
1294    unsigned int i;
1295
1296    short flags;
1297    unsigned int num_files = 0;
1298    int map_size = 0;
1299    int num_parents = 0;
1300    int *file_ids=NULL;
1301    short *access=NULL;
1302    char *bitmap=NULL;
1303    cnid_t *parents=NULL;
1304    int leaf_index;
1305
1306    cnid_t cnid;
1307    cnid_t prevParent_cnid = 0;
1308    unsigned int myPerms;
1309    short myaccess = 0;
1310    struct cat_attr cnattr;
1311    CatalogKey catkey;
1312    struct cnode *skip_cp = VTOC(vp);
1313    kauth_cred_t cred = vfs_context_ucred(context);
1314    proc_t p = vfs_context_proc(context);
1315
1316    is64bit = proc_is64bit(p);
1317
1318    /* initialize the local cache and buffers */
1319    cache.numcached = 0;
1320    cache.cachehits = 0;
1321    cache.lookups = 0;
1322    cache.acache = NULL;
1323    cache.haveaccess = NULL;
1324
1325    /* struct copyin done during dispatch... need to copy file_id array separately */
1326    if (ap->a_data == NULL) {
1327	error = EINVAL;
1328	goto err_exit_bulk_access;
1329    }
1330
1331    if (is64bit) {
1332	if (arg_size != sizeof(struct user64_ext_access_t)) {
1333	    error = EINVAL;
1334	    goto err_exit_bulk_access;
1335	}
1336
1337	user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1338
1339    } else if (arg_size == sizeof(struct user32_access_t)) {
1340	struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1341
1342	// convert an old style bulk-access struct to the new style
1343	tmp_user_access.flags     = accessp->flags;
1344	tmp_user_access.num_files = accessp->num_files;
1345	tmp_user_access.map_size  = 0;
1346	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1347	tmp_user_access.bitmap    = USER_ADDR_NULL;
1348	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1349	tmp_user_access.num_parents = 0;
1350	user_access_structp = &tmp_user_access;
1351
1352    } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1353	struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1354
1355	// up-cast from a 32-bit version of the struct
1356	tmp_user_access.flags     = accessp->flags;
1357	tmp_user_access.num_files = accessp->num_files;
1358	tmp_user_access.map_size  = accessp->map_size;
1359	tmp_user_access.num_parents  = accessp->num_parents;
1360
1361	tmp_user_access.file_ids  = CAST_USER_ADDR_T(accessp->file_ids);
1362	tmp_user_access.bitmap    = CAST_USER_ADDR_T(accessp->bitmap);
1363	tmp_user_access.access    = CAST_USER_ADDR_T(accessp->access);
1364	tmp_user_access.parents    = CAST_USER_ADDR_T(accessp->parents);
1365
1366	user_access_structp = &tmp_user_access;
1367    } else {
1368	error = EINVAL;
1369	goto err_exit_bulk_access;
1370    }
1371
1372    map_size = user_access_structp->map_size;
1373
1374    num_files = user_access_structp->num_files;
1375
1376    num_parents= user_access_structp->num_parents;
1377
1378    if (num_files < 1) {
1379	goto err_exit_bulk_access;
1380    }
1381    if (num_files > 1024) {
1382	error = EINVAL;
1383	goto err_exit_bulk_access;
1384    }
1385
1386    if (num_parents > 1024) {
1387	error = EINVAL;
1388	goto err_exit_bulk_access;
1389    }
1390
1391    file_ids = (int *) kalloc(sizeof(int) * num_files);
1392    access = (short *) kalloc(sizeof(short) * num_files);
1393    if (map_size) {
1394	bitmap = (char *) kalloc(sizeof(char) * map_size);
1395    }
1396
1397    if (num_parents) {
1398	parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1399    }
1400
1401    cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1402    cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1403
1404    if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1405	if (file_ids) {
1406	    kfree(file_ids, sizeof(int) * num_files);
1407	}
1408	if (bitmap) {
1409	    kfree(bitmap, sizeof(char) * map_size);
1410	}
1411	if (access) {
1412	    kfree(access, sizeof(short) * num_files);
1413	}
1414	if (cache.acache) {
1415	    kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1416	}
1417	if (cache.haveaccess) {
1418	    kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1419	}
1420	if (parents) {
1421	    kfree(parents, sizeof(cnid_t) * num_parents);
1422	}
1423	return ENOMEM;
1424    }
1425
    // make sure the bitmap is zeroed out...
1427    if (bitmap) {
1428	bzero(bitmap, (sizeof(char) * map_size));
1429    }
1430
1431    if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1432		num_files * sizeof(int)))) {
1433	goto err_exit_bulk_access;
1434    }
1435
1436    if (num_parents) {
1437	if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1438		    num_parents * sizeof(cnid_t)))) {
1439	    goto err_exit_bulk_access;
1440	}
1441    }
1442
1443    flags = user_access_structp->flags;
1444    if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1445	flags = R_OK;
1446    }
1447
1448    /* check if we've been passed leaf node ids or parent ids */
1449    if (flags & PARENT_IDS_FLAG) {
1450	check_leaf = false;
1451    }
1452
1453    /* Check access to each file_id passed in */
1454    for (i = 0; i < num_files; i++) {
1455	leaf_index=-1;
1456	cnid = (cnid_t) file_ids[i];
1457
1458	/* root always has access */
1459	if ((!parents) && (!suser(cred, NULL))) {
1460	    access[i] = 0;
1461	    continue;
1462	}
1463
1464	if (check_leaf) {
1465	    /* do the lookup (checks the cnode hash, then the catalog) */
1466	    error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1467	    if (error) {
1468		access[i] = (short) error;
1469		continue;
1470	    }
1471
1472	    if (parents) {
1473		// Check if the leaf matches one of the parent scopes
1474		leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1475 		if (leaf_index >= 0 && parents[leaf_index] == cnid)
1476 		    prev_parent_check_ok = 0;
1477 		else if (leaf_index >= 0)
1478 		    prev_parent_check_ok = 1;
1479	    }
1480
	    // if the thing has ACLs, do the full permission check
1482	    if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1483		struct vnode *cvp;
1484		int myErr = 0;
1485		/* get the vnode for this cnid */
1486		myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1487		if ( myErr ) {
1488		    access[i] = myErr;
1489		    continue;
1490		}
1491
1492		hfs_unlock(VTOC(cvp));
1493
1494		if (vnode_vtype(cvp) == VDIR) {
1495		    myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1496		} else {
1497		    myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1498		}
1499
1500		vnode_put(cvp);
1501		if (myErr) {
1502		    access[i] = myErr;
1503		    continue;
1504		}
1505	    } else {
		/* before calling do_access_check(), check the target file for read access */
1507		myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1508		    cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1509
1510		/* fail fast if no access */
1511		if ((myPerms & flags) == 0) {
1512		    access[i] = EACCES;
1513		    continue;
1514		}
1515	    }
1516	} else {
1517	    /* we were passed an array of parent ids */
1518	    catkey.hfsPlus.parentID = cnid;
1519	}
1520
	/* if the previous entry had the same parent and had access, we're done */
1522 	if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1523	    cache.cachehits++;
1524	    access[i] = 0;
1525	    continue;
1526	}
1527
1528	myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1529	    skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1530
1531	if (myaccess || (error == ESRCH && leaf_index != -1)) {
1532	    access[i] = 0; // have access.. no errors to report
1533	} else {
1534	    access[i] = (error != 0 ? (short) error : EACCES);
1535	}
1536
1537	prevParent_cnid = catkey.hfsPlus.parentID;
1538    }
1539
1540    /* copyout the access array */
1541    if ((error = copyout((caddr_t)access, user_access_structp->access,
1542		num_files * sizeof (short)))) {
1543	goto err_exit_bulk_access;
1544    }
1545    if (map_size && bitmap) {
1546	if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1547		    map_size * sizeof (char)))) {
1548	    goto err_exit_bulk_access;
1549	}
1550    }
1551
1552
1553  err_exit_bulk_access:
1554
1555    if (file_ids)
1556	kfree(file_ids, sizeof(int) * num_files);
1557    if (parents)
1558	kfree(parents, sizeof(cnid_t) * num_parents);
1559    if (bitmap)
1560	kfree(bitmap, sizeof(char) * map_size);
1561    if (access)
1562	kfree(access, sizeof(short) * num_files);
1563    if (cache.acache)
1564	kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1565    if (cache.haveaccess)
1566	kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1567
1568    return (error);
1569}
1570
1571
1572/* end "bulk-access" support */
1573
1574
1575/*
1576 * Control filesystem operating characteristics.
1577 */
1578int
1579hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1580		vnode_t a_vp;
1581		int  a_command;
1582		caddr_t  a_data;
1583		int  a_fflag;
1584		vfs_context_t a_context;
1585	} */ *ap)
1586{
1587	struct vnode * vp = ap->a_vp;
1588	struct hfsmount *hfsmp = VTOHFS(vp);
1589	vfs_context_t context = ap->a_context;
1590	kauth_cred_t cred = vfs_context_ucred(context);
1591	proc_t p = vfs_context_proc(context);
1592	struct vfsstatfs *vfsp;
1593	boolean_t is64bit;
1594	off_t jnl_start, jnl_size;
1595	struct hfs_journal_info *jip;
1596#if HFS_COMPRESSION
1597	int compressed = 0;
1598	off_t uncompressed_size = -1;
1599	int decmpfs_error = 0;
1600
1601	if (ap->a_command == F_RDADVISE) {
1602		/* we need to inspect the decmpfs state of the file as early as possible */
1603		compressed = hfs_file_is_compressed(VTOC(vp), 0);
1604		if (compressed) {
1605			if (VNODE_IS_RSRC(vp)) {
1606				/* if this is the resource fork, treat it as if it were empty */
1607				uncompressed_size = 0;
1608			} else {
1609				decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1610				if (decmpfs_error != 0) {
1611					/* failed to get the uncompressed size, we'll check for this later */
1612					uncompressed_size = -1;
1613				}
1614			}
1615		}
1616	}
1617#endif /* HFS_COMPRESSION */
1618
1619	is64bit = proc_is64bit(p);
1620
1621#if CONFIG_PROTECT
1622	{
1623		int error = 0;
1624		if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1625			return error;
1626		}
1627	}
1628#endif /* CONFIG_PROTECT */
1629
1630	switch (ap->a_command) {
1631
1632	case HFS_GETPATH:
1633	{
1634		struct vnode *file_vp;
1635		cnid_t  cnid;
1636		int  outlen;
1637		char *bufptr;
1638		int error;
1639		int flags = 0;
1640
1641		/* Caller must be owner of file system. */
1642		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1643		if (suser(cred, NULL) &&
1644			kauth_cred_getuid(cred) != vfsp->f_owner) {
1645			return (EACCES);
1646		}
1647		/* Target vnode must be file system's root. */
1648		if (!vnode_isvroot(vp)) {
1649			return (EINVAL);
1650		}
1651		bufptr = (char *)ap->a_data;
1652		cnid = strtoul(bufptr, NULL, 10);
1653		if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1654			flags |= BUILDPATH_VOLUME_RELATIVE;
1655		}
1656
		/* We need to call hfs_vfs_vget to leverage the code that will
		 * fix the origin list for us if needed, as opposed to calling
		 * hfs_vget, since we will need the parent for the build_path call.
		 */
1661
1662		if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1663			return (error);
1664		}
1665		error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1666		vnode_put(file_vp);
1667
1668		return (error);
1669	}
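
	/*
	 * Illustrative only: from user space this path is typically exercised via
	 * fsctl(2) on the volume root, with a path-sized buffer that initially
	 * holds the target cnid as a decimal string (the request constant name is
	 * an assumption; see hfs_fsctl.h):
	 *
	 *	char buf[MAXPATHLEN];
	 *	snprintf(buf, sizeof(buf), "%u", cnid);
	 *	if (fsctl("/Volumes/MyHFS", HFS_GETPATH, buf, 0) == 0) {
	 *		// buf now holds the full path of the item with that cnid
	 *	}
	 */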
1670
1671	case HFS_TRANSFER_DOCUMENT_ID:
1672	{
1673		struct cnode *cp = NULL;
1674		int error;
1675		u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1676		struct fileproc *to_fp;
1677		struct vnode *to_vp;
1678		struct cnode *to_cp;
1679
1680		cp = VTOC(vp);
1681
1682		if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1683			//printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1684			return error;
1685		}
1686		if ( (error = vnode_getwithref(to_vp)) ) {
1687			file_drop(to_fd);
1688			return error;
1689		}
1690
1691		if (VTOHFS(to_vp) != hfsmp) {
1692			error = EXDEV;
1693			goto transfer_cleanup;
1694		}
1695
1696		int need_unlock = 1;
1697		to_cp = VTOC(to_vp);
1698		error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1699		if (error != 0) {
1700			//printf("could not lock the pair of cnodes (error %d)\n", error);
1701			goto transfer_cleanup;
1702		}
1703
1704		if (!(cp->c_bsdflags & UF_TRACKED)) {
1705			error = EINVAL;
1706		} else if (to_cp->c_bsdflags & UF_TRACKED) {
1707			//
1708			// if the destination is already tracked, return an error
1709			// as otherwise it's a silent deletion of the target's
1710			// document-id
1711			//
1712			error = EEXIST;
1713		} else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
			//
			// we can use the FndrExtendedFileInfo because the doc-id is the first
			// thing in both it and the ExtendedDirInfo struct, which is fixed in
			// format and cannot change layout
			//
1719			struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1720			struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1721
1722			if (f_extinfo->document_id == 0) {
1723				uint32_t new_id;
1724
1725				hfs_unlockpair(cp, to_cp);  // have to unlock to be able to get a new-id
1726
1727				if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1728					//
1729					// re-lock the pair now that we have the document-id
1730					//
1731					hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1732					f_extinfo->document_id = new_id;
1733				} else {
1734					goto transfer_cleanup;
1735				}
1736			}
1737
1738			to_extinfo->document_id = f_extinfo->document_id;
1739			f_extinfo->document_id = 0;
1740			//printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1741
1742			// make sure the destination is also UF_TRACKED
1743			to_cp->c_bsdflags |= UF_TRACKED;
1744			cp->c_bsdflags &= ~UF_TRACKED;
1745
1746			// mark the cnodes dirty
1747			cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1748			to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1749
1750			int lockflags;
1751			if ((error = hfs_start_transaction(hfsmp)) == 0) {
1752
1753				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1754
1755				(void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1756				(void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1757
1758				hfs_systemfile_unlock (hfsmp, lockflags);
1759				(void) hfs_end_transaction(hfsmp);
1760			}
1761
1762#if CONFIG_FSE
1763			add_fsevent(FSE_DOCID_CHANGED, context,
1764				    FSE_ARG_DEV,   hfsmp->hfs_raw_dev,
1765				    FSE_ARG_INO,   (ino64_t)cp->c_fileid,       // src inode #
1766				    FSE_ARG_INO,   (ino64_t)to_cp->c_fileid,    // dst inode #
1767				    FSE_ARG_INT32, to_extinfo->document_id,
1768				    FSE_ARG_DONE);
1769
1770			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
1771			need_unlock = 0;
1772
1773			if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1774				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1775			}
1776			if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1777				add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1778			}
1779#else
1780			hfs_unlockpair(cp, to_cp);    // unlock this so we can send the fsevents
1781			need_unlock = 0;
1782#endif
1783		}
1784
1785		if (need_unlock) {
1786			hfs_unlockpair(cp, to_cp);
1787		}
1788
1789	transfer_cleanup:
1790		vnode_put(to_vp);
1791		file_drop(to_fd);
1792
1793		return error;
1794	}
1795
1796
1797
1798	case HFS_PREV_LINK:
1799	case HFS_NEXT_LINK:
1800	{
1801		cnid_t linkfileid;
1802		cnid_t nextlinkid;
1803		cnid_t prevlinkid;
1804		int error;
1805
1806		/* Caller must be owner of file system. */
1807		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1808		if (suser(cred, NULL) &&
1809			kauth_cred_getuid(cred) != vfsp->f_owner) {
1810			return (EACCES);
1811		}
1812		/* Target vnode must be file system's root. */
1813		if (!vnode_isvroot(vp)) {
1814			return (EINVAL);
1815		}
1816		linkfileid = *(cnid_t *)ap->a_data;
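		/*
		 * a_data carries a hard link's file ID on input; on return it is
		 * overwritten with the cnid of the previous or next link in the
		 * sibling chain, depending on the command.
		 */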
1817		if (linkfileid < kHFSFirstUserCatalogNodeID) {
1818			return (EINVAL);
1819		}
1820		if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1821			return (error);
1822		}
1823		if (ap->a_command == HFS_NEXT_LINK) {
1824			*(cnid_t *)ap->a_data = nextlinkid;
1825		} else {
1826			*(cnid_t *)ap->a_data = prevlinkid;
1827		}
1828		return (0);
1829	}
1830
1831	case HFS_RESIZE_PROGRESS: {
1832
1833		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1834		if (suser(cred, NULL) &&
1835			kauth_cred_getuid(cred) != vfsp->f_owner) {
1836			return (EACCES); /* must be owner of file system */
1837		}
1838		if (!vnode_isvroot(vp)) {
1839			return (EINVAL);
1840		}
1841		/* file system must not be mounted read-only */
1842		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1843			return (EROFS);
1844		}
1845
1846		return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1847	}
1848
1849	case HFS_RESIZE_VOLUME: {
1850		u_int64_t newsize;
1851		u_int64_t cursize;
1852
1853		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1854		if (suser(cred, NULL) &&
1855			kauth_cred_getuid(cred) != vfsp->f_owner) {
1856			return (EACCES); /* must be owner of file system */
1857		}
1858		if (!vnode_isvroot(vp)) {
1859			return (EINVAL);
1860		}
1861
1862		/* filesystem must not be mounted read only */
1863		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1864			return (EROFS);
1865		}
1866		newsize = *(u_int64_t *)ap->a_data;
1867		cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
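		/* Grow the volume if the requested size is larger than the current
		 * size, shrink it if smaller, and do nothing if they are equal.
		 */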
1868
1869		if (newsize > cursize) {
1870			return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1871		} else if (newsize < cursize) {
1872			return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1873		} else {
1874			return (0);
1875		}
1876	}
1877	case HFS_CHANGE_NEXT_ALLOCATION: {
1878		int error = 0;		/* Assume success */
1879		u_int32_t location;
1880
1881		if (vnode_vfsisrdonly(vp)) {
1882			return (EROFS);
1883		}
1884		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1885		if (suser(cred, NULL) &&
1886			kauth_cred_getuid(cred) != vfsp->f_owner) {
1887			return (EACCES); /* must be owner of file system */
1888		}
1889		if (!vnode_isvroot(vp)) {
1890			return (EINVAL);
1891		}
1892		hfs_lock_mount(hfsmp);
1893		location = *(u_int32_t *)ap->a_data;
1894		if ((location >= hfsmp->allocLimit) &&
1895			(location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1896			error = EINVAL;
1897			goto fail_change_next_allocation;
1898		}
1899		/* Return previous value. */
1900		*(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1901		if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1902			/* On the magic value for location, set nextAllocation to the next
1903			 * block after the metadata zone and set a flag in the mount structure
1904			 * to indicate that nextAllocation should not be updated again.
1905			 */
1906			if (hfsmp->hfs_metazone_end != 0) {
1907				HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1908			}
1909			hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1910		} else {
1911			hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1912			HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1913		}
1914		MarkVCBDirty(hfsmp);
1915fail_change_next_allocation:
1916		hfs_unlock_mount(hfsmp);
1917		return (error);
1918	}
1919
1920#if HFS_SPARSE_DEV
1921	case HFS_SETBACKINGSTOREINFO: {
1922		struct vnode * bsfs_rootvp;
1923		struct vnode * di_vp;
1924		struct hfs_backingstoreinfo *bsdata;
1925		int error = 0;
1926
1927		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1928			return (EROFS);
1929		}
1930		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1931			return (EALREADY);
1932		}
1933		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1934		if (suser(cred, NULL) &&
1935			kauth_cred_getuid(cred) != vfsp->f_owner) {
1936			return (EACCES); /* must be owner of file system */
1937		}
1938		bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1939		if (bsdata == NULL) {
1940			return (EINVAL);
1941		}
1942		if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1943			return (error);
1944		}
1945		if ((error = vnode_getwithref(di_vp))) {
1946			file_drop(bsdata->backingfd);
1947			return(error);
1948		}
1949
1950		if (vnode_mount(vp) == vnode_mount(di_vp)) {
1951			(void)vnode_put(di_vp);
1952			file_drop(bsdata->backingfd);
1953			return (EINVAL);
1954		}
1955
1956		/*
1957		 * Obtain the backing fs root vnode and keep a reference
1958		 * on it.  This reference will be dropped in hfs_unmount.
1959		 */
1960		error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1961		if (error) {
1962			(void)vnode_put(di_vp);
1963			file_drop(bsdata->backingfd);
1964			return (error);
1965		}
1966		vnode_ref(bsfs_rootvp);
1967		vnode_put(bsfs_rootvp);
1968
1969		hfs_lock_mount(hfsmp);
1970		hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1971		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1972		hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1973		hfs_unlock_mount(hfsmp);
1974
1975		/* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1976
1977		/*
1978		 * If the sparse image is on a sparse image file (as opposed to a sparse
1979		 * bundle), then we may need to limit the free space to the maximum size
1980		 * of a file on that volume.  So we query (using pathconf), and if we get
1981		 * a meaningful result, we cache the number of blocks for later use in
1982		 * hfs_freeblks().
1983		 */
1984		hfsmp->hfs_backingfs_maxblocks = 0;
1985		if (vnode_vtype(di_vp) == VREG) {
1986			int terr;
1987			int hostbits;
1988			terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1989			if (terr == 0 && hostbits != 0 && hostbits < 64) {
1990				u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1991
1992				hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1993			}
1994		}
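		/*
		 * Illustrative example (not from the original code): if the host
		 * volume reports _PC_FILESIZEBITS == 32 and blockSize is 4096,
		 * hostfilesizemax is 2^32 bytes and hfs_backingfs_maxblocks becomes
		 * 2^32 / 4096 = 1048576 blocks.
		 */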
1995
1996		/* The free extent cache is managed differently for sparse devices.
1997		 * There is a window between when the volume is mounted and when the
1998		 * device is marked as sparse, so the free extent cache for this
1999		 * volume was initialized as for a normal volume (sorted by block
2000		 * count).  Reset the cache so that it will be rebuilt again
2001		 * for a sparse device (sorted by start block).
2002		 */
2003		ResetVCBFreeExtCache(hfsmp);
2004
2005		(void)vnode_put(di_vp);
2006		file_drop(bsdata->backingfd);
2007		return (0);
2008	}
2009	case HFS_CLRBACKINGSTOREINFO: {
2010		struct vnode * tmpvp;
2011
2012		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2013		if (suser(cred, NULL) &&
2014			kauth_cred_getuid(cred) != vfsp->f_owner) {
2015			return (EACCES); /* must be owner of file system */
2016		}
2017		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2018			return (EROFS);
2019		}
2020
2021		if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
2022		    hfsmp->hfs_backingfs_rootvp) {
2023
2024			hfs_lock_mount(hfsmp);
2025			hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2026			tmpvp = hfsmp->hfs_backingfs_rootvp;
2027			hfsmp->hfs_backingfs_rootvp = NULLVP;
2028			hfsmp->hfs_sparsebandblks = 0;
2029			hfs_unlock_mount(hfsmp);
2030
2031			vnode_rele(tmpvp);
2032		}
2033		return (0);
2034	}
2035#endif /* HFS_SPARSE_DEV */
2036
2037	/* Change the next CNID stored in the VH */
2038	case HFS_CHANGE_NEXTCNID: {
2039		int error = 0;		/* Assume success */
2040		u_int32_t fileid;
2041		int wraparound = 0;
2042		int lockflags = 0;
2043
2044		if (vnode_vfsisrdonly(vp)) {
2045			return (EROFS);
2046		}
2047		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2048		if (suser(cred, NULL) &&
2049			kauth_cred_getuid(cred) != vfsp->f_owner) {
2050			return (EACCES); /* must be owner of file system */
2051		}
2052
2053		fileid = *(u_int32_t *)ap->a_data;
2054
2055		/* Must have catalog lock excl. to advance the CNID pointer */
2056		lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2057
2058		hfs_lock_mount(hfsmp);
2059
2060		/* If it is less than the current next CNID, force the wraparound bit to be set */
2061		if (fileid < hfsmp->vcbNxtCNID) {
2062			wraparound=1;
2063		}
2064
2065		/* Return previous value. */
2066		*(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2067
2068		hfsmp->vcbNxtCNID = fileid;
2069
2070		if (wraparound) {
2071			hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2072		}
2073
2074		MarkVCBDirty(hfsmp);
2075		hfs_unlock_mount(hfsmp);
2076		hfs_systemfile_unlock (hfsmp, lockflags);
2077
2078		return (error);
2079	}
2080
2081	case F_FREEZE_FS: {
2082		struct mount *mp;
2083
2084		mp = vnode_mount(vp);
2085		hfsmp = VFSTOHFS(mp);
2086
2087		if (!(hfsmp->jnl))
2088			return (ENOTSUP);
2089
2090		vfsp = vfs_statfs(mp);
2091
2092		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2093			!kauth_cred_issuser(cred))
2094			return (EACCES);
2095
2096		return hfs_freeze(hfsmp);
2097	}
2098
2099	case F_THAW_FS: {
2100		vfsp = vfs_statfs(vnode_mount(vp));
2101		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2102			!kauth_cred_issuser(cred))
2103			return (EACCES);
2104
2105		return hfs_thaw(hfsmp, current_proc());
2106	}
2107
2108	case HFS_BULKACCESS_FSCTL: {
2109	    int size;
2110
2111	    if (hfsmp->hfs_flags & HFS_STANDARD) {
2112		return EINVAL;
2113	    }
2114
2115	    if (is64bit) {
2116		size = sizeof(struct user64_access_t);
2117	    } else {
2118		size = sizeof(struct user32_access_t);
2119	    }
2120
2121	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
2122	}
2123
2124	case HFS_EXT_BULKACCESS_FSCTL: {
2125	    int size;
2126
2127	    if (hfsmp->hfs_flags & HFS_STANDARD) {
2128		return EINVAL;
2129	    }
2130
2131	    if (is64bit) {
2132		size = sizeof(struct user64_ext_access_t);
2133	    } else {
2134		size = sizeof(struct user32_ext_access_t);
2135	    }
2136
2137	    return do_bulk_access_check(hfsmp, vp, ap, size, context);
2138	}
2139
2140	case HFS_SET_XATTREXTENTS_STATE: {
2141		int state;
2142
2143		if (ap->a_data == NULL) {
2144			return (EINVAL);
2145		}
2146
2147		state = *(int *)ap->a_data;
2148
2149		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2150			return (EROFS);
2151		}
2152
2153		/* Super-user can enable or disable extent-based extended
2154		 * attribute support on a volume.
2155		 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2156		 * are enabled by default, so any change will only be transient
2157		 * until the volume is remounted.
2158		 */
2159		if (!kauth_cred_issuser(kauth_cred_get())) {
2160			return (EPERM);
2161		}
2162		if (state == 0 || state == 1)
2163			return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2164		else
2165			return (EINVAL);
2166	}
2167
2168	case F_SETSTATICCONTENT: {
2169		int error;
2170		int enable_static = 0;
2171		struct cnode *cp = NULL;
2172		/*
2173		 * lock the cnode, decorate the cnode flag, and bail out.
2174		 * VFS should have already authenticated the caller for us.
2175		 */
2176
2177		if (ap->a_data) {
2178			/*
2179			 * Note that even though ap->a_data is of type caddr_t,
2180			 * the fcntl layer at the syscall handler will pass in NULL
2181			 * or 1 depending on what the argument supplied to the fcntl
2182			 * was.  So it is in fact correct to check the ap->a_data
2183			 * argument for zero or non-zero value when deciding whether or not
2184			 * to enable the static bit in the cnode.
2185			 */
2186			enable_static = 1;
2187		}
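		/*
		 * Hypothetical userspace usage (sketch, not part of this file):
		 *   fcntl(fd, F_SETSTATICCONTENT, 1);   set the static-data hint
		 *   fcntl(fd, F_SETSTATICCONTENT, 0);   clear the static-data hint
		 */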
2188		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2189			return EROFS;
2190		}
2191		cp = VTOC(vp);
2192
2193		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2194		if (error == 0) {
2195			if (enable_static) {
2196				cp->c_flag |= C_SSD_STATIC;
2197			}
2198			else {
2199				cp->c_flag &= ~C_SSD_STATIC;
2200			}
2201			hfs_unlock (cp);
2202		}
2203		return error;
2204	}
2205
2206	case F_SET_GREEDY_MODE: {
2207		int error;
2208		int enable_greedy_mode = 0;
2209		struct cnode *cp = NULL;
2210		/*
2211		 * lock the cnode, decorate the cnode flag, and bail out.
2212		 * VFS should have already authenticated the caller for us.
2213		 */
2214
2215		if (ap->a_data) {
2216			/*
2217			 * Note that even though ap->a_data is of type caddr_t,
2218			 * the fcntl layer at the syscall handler will pass in NULL
2219			 * or 1 depending on what the argument supplied to the fcntl
2220			 * was.  So it is in fact correct to check the ap->a_data
2221			 * argument for zero or non-zero value when deciding whether or not
2222			 * to enable the greedy mode bit in the cnode.
2223			 */
2224			enable_greedy_mode = 1;
2225		}
2226		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2227			return EROFS;
2228		}
2229		cp = VTOC(vp);
2230
2231		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2232		if (error == 0) {
2233			if (enable_greedy_mode) {
2234				cp->c_flag |= C_SSD_GREEDY_MODE;
2235			}
2236			else {
2237				cp->c_flag &= ~C_SSD_GREEDY_MODE;
2238			}
2239			hfs_unlock (cp);
2240		}
2241		return error;
2242	}
2243
2244	case F_SETIOTYPE: {
2245		int error;
2246		uint32_t iotypeflag = 0;
2247
2248		struct cnode *cp = NULL;
2249		/*
2250		 * lock the cnode, decorate the cnode flag, and bail out.
2251		 * VFS should have already authenticated the caller for us.
2252		 */
2253
2254		if (ap->a_data == NULL) {
2255			return EINVAL;
2256		}
2257
2258		/*
2259		 * Note that even though ap->a_data is of type caddr_t, we
2260		 * can only use 32 bits of flag values.
2261		 */
2262		iotypeflag = (uint32_t) ap->a_data;
2263		switch (iotypeflag) {
2264			case F_IOTYPE_ISOCHRONOUS:
2265				break;
2266			default:
2267				return EINVAL;
2268		}
2269
2270
2271		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2272			return EROFS;
2273		}
2274		cp = VTOC(vp);
2275
2276		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2277		if (error == 0) {
2278			switch (iotypeflag) {
2279				case F_IOTYPE_ISOCHRONOUS:
2280					cp->c_flag |= C_IO_ISOCHRONOUS;
2281					break;
2282				default:
2283					break;
2284			}
2285			hfs_unlock (cp);
2286		}
2287		return error;
2288	}
2289
2290	case F_MAKECOMPRESSED: {
2291		int error = 0;
2292		uint32_t gen_counter;
2293		struct cnode *cp = NULL;
2294		int reset_decmp = 0;
2295
2296		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2297			return EROFS;
2298		}
2299
2300		/*
2301		 * acquire & lock the cnode.
2302		 * VFS should have already authenticated the caller for us.
2303		 */
2304
2305		if (ap->a_data) {
2306			/*
2307			 * Cast the pointer into a uint32_t so we can extract the
2308			 * supplied generation counter.
2309			 */
2310			gen_counter = *((uint32_t*)ap->a_data);
2311		}
2312		else {
2313			return EINVAL;
2314		}
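		/*
		 * Sketch of the expected caller flow (hypothetical, not from this
		 * file): a userland compressor samples the file's generation count,
		 * writes the compressed payload (e.g. into the decmpfs EA), and then
		 * issues this fcntl with the sampled count.  If the file was modified
		 * in between, the gen-count comparison below fails and ESTALE is
		 * returned, so the conversion is abandoned.
		 */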
2315
2316#if HFS_COMPRESSION
2317		cp = VTOC(vp);
2318		/* Grab truncate lock first; we may truncate the file */
2319		hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2320
2321		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2322		if (error) {
2323			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2324			return error;
2325		}
2326
2327		/* Are there any other usecounts/FDs? */
2328		if (vnode_isinuse(vp, 1)) {
2329			hfs_unlock(cp);
2330			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2331			return EBUSY;
2332		}
2333
2334		/* now we have the cnode locked down; Validate arguments */
2335		if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2336			/* EINVAL if you are trying to manipulate an IMMUTABLE file */
2337			hfs_unlock(cp);
2338			hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2339			return EINVAL;
2340		}
2341
2342		if ((hfs_get_gencount (cp)) == gen_counter) {
2343			/*
2344			 * OK, the gen_counter matched.  Go for it:
2345			 * Toggle state bits, truncate file, and suppress mtime update
2346			 */
2347			reset_decmp = 1;
2348			cp->c_bsdflags |= UF_COMPRESSED;
2349
2350			error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2351								 ap->a_context);
2352		}
2353		else {
2354			error = ESTALE;
2355		}
2356
2357		/* Unlock the cnode before calling into decmpfs; it may need to get an EA */
2358		hfs_unlock(cp);
2359
2360		/*
2361		 * Reset the decmp state while still holding the truncate lock. We need to
2362		 * serialize here against a listxattr on this node, which may occur at any
2363		 * time.
2364		 *
2365		 * Even if '0/skiplock' is passed as the 2nd argument to hfs_file_is_compressed,
2366		 * it may still need to get the com.apple.decmpfs EA. If the
2367		 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2368		 * generic (through VFS) and can't pass along any info telling it that we're already
2369		 * holding it (the lock). If we don't serialize, then we risk listxattr blocking
2370		 * while trying to fill in the hfs_file_is_compressed info during the callback
2371		 * operation, which would result in a deadlock against the b-tree node.
2372		 *
2373		 * So, to serialize against listxattr (which will grab buf_t meta references on
2374		 * the b-tree blocks), we hold the truncate lock while we're manipulating the
2375		 * decmpfs payload.
2376		 */
2377		if ((reset_decmp) && (error == 0)) {
2378			decmpfs_cnode *dp = VTOCMP (vp);
2379			if (dp != NULL) {
2380				decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2381			}
2382
2383			/* Initialize the decmpfs node as needed */
2384			(void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2385		}
2386
2387		hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2388
2389#endif
2390		return error;
2391	}
2392
2393	case F_SETBACKINGSTORE: {
2394
2395		int error = 0;
2396
2397		/*
2398		 * See comment in F_SETSTATICCONTENT re: using
2399		 * a null check for a_data
2400		 */
2401		if (ap->a_data) {
2402			error = hfs_set_backingstore (vp, 1);
2403		}
2404		else {
2405			error = hfs_set_backingstore (vp, 0);
2406		}
2407
2408		return error;
2409	}
2410
2411	case F_GETPATH_MTMINFO: {
2412		int error = 0;
2413
2414		int *data = (int*) ap->a_data;
2415
2416		/* Ask if this is a backingstore vnode */
2417		error = hfs_is_backingstore (vp, data);
2418
2419		return error;
2420	}
2421
2422	case F_FULLFSYNC: {
2423		int error;
2424
2425		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2426			return (EROFS);
2427		}
2428		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2429		if (error == 0) {
2430			error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
2431			hfs_unlock(VTOC(vp));
2432		}
2433
2434		return error;
2435	}
2436
2437	case F_CHKCLEAN: {
2438		register struct cnode *cp;
2439		int error;
2440
2441		if (!vnode_isreg(vp))
2442			return EINVAL;
2443
2444		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2445		if (error == 0) {
2446			cp = VTOC(vp);
2447			/*
2448			 * used by the regression test to determine if
2449			 * all the dirty pages (via write) have been cleaned
2450			 * after a call to 'fsync'.
2451			 */
2452			error = is_file_clean(vp, VTOF(vp)->ff_size);
2453			hfs_unlock(cp);
2454		}
2455		return (error);
2456	}
2457
2458	case F_RDADVISE: {
2459		register struct radvisory *ra;
2460		struct filefork *fp;
2461		int error;
2462
2463		if (!vnode_isreg(vp))
2464			return EINVAL;
2465
2466		ra = (struct radvisory *)(ap->a_data);
2467		fp = VTOF(vp);
2468
2469		/* Protect against a size change. */
2470		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2471
2472#if HFS_COMPRESSION
2473		if (compressed && (uncompressed_size == -1)) {
2474			/* fetching the uncompressed size failed above, so return the error */
2475			error = decmpfs_error;
2476		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2477				   (!compressed && (ra->ra_offset >= fp->ff_size))) {
2478			error = EFBIG;
2479		}
2480#else /* HFS_COMPRESSION */
2481		if (ra->ra_offset >= fp->ff_size) {
2482			error = EFBIG;
2483		}
2484#endif /* HFS_COMPRESSION */
2485		else {
2486			error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2487		}
2488
2489		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2490		return (error);
2491	}
2492
2493	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
2494	{
2495		if (is64bit) {
2496			*(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2497		}
2498		else {
2499			*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2500		}
2501		return 0;
2502	}
2503
2504	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2505	    *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2506	    break;
2507
2508	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2509	    *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2510	    break;
2511
2512	case HFS_FSCTL_GET_VERY_LOW_DISK:
2513	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2514	    break;
2515
2516	case HFS_FSCTL_SET_VERY_LOW_DISK:
2517	    if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2518		return EINVAL;
2519	    }
2520
2521	    hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2522	    break;
2523
2524	case HFS_FSCTL_GET_LOW_DISK:
2525	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2526	    break;
2527
2528	case HFS_FSCTL_SET_LOW_DISK:
2529	    if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2530		|| *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2531
2532		return EINVAL;
2533	    }
2534
2535	    hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2536	    break;
2537
2538	case HFS_FSCTL_GET_DESIRED_DISK:
2539	    *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2540	    break;
2541
2542	case HFS_FSCTL_SET_DESIRED_DISK:
2543	    if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2544		return EINVAL;
2545	    }
2546
2547	    hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2548	    break;
2549
2550	case HFS_VOLUME_STATUS:
2551	    *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2552	    break;
2553
2554	case HFS_SET_BOOT_INFO:
2555		if (!vnode_isvroot(vp))
2556			return(EINVAL);
2557		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2558			return(EACCES);	/* must be superuser or owner of filesystem */
2559		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2560			return (EROFS);
2561		}
2562		hfs_lock_mount (hfsmp);
2563		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2564		hfs_unlock_mount (hfsmp);
2565		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2566		break;
2567
2568	case HFS_GET_BOOT_INFO:
2569		if (!vnode_isvroot(vp))
2570			return(EINVAL);
2571		hfs_lock_mount (hfsmp);
2572		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2573		hfs_unlock_mount(hfsmp);
2574		break;
2575
2576	case HFS_MARK_BOOT_CORRUPT:
2577		/* Mark the boot volume corrupt by setting
2578		 * kHFSVolumeInconsistentBit in the volume header.  This will
2579		 * force fsck_hfs on next mount.
2580		 */
2581		if (!kauth_cred_issuser(kauth_cred_get())) {
2582			return EACCES;
2583		}
2584
2585		/* Allowed only on the root vnode of the boot volume */
2586		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2587		    !vnode_isvroot(vp)) {
2588			return EINVAL;
2589		}
2590		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2591			return (EROFS);
2592		}
2593		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2594		hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2595		break;
2596
2597	case HFS_FSCTL_GET_JOURNAL_INFO:
2598		jip = (struct hfs_journal_info*)ap->a_data;
2599
2600		if (vp == NULLVP)
2601			return EINVAL;
2602
2603		if (hfsmp->jnl == NULL) {
2604			jnl_start = 0;
2605			jnl_size  = 0;
2606		} else {
2607			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
2608			jnl_size  = (off_t)hfsmp->jnl_size;
2609		}
2610
2611		jip->jstart = jnl_start;
2612		jip->jsize = jnl_size;
2613		break;
2614
2615	case HFS_SET_ALWAYS_ZEROFILL: {
2616	    struct cnode *cp = VTOC(vp);
2617
2618	    if (*(int *)ap->a_data) {
2619		cp->c_flag |= C_ALWAYS_ZEROFILL;
2620	    } else {
2621		cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2622	    }
2623	    break;
2624	}
2625
2626	case HFS_DISABLE_METAZONE: {
2627		/* Only root can disable metadata zone */
2628		if (!kauth_cred_issuser(kauth_cred_get())) {
2629			return EACCES;
2630		}
2631		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2632			return (EROFS);
2633		}
2634
2635		/* Disable metadata zone now */
2636		(void) hfs_metadatazone_init(hfsmp, true);
2637		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2638		break;
2639	}
2640
2641
2642	case HFS_FSINFO_METADATA_BLOCKS: {
2643		int error;
2644		struct hfsinfo_metadata *hinfo;
2645
2646		hinfo = (struct hfsinfo_metadata *)ap->a_data;
2647
2648		/* Get information about number of metadata blocks */
2649		error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2650		if (error) {
2651			return error;
2652		}
2653
2654		break;
2655	}
2656
2657	case HFS_CS_FREESPACE_TRIM: {
2658		int error = 0;
2659		int lockflags = 0;
2660
2661		/* Only root allowed */
2662		if (!kauth_cred_issuser(kauth_cred_get())) {
2663			return EACCES;
2664		}
2665
2666		/*
2667		 * This core functionality is similar to hfs_scan_blocks().
2668		 * The main difference is that hfs_scan_blocks() is called
2669		 * as part of mount where we are assured that the journal is
2670		 * empty to start with.  This fcntl() can be called on a
2671		 * mounted volume, therefore it has to flush the content of
2672		 * the journal as well as ensure the state of summary table.
2673		 *
2674		 * This fcntl scans over the entire allocation bitmap,
2675		 * creates a list of all the free blocks, and issues TRIMs
2676		 * down to the underlying device.  This can take a long time,
2677		 * as it can generate up to 512MB of read I/O.
2678		 */
2679
2680		if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2681			error = hfs_init_summary(hfsmp);
2682			if (error) {
2683				printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2684				return error;
2685			}
2686		}
2687
2688		/*
2689		 * The journal maintains list of recently deallocated blocks to
2690		 * issue DKIOCUNMAPs when the corresponding journal transaction is
2691		 * flushed to the disk.  To avoid any race conditions, we only
2692		 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2693		 * Therefore we make sure that the journal trim list is sync'ed,
2694		 * empty, and not modifiable for the duration of our scan.
2695		 *
2696		 * Take the journal lock before flushing the journal to the disk.
2697		 * We keep holding the journal lock until we acquire the bitmap
2698		 * lock, to make sure that no new journal transactions can start.
2699		 * This ensures that the journal trim list is not modified after
2700		 * the journal flush and before we get the bitmap lock.  We can
2701		 * release the journal lock once we hold the bitmap lock, as the
2702		 * latter prevents any further block deallocations.
2703		 */
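		/*
		 * In short, the sequence implemented below is: take the journal
		 * lock, flush the journal, take the bitmap lock, drop the journal
		 * lock, invalidate the bitmap buffers, scan and unmap the free
		 * blocks, and finally drop the bitmap lock.
		 */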
2704		hfs_journal_lock(hfsmp);
2705
2706		/* Flush the journal and wait for all I/Os to finish up */
2707		error = hfs_journal_flush(hfsmp, TRUE);
2708		if (error) {
2709			hfs_journal_unlock(hfsmp);
2710			return error;
2711		}
2712
2713		/* Take bitmap lock to ensure it is not being modified */
2714		lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2715
2716		/* Release the journal lock */
2717		hfs_journal_unlock(hfsmp);
2718
2719		/*
2720		 * ScanUnmapBlocks reads the bitmap in large blocks
2721		 * (up to 1MB), unlike the runtime, which reads the bitmap
2722		 * in 4K blocks.  This can cause buf_t collisions
2723		 * and potential data corruption.  To avoid this, we
2724		 * invalidate all the existing buffers associated with
2725		 * the bitmap vnode before scanning it.
2726		 *
2727		 * Note: ScanUnmapBlocks() cleans up all the buffers
2728		 * after itself, so there won't be any large buffers left
2729		 * for us to clean up after it returns.
2730		 */
2731		error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2732		if (error) {
2733			hfs_systemfile_unlock(hfsmp, lockflags);
2734			return error;
2735		}
2736
2737		/* Traverse bitmap and issue DKIOCUNMAPs */
2738		error = ScanUnmapBlocks(hfsmp);
2739		hfs_systemfile_unlock(hfsmp, lockflags);
2740		if (error) {
2741			return error;
2742		}
2743
2744		break;
2745	}
2746
2747	default:
2748		return (ENOTTY);
2749	}
2750
2751	return 0;
2752}
2753
2754/*
2755 * select
2756 */
2757int
2758hfs_vnop_select(__unused struct vnop_select_args *ap)
2759/*
2760	struct vnop_select_args {
2761		vnode_t a_vp;
2762		int  a_which;
2763		int  a_fflags;
2764		void *a_wql;
2765		vfs_context_t a_context;
2766	};
2767*/
2768{
2769	/*
2770	 * We should really check to see if I/O is possible.
2771	 */
2772	return (1);
2773}
2774
2775/*
2776 * Converts a logical block number to a physical block, and optionally returns
2777 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2778 * The physical block number is based on the device block size, which is currently 512.
2779 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
2780 */
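/*
 * Illustrative example (assuming a 4 KB logical block size): if MapFileBlockC
 * reports 32 KB contiguously available, *runp is set to
 * 32768 / 4096 - 1 = 7 remaining logical blocks.
 */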
2781int
2782hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2783{
2784	struct filefork *fp = VTOF(vp);
2785	struct hfsmount *hfsmp = VTOHFS(vp);
2786	int  retval = E_NONE;
2787	u_int32_t  logBlockSize;
2788	size_t  bytesContAvail = 0;
2789	off_t  blockposition;
2790	int lockExtBtree;
2791	int lockflags = 0;
2792
2793	/*
2794	 * Check for underlying vnode requests and ensure that logical
2795	 * to physical mapping is requested.
2796	 */
2797	if (vpp != NULL)
2798		*vpp = hfsmp->hfs_devvp;
2799	if (bnp == NULL)
2800		return (0);
2801
2802	logBlockSize = GetLogicalBlockSize(vp);
2803	blockposition = (off_t)bn * logBlockSize;
2804
2805	lockExtBtree = overflow_extents(fp);
2806
2807	if (lockExtBtree)
2808		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2809
2810	retval = MacToVFSError(
2811                            MapFileBlockC (HFSTOVCB(hfsmp),
2812                                            (FCB*)fp,
2813                                            MAXPHYSIO,
2814                                            blockposition,
2815                                            bnp,
2816                                            &bytesContAvail));
2817
2818	if (lockExtBtree)
2819		hfs_systemfile_unlock(hfsmp, lockflags);
2820
2821	if (retval == E_NONE) {
2822		/* Figure out how many read ahead blocks there are */
2823		if (runp != NULL) {
2824			if (can_cluster(logBlockSize)) {
2825				/* Make sure this result never goes negative: */
2826				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2827			} else {
2828				*runp = 0;
2829			}
2830		}
2831	}
2832	return (retval);
2833}
2834
2835/*
2836 * Convert logical block number to file offset.
2837 */
2838int
2839hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2840/*
2841	struct vnop_blktooff_args {
2842		vnode_t a_vp;
2843		daddr64_t a_lblkno;
2844		off_t *a_offset;
2845	};
2846*/
2847{
2848	if (ap->a_vp == NULL)
2849		return (EINVAL);
2850	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
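	/* Example (illustrative): with a 4 KB logical block size, lblkno 3 maps to offset 3 * 4096 = 12288. */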
2851
2852	return(0);
2853}
2854
2855/*
2856 * Convert file offset to logical block number.
2857 */
2858int
2859hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2860/*
2861	struct vnop_offtoblk_args {
2862		vnode_t a_vp;
2863		off_t a_offset;
2864		daddr64_t *a_lblkno;
2865	};
2866*/
2867{
2868	if (ap->a_vp == NULL)
2869		return (EINVAL);
2870	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2871
2872	return(0);
2873}
2874
2875/*
2876 * Map file offset to physical block number.
2877 *
2878 * If this function is called for write operation, and if the file
2879 * had virtual blocks allocated (delayed allocation), real blocks
2880 * are allocated by calling ExtendFileC().
2881 *
2882 * If this function is called for read operation, and if the file
2883 * had virtual blocks allocated (delayed allocation), no change
2884 * to the size of file is done, and if required, rangelist is
2885 * searched for mapping.
2886 *
2887 * System file cnodes are expected to be locked (shared or exclusive).
2888 */
2889int
2890hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2891/*
2892	struct vnop_blockmap_args {
2893		vnode_t a_vp;
2894		off_t a_foffset;
2895		size_t a_size;
2896		daddr64_t *a_bpn;
2897		size_t *a_run;
2898		void *a_poff;
2899		int a_flags;
2900		vfs_context_t a_context;
2901	};
2902*/
2903{
2904	struct vnode *vp = ap->a_vp;
2905	struct cnode *cp;
2906	struct filefork *fp;
2907	struct hfsmount *hfsmp;
2908	size_t bytesContAvail = 0;
2909	int retval = E_NONE;
2910	int syslocks = 0;
2911	int lockflags = 0;
2912	struct rl_entry *invalid_range;
2913	enum rl_overlaptype overlaptype;
2914	int started_tr = 0;
2915	int tooklock = 0;
2916
2917#if HFS_COMPRESSION
2918	if (VNODE_IS_RSRC(vp)) {
2919		/* allow blockmaps to the resource fork */
2920	} else {
2921		if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
2922			int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
2923			switch(state) {
2924				case FILE_IS_COMPRESSED:
2925					return ENOTSUP;
2926				case FILE_IS_CONVERTING:
2927					/* if FILE_IS_CONVERTING, we allow blockmap */
2928					break;
2929				default:
2930					printf("invalid state %d for compressed file\n", state);
2931					/* fall through */
2932			}
2933		}
2934	}
2935#endif /* HFS_COMPRESSION */
2936
2937	/* Do not allow blockmap operation on a directory */
2938	if (vnode_isdir(vp)) {
2939		return (ENOTSUP);
2940	}
2941
2942	/*
2943	 * Check for underlying vnode requests and ensure that logical
2944	 * to physical mapping is requested.
2945	 */
2946	if (ap->a_bpn == NULL)
2947		return (0);
2948
2949	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2950		if (VTOC(vp)->c_lockowner != current_thread()) {
2951			hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
2952			tooklock = 1;
2953		}
2954	}
2955	hfsmp = VTOHFS(vp);
2956	cp = VTOC(vp);
2957	fp = VTOF(vp);
2958
2959retry:
2960	/* Check virtual blocks only when performing write operation */
2961	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2962		if (hfs_start_transaction(hfsmp) != 0) {
2963			retval = EINVAL;
2964			goto exit;
2965		} else {
2966			started_tr = 1;
2967		}
2968		syslocks = SFL_EXTENTS | SFL_BITMAP;
2969
2970	} else if (overflow_extents(fp)) {
2971		syslocks = SFL_EXTENTS;
2972	}
2973
2974	if (syslocks)
2975		lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2976
2977	/*
2978	 * Check for any delayed allocations.
2979	 */
2980	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2981		int64_t actbytes;
2982		u_int32_t loanedBlocks;
2983
2984		//
2985		// Make sure we have a transaction.  It's possible
2986		// that we came in and fp->ff_unallocblocks was zero
2987		// but during the time we blocked acquiring the extents
2988		// btree, ff_unallocblocks became non-zero and so we
2989		// will need to start a transaction.
2990		//
2991		if (started_tr == 0) {
2992			if (syslocks) {
2993				hfs_systemfile_unlock(hfsmp, lockflags);
2994				syslocks = 0;
2995			}
2996			goto retry;
2997		}
2998
2999		/*
3000		 * Note: ExtendFileC will release any blocks on loan and
3001		 * acquire real blocks.  So we ask to extend by zero bytes
3002		 * since ExtendFileC will account for the virtual blocks.
3003		 */
3004
3005		loanedBlocks = fp->ff_unallocblocks;
3006		retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3007				     kEFAllMask | kEFNoClumpMask, &actbytes);
3008
3009		if (retval) {
3010			fp->ff_unallocblocks = loanedBlocks;
3011			cp->c_blocks += loanedBlocks;
3012			fp->ff_blocks += loanedBlocks;
3013
3014			hfs_lock_mount (hfsmp);
3015			hfsmp->loanedBlocks += loanedBlocks;
3016			hfs_unlock_mount (hfsmp);
3017
3018			hfs_systemfile_unlock(hfsmp, lockflags);
3019			cp->c_flag |= C_MODIFIED;
3020			if (started_tr) {
3021				(void) hfs_update(vp, TRUE);
3022				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3023
3024				hfs_end_transaction(hfsmp);
3025				started_tr = 0;
3026			}
3027			goto exit;
3028		}
3029	}
3030
3031	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
3032	                       ap->a_bpn, &bytesContAvail);
3033	if (syslocks) {
3034		hfs_systemfile_unlock(hfsmp, lockflags);
3035		syslocks = 0;
3036	}
3037
3038	if (started_tr) {
3039		(void) hfs_update(vp, TRUE);
3040		(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3041		hfs_end_transaction(hfsmp);
3042		started_tr = 0;
3043	}
3044	if (retval) {
3045		/* On write, always return the error because virtual blocks, if any,
3046		 * should have been allocated in ExtendFileC().  We do not
3047		 * allocate virtual blocks on read, so return the error
3048		 * only if no virtual blocks are allocated.  Otherwise we search
3049		 * the rangelist for zero-fills.
3050		 */
3051		if ((MacToVFSError(retval) != ERANGE) ||
3052		    (ap->a_flags & VNODE_WRITE) ||
3053		    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3054			goto exit;
3055		}
3056
3057		/* Validate if the start offset is within logical file size */
3058		if (ap->a_foffset >= fp->ff_size) {
3059			goto exit;
3060		}
3061
3062		/*
3063		 * At this point, we have encountered a failure during
3064		 * MapFileBlockC that resulted in ERANGE, and we are not servicing
3065		 * a write, and there are borrowed blocks.
3066		 *
3067		 * However, the cluster layer will not call blockmap for
3068		 * blocks that are borrowed and in-cache.  We have to assume that
3069		 * because we observed ERANGE being emitted from MapFileBlockC, this
3070		 * extent range is not valid on-disk.  So we treat this as a
3071		 * mapping that needs to be zero-filled prior to reading.
3072		 *
3073		 * Note that under certain circumstances (such as non-contiguous
3074		 * userland VM mappings in the calling process), cluster_io
3075		 * may be forced to split a large I/O driven by hfs_vnop_write
3076		 * into multiple sub-I/Os that necessitate a RMW cycle.  If this is
3077		 * the case here, then we have already removed the invalid range list
3078		 * mapping prior to getting to this blockmap call, so we should not
3079		 * search the invalid rangelist for this byte range.
3080		 */
3081
3082		bytesContAvail = fp->ff_size - ap->a_foffset;
3083		/*
3084		 * Clip the contiguous available bytes to, at most, the allowable
3085		 * maximum or the amount requested.
3086		 */
3087
3088		if (bytesContAvail > ap->a_size) {
3089			bytesContAvail = ap->a_size;
3090		}
3091
3092		*ap->a_bpn = (daddr64_t) -1;
3093		retval = 0;
3094
3095		goto exit;
3096	}
3097
3098	/* MapFileBlockC() found a valid extent in the filefork.  Search the
3099	 * mapping information further for invalid file ranges.
3100	 */
3101	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3102	                      ap->a_foffset + (off_t)bytesContAvail - 1,
3103	                      &invalid_range);
3104	if (overlaptype != RL_NOOVERLAP) {
3105		switch(overlaptype) {
3106		case RL_MATCHINGOVERLAP:
3107		case RL_OVERLAPCONTAINSRANGE:
3108		case RL_OVERLAPSTARTSBEFORE:
3109			/* There's no valid block for this byte offset */
3110			*ap->a_bpn = (daddr64_t)-1;
3111			/* There's no point limiting the amount to be returned
3112			 * if the invalid range that was hit extends all the way
3113			 * to the EOF (i.e. there's no valid bytes between the
3114			 * end of this range and the file's EOF):
3115			 */
3116			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3117			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3118				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3119			}
3120			break;
3121
3122		case RL_OVERLAPISCONTAINED:
3123		case RL_OVERLAPENDSAFTER:
3124			/* The range of interest hits an invalid block before the end: */
3125			if (invalid_range->rl_start == ap->a_foffset) {
3126				/* There's actually no valid information to be had starting here: */
3127				*ap->a_bpn = (daddr64_t)-1;
3128				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3129				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3130					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3131				}
3132			} else {
3133				bytesContAvail = invalid_range->rl_start - ap->a_foffset;
3134			}
3135			break;
3136
3137		case RL_NOOVERLAP:
3138			break;
3139		} /* end switch */
3140		if (bytesContAvail > ap->a_size)
3141			bytesContAvail = ap->a_size;
3142	}
3143
3144exit:
3145	if (retval == 0) {
3146		if (ap->a_run)
3147			*ap->a_run = bytesContAvail;
3148
3149		if (ap->a_poff)
3150			*(int *)ap->a_poff = 0;
3151	}
3152
3153	if (tooklock)
3154		hfs_unlock(cp);
3155
3156	return (MacToVFSError(retval));
3157}
3158
3159/*
3160 * Prepare and issue the I/O.
3161 * buf_strategy knows how to deal
3162 * with requests that require
3163 * fragmented I/Os.
3164 */
3165int
3166hfs_vnop_strategy(struct vnop_strategy_args *ap)
3167{
3168	buf_t	bp = ap->a_bp;
3169	vnode_t	vp = buf_vnode(bp);
3170	int error = 0;
3171
3172	/* Mark buffer as containing static data if cnode flag set */
3173	if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3174		buf_markstatic(bp);
3175	}
3176
3177	/* Mark buffer as containing greedy-mode data if cnode flag set */
3178	if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3179		bufattr_markgreedymode(&bp->b_attr);
3180	}
3181
3182	/* mark buffer as containing burst mode data if cnode flag set */
3183	if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3184		bufattr_markisochronous(&bp->b_attr);
3185	}
3186
3187#if CONFIG_PROTECT
3188	cnode_t *cp = NULL;
3189
3190	if ((!bufattr_rawencrypted(&bp->b_attr)) &&
3191			((cp = cp_get_protected_cnode(vp)) != NULL)) {
3192		/*
3193		 * We rely upon the truncate lock to protect the
3194		 * CP cache key from getting tossed prior to our IO finishing here.
3195		 * Nearly all cluster io calls to manipulate file payload from HFS
3196		 * take the truncate lock before calling into the cluster
3197		 * layer to ensure the file size does not change, or that they
3198		 * have exclusive right to change the EOF of the file.
3199		 * That same guarantee protects us here since the code that
3200		 * deals with CP lock events must now take the truncate lock
3201		 * before doing anything.
3202		 *
3203		 * There is one exception here:
3204		 * 1) VM swapfile I/O, because HFS will
3205		 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
3206		 * swapfile code only, without holding the truncate lock.  This is because
3207		 * individual swapfiles are maintained at fixed-length sizes by the VM code.
3208		 * In non-swapfile I/O we use PAGEOUT_V2 semantics, which allow us to
3209		 * create our own UPL and thus take the truncate lock before calling
3210		 * into the cluster layer.  In the swapfile case, however, we are not concerned
3211		 * with the CP blob being wiped out in the middle of the I/O
3212		 * because there isn't anything to toss; the VM swapfile key stays
3213		 * in-core as long as the file is open.
3214		 */
3215
3216
3217		/*
3218		 * Last chance: If this data protected I/O does not have unwrapped keys
3219		 * present, then try to get them.  We already know that it should, by this point.
3220		 */
3221		if (cp->c_cpentry->cp_flags & (CP_KEY_FLUSHED | CP_NEEDS_KEYS)) {
3222			int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS);
3223			if ((error = cp_handle_vnop(vp, io_op, 0)) != 0) {
3224				/*
3225				 * We have to be careful here.  By this point in the I/O path, VM or the cluster
3226				 * engine has prepared a buf_t with the proper file offsets and all the rest,
3227				 * so simply erroring out will result in us leaking this particular buf_t.
3228				 * We need to properly decorate the buf_t just as buf_strategy would so as
3229				 * to make it appear that the I/O errored out with the particular error code.
3230				 */
3231				buf_seterror (bp, error);
3232				buf_biodone(bp);
3233				return error;
3234			}
3235		}
3236
3237		/*
3238		 * NB:
3239		 * For filesystem resize, we may not have access to the underlying
3240		 * file's cache key for whatever reason (device may be locked).  However,
3241		 * we do not need it since we are going to use the temporary HFS-wide resize key
3242		 * which is generated once we start relocating file content.  If this file's I/O
3243		 * should be done using the resize key, it will have been supplied already, so
3244		 * do not attach the file's cp blob to the buffer.
3245		 */
3246		if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
3247			buf_setcpaddr(bp, cp->c_cpentry);
3248		}
3249	}
3250#endif /* CONFIG_PROTECT */
3251
3252	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3253
3254	return error;
3255}
3256
3257static int
3258hfs_minorupdate(struct vnode *vp) {
3259	struct cnode *cp = VTOC(vp);
3260	cp->c_flag &= ~C_MODIFIED;
3261	cp->c_touch_acctime = 0;
3262	cp->c_touch_chgtime = 0;
3263	cp->c_touch_modtime = 0;
3264
3265	return 0;
3266}
3267
3268int
3269do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3270{
3271	register struct cnode *cp = VTOC(vp);
3272	struct filefork *fp = VTOF(vp);
3273	kauth_cred_t cred = vfs_context_ucred(context);
3274	int retval;
3275	off_t bytesToAdd;
3276	off_t actualBytesAdded;
3277	off_t filebytes;
3278	u_int32_t fileblocks;
3279	int blksize;
3280	struct hfsmount *hfsmp;
3281	int lockflags;
3282	int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
3283	int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3284
3285	blksize = VTOVCB(vp)->blockSize;
3286	fileblocks = fp->ff_blocks;
3287	filebytes = (off_t)fileblocks * (off_t)blksize;
3288
3289	KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3290		 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3291
3292	if (length < 0)
3293		return (EINVAL);
3294
3295	/* This should only happen with a corrupt filesystem */
3296	if ((off_t)fp->ff_size < 0)
3297		return (EINVAL);
3298
3299	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3300		return (EFBIG);
3301
3302	hfsmp = VTOHFS(vp);
3303
3304	retval = E_NONE;
3305
3306	/* Files that are changing size are not hot file candidates. */
3307	if (hfsmp->hfc_stage == HFC_RECORDING) {
3308		fp->ff_bytesread = 0;
3309	}
3310
3311	/*
3312	 * We cannot just check if fp->ff_size == length (as an optimization)
3313	 * since there may be extra physical blocks that also need truncation.
3314	 */
3315#if QUOTA
3316	if ((retval = hfs_getinoquota(cp)))
3317		return(retval);
3318#endif /* QUOTA */
3319
3320	/*
3321	 * Lengthen the size of the file. We must ensure that the
3322	 * last byte of the file is allocated. Since the smallest
3323	 * value of ff_size is 0, length will be at least 1.
3324	 */
3325	if (length > (off_t)fp->ff_size) {
3326#if QUOTA
3327		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3328				   cred, 0);
3329		if (retval)
3330			goto Err_Exit;
3331#endif /* QUOTA */
3332		/*
3333		 * If we don't have enough physical space then
3334		 * we need to extend the physical size.
3335		 */
3336		if (length > filebytes) {
3337			int eflags;
3338			u_int32_t blockHint = 0;
3339
3340			/* All or nothing and don't round up to clumpsize. */
3341			eflags = kEFAllMask | kEFNoClumpMask;
3342
3343			if (cred && (suser(cred, NULL) != 0)) {
3344				eflags |= kEFReserveMask;  /* keep a reserve */
3345			}
3346
3347			/*
3348			 * Allocate Journal and Quota files in metadata zone.
3349			 */
3350			if (filebytes == 0 &&
3351			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3352			    hfs_virtualmetafile(cp)) {
3353				eflags |= kEFMetadataMask;
3354				blockHint = hfsmp->hfs_metazone_start;
3355			}
3356			if (hfs_start_transaction(hfsmp) != 0) {
3357			    retval = EINVAL;
3358			    goto Err_Exit;
3359			}
3360
3361			/* Protect extents b-tree and allocation bitmap */
3362			lockflags = SFL_BITMAP;
3363			if (overflow_extents(fp))
3364				lockflags |= SFL_EXTENTS;
3365			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3366
3367			/*
3368			 * Keep growing the file as long as the current EOF is
3369			 * less than the desired value.
3370			 */
3371			while ((length > filebytes) && (retval == E_NONE)) {
3372				bytesToAdd = length - filebytes;
3373				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3374                                                    (FCB*)fp,
3375                                                    bytesToAdd,
3376                                                    blockHint,
3377                                                    eflags,
3378                                                    &actualBytesAdded));
3379
3380				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3381				if (actualBytesAdded == 0 && retval == E_NONE) {
3382					if (length > filebytes)
3383						length = filebytes;
3384					break;
3385				}
3386			} /* endwhile */
3387
3388			hfs_systemfile_unlock(hfsmp, lockflags);
3389
3390			if (hfsmp->jnl) {
3391				if (skipupdate) {
3392					(void) hfs_minorupdate(vp);
3393				}
3394				else {
3395					(void) hfs_update(vp, TRUE);
3396					(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3397				}
3398			}
3399
3400			hfs_end_transaction(hfsmp);
3401
3402			if (retval)
3403				goto Err_Exit;
3404
3405			KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3406				(int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3407		}
3408
3409		if (ISSET(flags, IO_NOZEROFILL)) {
3410			// An optimisation for the hibernation file
3411			if (vnode_isswap(vp))
3412				rl_remove_all(&fp->ff_invalidranges);
3413		} else {
3414			if (UBCINFOEXISTS(vp)  && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3415				struct rl_entry *invalid_range;
3416				off_t zero_limit;
3417
3418				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
3419				if (length < zero_limit) zero_limit = length;
3420
3421				if (length > (off_t)fp->ff_size) {
3422					struct timeval tv;
3423
3424		   			/* Extending the file: time to fill out the current last page w. zeroes? */
3425		   			if ((fp->ff_size & PAGE_MASK_64) &&
3426					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
3427					    fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
3428
3429						/* There's some valid data at the start of the (current) last page
3430						   of the file, so zero out the remainder of that page to ensure the
3431						   entire page contains valid data.  Since there is no invalid range
3432						   possible past the (current) eof, there's no need to remove anything
3433						   from the invalid range list before calling cluster_write():	*/
3434						hfs_unlock(cp);
3435						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
3436								fp->ff_size, (off_t)0,
3437								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
3438						hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3439						if (retval) goto Err_Exit;
3440
3441						/* Merely invalidate the remaining area, if necessary: */
3442						if (length > zero_limit) {
3443							microuptime(&tv);
3444							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
3445							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3446						}
3447		   			} else {
3448					/* The page containing the (current) eof is invalid: just add the
3449					   remainder of the page to the invalid list, along with the area
3450					   being newly allocated:
3451					 */
3452					microuptime(&tv);
3453					rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3454					cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3455					}
3456				}
3457			} else {
3458				panic("hfs_truncate: invoked on non-UBC object?!");
3459			}
3460		}
3461		if (suppress_times == 0) {
3462			cp->c_touch_modtime = TRUE;
3463		}
3464		fp->ff_size = length;
3465
3466	} else { /* Shorten the size of the file */
3467
3468		// An optimisation for the hibernation file
3469		if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3470			rl_remove_all(&fp->ff_invalidranges);
3471		} else if ((off_t)fp->ff_size > length) {
3472			/* Any space previously marked as invalid is now irrelevant: */
3473			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3474		}
3475
3476		/*
3477		 * Account for any unmapped blocks. Note that the new
3478		 * file length can still end up with unmapped blocks.
3479		 */
3480		if (fp->ff_unallocblocks > 0) {
3481			u_int32_t finalblks;
3482			u_int32_t loanedBlocks;
3483
3484			hfs_lock_mount(hfsmp);
3485			loanedBlocks = fp->ff_unallocblocks;
3486			cp->c_blocks -= loanedBlocks;
3487			fp->ff_blocks -= loanedBlocks;
3488			fp->ff_unallocblocks = 0;
3489
3490			hfsmp->loanedBlocks -= loanedBlocks;
3491
3492			finalblks = (length + blksize - 1) / blksize;
3493			if (finalblks > fp->ff_blocks) {
3494				/* calculate required unmapped blocks */
3495				loanedBlocks = finalblks - fp->ff_blocks;
3496				hfsmp->loanedBlocks += loanedBlocks;
3497
3498				fp->ff_unallocblocks = loanedBlocks;
3499				cp->c_blocks += loanedBlocks;
3500				fp->ff_blocks += loanedBlocks;
3501			}
3502			hfs_unlock_mount (hfsmp);
3503		}
3504
3505#if QUOTA
3506		off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3507#endif /* QUOTA */
3508		if (hfs_start_transaction(hfsmp) != 0) {
3509			retval = EINVAL;
3510			goto Err_Exit;
3511		}
3512
3513		if (fp->ff_unallocblocks == 0) {
3514			/* Protect extents b-tree and allocation bitmap */
3515			lockflags = SFL_BITMAP;
3516			if (overflow_extents(fp))
3517				lockflags |= SFL_EXTENTS;
3518			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3519
3520			retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3521												 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3522
3523			hfs_systemfile_unlock(hfsmp, lockflags);
3524		}
3525		if (hfsmp->jnl) {
3526			if (retval == 0) {
3527				fp->ff_size = length;
3528			}
3529			if (skipupdate) {
3530				(void) hfs_minorupdate(vp);
3531			}
3532			else {
3533				(void) hfs_update(vp, TRUE);
3534				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3535			}
3536		}
3537		hfs_end_transaction(hfsmp);
3538
3539		filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3540		if (retval)
3541			goto Err_Exit;
3542#if QUOTA
3543		/* These are bytesreleased */
3544		(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3545#endif /* QUOTA */
3546
3547		/*
3548		 * Only set update flag if the logical length changes & we aren't
3549		 * suppressing modtime updates.
3550		 */
3551		if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3552			cp->c_touch_modtime = TRUE;
3553		}
3554		fp->ff_size = length;
3555	}
3556	if (cp->c_mode & (S_ISUID | S_ISGID)) {
3557		if (!vfs_context_issuser(context)) {
3558			cp->c_mode &= ~(S_ISUID | S_ISGID);
3559			skipupdate = 0;
3560		}
3561	}
3562	if (skipupdate) {
3563		retval = hfs_minorupdate(vp);
3564	}
3565	else {
3566		cp->c_touch_chgtime = TRUE;	/* status changed */
3567		if (suppress_times == 0) {
3568			cp->c_touch_modtime = TRUE;	/* file data was modified */
3569
3570			/*
3571			 * If we are not suppressing the modtime update, then
3572			 * update the gen count as well.
3573			 */
3574			if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3575				hfs_incr_gencount(cp);
3576			}
3577		}
3578
3579		retval = hfs_update(vp, MNT_WAIT);
3580	}
3581	if (retval) {
3582		KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3583		     -1, -1, -1, retval, 0);
3584	}
3585
3586Err_Exit:
3587
3588	KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3589		 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3590
3591	return (retval);
3592}
3593
3594/*
3595 * Preparation which must be done prior to deleting the catalog record
3596 * of a file or directory.  In order to keep the on-disk structures as safe as possible,
3597 * we remove the catalog entry before releasing the bitmap blocks and the
3598 * overflow extent records.  However, some work must be done prior to deleting
3599 * the catalog record.
3600 *
3601 * When calling this function, the cnode must exist both in memory and on-disk.
3602 * If there are both resource fork and data fork vnodes, this function should
3603 * be called on both.
3604 */
3605
3606int
3607hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3608
3609	struct filefork *fp = VTOF(vp);
3610	struct cnode *cp = VTOC(vp);
3611#if QUOTA
3612	int retval = 0;
3613#endif /* QUOTA */
3614
3615	/* Cannot truncate an HFS directory! */
3616	if (vnode_isdir(vp)) {
3617		return (EISDIR);
3618	}
3619
3620	/*
3621	 * See the comment below in hfs_truncate for why we need to call
3622	 * setsize here.  Essentially we want to avoid pending IO if we
3623	 * already know that the blocks are going to be released here.
3624 * This function is only called when removing all of a file's storage, so
3625 * we can take a shortcut and call ubc_setsize(vp, 0) immediately.
3626	 */
3627	ubc_setsize(vp, 0);
3628
3629	/* This should only happen with a corrupt filesystem */
3630	if ((off_t)fp->ff_size < 0)
3631		return (EINVAL);
3632
3633	/*
3634	 * We cannot just check if fp->ff_size == length (as an optimization)
3635	 * since there may be extra physical blocks that also need truncation.
3636	 */
3637#if QUOTA
3638	if ((retval = hfs_getinoquota(cp))) {
3639		return(retval);
3640	}
3641#endif /* QUOTA */
3642
3643	/* Wipe out any invalid ranges which have yet to be backed by disk */
3644	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3645
3646	/*
3647	 * Account for any unmapped blocks. Since we're deleting the
3648	 * entire file, we don't have to worry about just shrinking
3649	 * to a smaller number of borrowed blocks.
3650	 */
3651	if (fp->ff_unallocblocks > 0) {
3652		u_int32_t loanedBlocks;
3653
3654		hfs_lock_mount (hfsmp);
3655		loanedBlocks = fp->ff_unallocblocks;
3656		cp->c_blocks -= loanedBlocks;
3657		fp->ff_blocks -= loanedBlocks;
3658		fp->ff_unallocblocks = 0;
3659
3660		hfsmp->loanedBlocks -= loanedBlocks;
3661
3662		hfs_unlock_mount (hfsmp);
3663	}
3664
3665	return 0;
3666}
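/*
 * Illustrative sketch (not part of the original source): per the header
 * comment above, a caller that is about to delete a file with both fork
 * vnodes in memory would invoke this on each vnode before removing the
 * catalog record, roughly:
 *
 *	error = hfs_prepare_release_storage(hfsmp, data_vp);
 *	if (error == 0 && rsrc_vp != NULL)
 *		error = hfs_prepare_release_storage(hfsmp, rsrc_vp);
 *	// ...then remove the catalog entry, and finally call
 *	// hfs_release_storage() with saved copies of the fileforks.
 *
 * The names data_vp and rsrc_vp are hypothetical.
 */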
3667
3668
3669/*
3670 * Special wrapper around calling TruncateFileC.  This function is usable
3671 * even when the catalog record does not exist any longer, making it ideal
3672 * for use when deleting a file.  The simplification here is that we know
3673 * that we are releasing all blocks.
3674 *
3675 * Note that this function may be called when there is no vnode backing
3676 * the file fork in question.  We may call this from hfs_vnop_inactive
3677 * to clear out resource fork data (and may not want to clear out the data
3678 * fork yet).  As a result, we pointer-check both sets of inputs before
3679 * doing anything with them.
3680 *
3681 * The caller is responsible for saving off a copy of the filefork(s)
3682 * embedded within the cnode prior to calling this function.  The pointers
3683 * supplied as arguments must be valid even if the cnode is no longer valid.
3684 */
3685
3686int
3687hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3688					 struct filefork *rsrcfork, u_int32_t fileid) {
3689
3690	off_t filebytes;
3691	u_int32_t fileblocks;
3692	int blksize = 0;
3693	int error = 0;
3694	int lockflags;
3695
3696	blksize = hfsmp->blockSize;
3697
3698	/* Data Fork */
3699	if (datafork) {
3700		datafork->ff_size = 0;
3701
3702		fileblocks = datafork->ff_blocks;
3703		filebytes = (off_t)fileblocks * (off_t)blksize;
3704
3705		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3706
3707		while (filebytes > 0) {
3708			if (filebytes > HFS_BIGFILE_SIZE) {
3709				filebytes -= HFS_BIGFILE_SIZE;
3710			} else {
3711				filebytes = 0;
3712			}
3713
3714			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
3715			if (hfs_start_transaction(hfsmp) != 0) {
3716				error = EINVAL;
3717				break;
3718			}
3719
3720			if (datafork->ff_unallocblocks == 0) {
3721				/* Protect extents b-tree and allocation bitmap */
3722				lockflags = SFL_BITMAP;
3723				if (overflow_extents(datafork))
3724					lockflags |= SFL_EXTENTS;
3725				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3726
3727				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3728
3729				hfs_systemfile_unlock(hfsmp, lockflags);
3730			}
3731			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3732
3733			/* Finish the transaction and start over if necessary */
3734			hfs_end_transaction(hfsmp);
3735
3736			if (error) {
3737				break;
3738			}
3739		}
3740	}
3741
3742	/* Resource fork */
3743	if (error == 0 && rsrcfork) {
3744		rsrcfork->ff_size = 0;
3745
3746		fileblocks = rsrcfork->ff_blocks;
3747		filebytes = (off_t)fileblocks * (off_t)blksize;
3748
3749		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3750
3751		while (filebytes > 0) {
3752			if (filebytes > HFS_BIGFILE_SIZE) {
3753				filebytes -= HFS_BIGFILE_SIZE;
3754			} else {
3755				filebytes = 0;
3756			}
3757
3758			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
3759			if (hfs_start_transaction(hfsmp) != 0) {
3760				error = EINVAL;
3761				break;
3762			}
3763
3764			if (rsrcfork->ff_unallocblocks == 0) {
3765				/* Protect extents b-tree and allocation bitmap */
3766				lockflags = SFL_BITMAP;
3767				if (overflow_extents(rsrcfork))
3768					lockflags |= SFL_EXTENTS;
3769				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3770
3771				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3772
3773				hfs_systemfile_unlock(hfsmp, lockflags);
3774			}
3775			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3776
3777			/* Finish the transaction and start over if necessary */
3778			hfs_end_transaction(hfsmp);
3779
3780			if (error) {
3781				break;
3782			}
3783		}
3784	}
3785
3786	return error;
3787}
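/*
 * Illustrative sketch (not part of the original source): per the comment
 * above, the caller keeps copies of the fileforks that remain valid after
 * the cnode is gone and releases the blocks using only those copies plus
 * the file ID, roughly:
 *
 *	struct filefork dfork_copy = *datafork_ptr;	// hypothetical locals
 *	struct filefork rfork_copy = *rsrcfork_ptr;
 *
 *	error = hfs_release_storage(hfsmp, &dfork_copy, &rfork_copy, fileid);
 *
 * Either fork argument may be NULL when that fork is not being released;
 * the function pointer-checks both before touching them.
 */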
3788
3789errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3790{
3791	errno_t error;
3792
3793	/*
3794	 * Call ubc_setsize to give the VM subsystem a chance to do
3795	 * whatever it needs to with existing pages before we delete
3796	 * blocks.  Note that symlinks don't use the UBC so we'll
3797	 * get back ENOENT in that case.
3798	 */
3799	if (have_cnode_lock) {
3800		error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3801		if (error == EAGAIN) {
3802			cnode_t *cp = VTOC(vp);
3803
3804			if (cp->c_truncatelockowner != current_thread()) {
3805#if DEVELOPMENT || DEBUG
3806				panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3807#else
3808				printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
3809#endif
3810			}
3811
3812			hfs_unlock(cp);
3813			error = ubc_setsize_ex(vp, len, 0);
3814			hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3815		}
3816	} else
3817		error = ubc_setsize_ex(vp, len, 0);
3818
3819	return error == ENOENT ? 0 : error;
3820}
3821
3822/*
3823 * Truncate a cnode to at most length size, freeing (or adding) the
3824 * disk blocks.
3825 */
3826int
3827hfs_truncate(struct vnode *vp, off_t length, int flags,
3828			 int truncateflags, vfs_context_t context)
3829{
3830	struct filefork *fp = VTOF(vp);
3831	off_t filebytes;
3832	u_int32_t fileblocks;
3833	int blksize;
3834	errno_t error = 0;
3835	struct cnode *cp = VTOC(vp);
3836
3837	/* Cannot truncate an HFS directory! */
3838	if (vnode_isdir(vp)) {
3839		return (EISDIR);
3840	}
3841	/* A swap file cannot change size. */
3842	if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
3843		return (EPERM);
3844	}
3845
3846	blksize = VTOVCB(vp)->blockSize;
3847	fileblocks = fp->ff_blocks;
3848	filebytes = (off_t)fileblocks * (off_t)blksize;
3849
3850	bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
3851
3852	error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
3853	if (error)
3854		return error;
3855
3856	if (!caller_has_cnode_lock) {
3857		error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3858		if (error)
3859			return error;
3860	}
3861
3862	// have to loop truncating or growing files that are
3863	// really big because otherwise transactions can get
3864	// enormous and consume too many kernel resources.
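	//
	// Illustrative example (not from the original source): if the fork's
	// physical size exceeds the target length by a bit more than three
	// HFS_BIGFILE_SIZE chunks, the shrink loop below issues
	// do_hfs_truncate() at filebytes - HFS_BIGFILE_SIZE,
	// filebytes - 2*HFS_BIGFILE_SIZE, filebytes - 3*HFS_BIGFILE_SIZE, and
	// finally at length, each step in its own transaction.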
3865
3866	if (length < filebytes) {
3867		while (filebytes > length) {
3868			if ((filebytes - length) > HFS_BIGFILE_SIZE) {
3869		    		filebytes -= HFS_BIGFILE_SIZE;
3870			} else {
3871		    		filebytes = length;
3872			}
3873			cp->c_flag |= C_FORCEUPDATE;
3874			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3875			if (error)
3876				break;
3877		}
3878	} else if (length > filebytes) {
3879		while (filebytes < length) {
3880			if ((length - filebytes) > HFS_BIGFILE_SIZE) {
3881				filebytes += HFS_BIGFILE_SIZE;
3882			} else {
3883				filebytes = length;
3884			}
3885			cp->c_flag |= C_FORCEUPDATE;
3886			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3887			if (error)
3888				break;
3889		}
3890	} else /* Same logical size */ {
3891
3892		error = do_hfs_truncate(vp, length, flags, truncateflags, context);
3893	}
3894	/* Files that are changing size are not hot file candidates. */
3895	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
3896		fp->ff_bytesread = 0;
3897	}
3898
3899	if (!caller_has_cnode_lock)
3900		hfs_unlock(cp);
3901
3902	// Make sure UBC's size matches up (in case we didn't completely succeed)
3903	errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
3904	if (!error)
3905		error = err2;
3906
3907	return error;
3908}
3909
3910
3911/*
3912 * Preallocate file storage space.
3913 */
3914int
3915hfs_vnop_allocate(struct vnop_allocate_args /* {
3916		vnode_t a_vp;
3917		off_t a_length;
3918		u_int32_t  a_flags;
3919		off_t *a_bytesallocated;
3920		off_t a_offset;
3921		vfs_context_t a_context;
3922	} */ *ap)
3923{
3924	struct vnode *vp = ap->a_vp;
3925	struct cnode *cp;
3926	struct filefork *fp;
3927	ExtendedVCB *vcb;
3928	off_t length = ap->a_length;
3929	off_t startingPEOF;
3930	off_t moreBytesRequested;
3931	off_t actualBytesAdded;
3932	off_t filebytes;
3933	u_int32_t fileblocks;
3934	int retval, retval2;
3935	u_int32_t blockHint;
3936	u_int32_t extendFlags;   /* For call to ExtendFileC */
3937	struct hfsmount *hfsmp;
3938	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
3939	int lockflags;
3940	time_t orig_ctime;
3941
3942	*(ap->a_bytesallocated) = 0;
3943
3944	if (!vnode_isreg(vp))
3945		return (EISDIR);
3946	if (length < (off_t)0)
3947		return (EINVAL);
3948
3949	cp = VTOC(vp);
3950
3951	orig_ctime = VTOC(vp)->c_ctime;
3952
3953	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
3954
3955	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3956
3957	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
3958		goto Err_Exit;
3959	}
3960
3961	fp = VTOF(vp);
3962	hfsmp = VTOHFS(vp);
3963	vcb = VTOVCB(vp);
3964
3965	fileblocks = fp->ff_blocks;
3966	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
3967
3968	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
3969		retval = EINVAL;
3970		goto Err_Exit;
3971	}
3972
3973	/* Fill in the flags word for the call to Extend the file */
3974
3975	extendFlags = kEFNoClumpMask;
3976	if (ap->a_flags & ALLOCATECONTIG)
3977		extendFlags |= kEFContigMask;
3978	if (ap->a_flags & ALLOCATEALL)
3979		extendFlags |= kEFAllMask;
3980	if (cred && suser(cred, NULL) != 0)
3981		extendFlags |= kEFReserveMask;
3982	if (hfs_virtualmetafile(cp))
3983		extendFlags |= kEFMetadataMask;
3984
3985	retval = E_NONE;
3986	blockHint = 0;
3987	startingPEOF = filebytes;
3988
3989	if (ap->a_flags & ALLOCATEFROMPEOF)
3990		length += filebytes;
3991	else if (ap->a_flags & ALLOCATEFROMVOL)
3992		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
3993
3994	/* If no changes are necessary, then we're done */
3995	if (filebytes == length)
3996		goto Std_Exit;
3997
3998	/*
3999	 * Lengthen the size of the file. We must ensure that the
4000	 * last byte of the file is allocated. Since the smallest
4001	 * value of filebytes is 0, length will be at least 1.
4002	 */
4003	if (length > filebytes) {
4004		off_t total_bytes_added = 0, orig_request_size;
4005
4006		orig_request_size = moreBytesRequested = length - filebytes;
4007
4008#if QUOTA
4009		retval = hfs_chkdq(cp,
4010				(int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4011				cred, 0);
4012		if (retval)
4013			goto Err_Exit;
4014
4015#endif /* QUOTA */
4016		/*
4017		 * Metadata zone checks.
4018		 */
4019		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4020			/*
4021			 * Allocate Journal and Quota files in metadata zone.
4022			 */
4023			if (hfs_virtualmetafile(cp)) {
4024				blockHint = hfsmp->hfs_metazone_start;
4025			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4026				   (blockHint <= hfsmp->hfs_metazone_end)) {
4027				/*
4028				 * Move blockHint outside metadata zone.
4029				 */
4030				blockHint = hfsmp->hfs_metazone_end + 1;
4031			}
4032		}
4033
4034
4035		while ((length > filebytes) && (retval == E_NONE)) {
4036		    off_t bytesRequested;
4037
4038		    if (hfs_start_transaction(hfsmp) != 0) {
4039			retval = EINVAL;
4040			goto Err_Exit;
4041		    }
4042
4043		    /* Protect extents b-tree and allocation bitmap */
4044		    lockflags = SFL_BITMAP;
4045		    if (overflow_extents(fp))
4046				lockflags |= SFL_EXTENTS;
4047		    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4048
4049		    if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4050				bytesRequested = HFS_BIGFILE_SIZE;
4051		    } else {
4052				bytesRequested = moreBytesRequested;
4053		    }
4054
4055		    if (extendFlags & kEFContigMask) {
4056			    // if we're on a sparse device, this will force it to do a
4057			    // full scan to find the space needed.
4058			    hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4059		    }
4060
4061		    retval = MacToVFSError(ExtendFileC(vcb,
4062						(FCB*)fp,
4063						bytesRequested,
4064						blockHint,
4065						extendFlags,
4066						&actualBytesAdded));
4067
4068		    if (retval == E_NONE) {
4069			*(ap->a_bytesallocated) += actualBytesAdded;
4070			total_bytes_added += actualBytesAdded;
4071			moreBytesRequested -= actualBytesAdded;
4072			if (blockHint != 0) {
4073			    blockHint += actualBytesAdded / vcb->blockSize;
4074			}
4075		    }
4076		    filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4077
4078		    hfs_systemfile_unlock(hfsmp, lockflags);
4079
4080		    if (hfsmp->jnl) {
4081			(void) hfs_update(vp, TRUE);
4082			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4083		    }
4084
4085		    hfs_end_transaction(hfsmp);
4086		}
4087
4088
4089		/*
4090		 * if we get an error and no changes were made then exit
4091		 * otherwise we must do the hfs_update to reflect the changes
4092		 */
4093		if (retval && (startingPEOF == filebytes))
4094			goto Err_Exit;
4095
4096		/*
4097		 * Adjust actualBytesAdded to be allocation block aligned, not
4098		 * clump size aligned.
4099		 * NOTE: What we are reporting here does not affect reality
4100		 * until the file is closed, when we truncate the file to allocation
4101		 * block size.
4102		 */
4103		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4104			*(ap->a_bytesallocated) =
4105				roundup(orig_request_size, (off_t)vcb->blockSize);
4106
4107	} else { /* Shorten the size of the file */
4108
4109		/*
4110		 * N.B. At present, this code is never called.  If and when we
4111		 * do start using it, it looks like there might be slightly
4112		 * strange semantics with the file size: it's possible for the
4113		 * file size to *increase* e.g. if current file size is 5,
4114		 * length is 1024 and filebytes is 4096, the file size will
4115		 * end up being 1024 bytes.  This isn't necessarily a problem
4116		 * but it's not consistent with the code above which doesn't
4117		 * change the file size.
4118		 */
4119
4120		retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4121		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4122
4123		/*
4124		 * if we get an error and no changes were made then exit
4125		 * otherwise we must do the hfs_update to reflect the changes
4126		 */
4127		if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4128#if QUOTA
4129		/* These are  bytesreleased */
4130		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4131#endif /* QUOTA */
4132
4133		if (fp->ff_size > filebytes) {
4134			fp->ff_size = filebytes;
4135
4136			hfs_ubc_setsize(vp, fp->ff_size, true);
4137		}
4138	}
4139
4140Std_Exit:
4141	cp->c_touch_chgtime = TRUE;
4142	cp->c_touch_modtime = TRUE;
4143	retval2 = hfs_update(vp, MNT_WAIT);
4144
4145	if (retval == 0)
4146		retval = retval2;
4147Err_Exit:
4148	hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4149	hfs_unlock(cp);
4150	return (retval);
4151}
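/*
 * Illustrative user-space sketch (not part of the original source): this
 * vnop is normally reached via fcntl(F_PREALLOCATE), whose fstore_t fields
 * map onto the ALLOCATECONTIG/ALLOCATEALL and ALLOCATEFROMPEOF/
 * ALLOCATEFROMVOL flags handled above.  A typical call looks roughly like:
 *
 *	#include <fcntl.h>
 *
 *	fstore_t fst = {
 *		.fst_flags   = F_ALLOCATECONTIG | F_ALLOCATEALL,
 *		.fst_posmode = F_PEOFPOSMODE,	 // allocate past the physical EOF
 *		.fst_offset  = 0,
 *		.fst_length  = 16 * 1024 * 1024, // ask for 16 MiB more
 *	};
 *	int err = fcntl(fd, F_PREALLOCATE, &fst);
 *	if (err == -1) {
 *		fst.fst_flags = F_ALLOCATEALL;	 // e.g. retry without contiguity
 *		err = fcntl(fd, F_PREALLOCATE, &fst);
 *	}
 *	// On success, fst.fst_bytesalloc reports how much was reserved.
 *
 * The fd variable and the retry policy are hypothetical; the flag mapping
 * is a reading of the code above, not a documented contract.
 */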
4152
4153
4154/*
4155 * Pagein for HFS filesystem
4156 */
4157int
4158hfs_vnop_pagein(struct vnop_pagein_args *ap)
4159/*
4160	struct vnop_pagein_args {
4161	   	vnode_t a_vp,
4162	   	upl_t 	      a_pl,
4163		vm_offset_t   a_pl_offset,
4164		off_t         a_f_offset,
4165		size_t        a_size,
4166		int           a_flags
4167		vfs_context_t a_context;
4168	};
4169*/
4170{
4171	vnode_t 	vp;
4172	struct cnode	*cp;
4173	struct filefork *fp;
4174	int		error = 0;
4175	upl_t 		upl;
4176	upl_page_info_t	*pl;
4177	off_t		f_offset;
4178	off_t		page_needed_f_offset;
4179	int		offset;
4180	int		isize;
4181	int		upl_size;
4182	int		pg_index;
4183	boolean_t	truncate_lock_held = FALSE;
4184	boolean_t 	file_converted = FALSE;
4185	kern_return_t	kret;
4186
4187	vp = ap->a_vp;
4188	cp = VTOC(vp);
4189	fp = VTOF(vp);
4190
4191#if CONFIG_PROTECT
4192	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4193		/*
4194		 * If we errored here, then this means that one of two things occurred:
4195		 * 1. there was a problem with the decryption of the key.
4196		 * 2. the device is locked and we are not allowed to access this particular file.
4197		 *
4198		 * Either way, this means that we need to shut down this upl now.  If the
4199		 * pl pointer is NULL (meaning that we're supposed to create the UPL ourselves),
4200		 * we create a UPL and immediately abort it.
4201		 */
4202		if (ap->a_pl == NULL) {
4203			/* create the upl */
4204			ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4205					UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4206			/* mark the range as needed so it doesn't immediately get discarded upon abort */
4207			ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4208
4209			/* Abort the range */
4210			ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4211		}
4212
4213
4214		return error;
4215	}
4216#endif /* CONFIG_PROTECT */
4217
4218	if (ap->a_pl != NULL) {
4219		/*
4220		 * this can only happen for swap files now that
4221		 * we're asking for V2 paging behavior...
4222		 * so don't need to worry about decompression, or
4223		 * keeping track of blocks read or taking the truncate lock
4224		 */
4225		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4226				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4227		goto pagein_done;
4228	}
4229
4230	page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4231
4232retry_pagein:
4233	/*
4234	 * take truncate lock (shared/recursive) to guard against
4235	 * zero-fill thru fsync interfering, but only for v2
4236	 *
4237	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4238	 * lock shared and we are allowed to recurse 1 level if this thread already
4239	 * owns the lock exclusively... this can legally occur
4240	 * if we are doing a shrinking ftruncate against a file
4241	 * that is mapped private, and the pages being truncated
4242	 * do not currently exist in the cache... in that case
4243	 * we will have to page-in the missing pages in order
4244	 * to provide them to the private mapping... we must
4245	 * also call hfs_unlock_truncate with a positive been_recursed
4246	 * arg to indicate that if we have recursed, there is no need to drop
4247	 * the lock.  Allowing this simple recursion is necessary
4248	 * in order to avoid a certain deadlock... since the ftruncate
4249	 * already holds the truncate lock exclusively, if we try
4250	 * to acquire it shared to protect the pagein path, we will
4251	 * hang this thread
4252	 *
4253	 * NOTE: The if () block below is a workaround in order to prevent a
4254	 * VM deadlock. See rdar://7853471.
4255	 *
4256	 * If we are in a forced unmount, then launchd will still have the
4257	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
4258	 * take the truncate lock here to service a page fault, then our
4259	 * thread could deadlock with the forced-unmount.  The forced unmount
4260	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4261	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
4262	 * thread will think it needs to copy all of the data out of the file
4263	 * and into a VM copy object.  If we hold the cnode lock here, then that
4264	 * VM operation will not be able to proceed, because we'll set a busy page
4265	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
4266	 * call ubc_setsize" because doing that would just shift the problem to the
4267	 * ubc_msync done before the vnode is reclaimed.
4268	 *
4269	 * So, if a forced unmount on this volume is in flight AND the cnode is
4270	 * marked C_DELETED, then just go ahead and do the page in without taking
4271	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
4272	 * that is not going to be available on the next mount, this seems like an
4273	 * OK solution from a correctness point of view, even though it is hacky.
4274	 */
4275	if (vfs_isforce(vp->v_mount)) {
4276		if (cp->c_flag & C_DELETED) {
4277			/* If we don't get it, then just go ahead and operate without the lock */
4278			truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4279		}
4280	}
4281	else {
4282		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4283		truncate_lock_held = TRUE;
4284	}
4285
4286	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4287
4288	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4289		error = EINVAL;
4290		goto pagein_done;
4291	}
4292	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4293
4294	upl_size = isize = ap->a_size;
4295
4296	/*
4297	 * Scan from the back to find the last page in the UPL, so that we
4298	 * aren't looking at a UPL that may have already been freed by the
4299	 * preceding aborts/completions.
4300	 */
4301	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4302		if (upl_page_present(pl, --pg_index))
4303			break;
4304		if (pg_index == 0) {
4305			/*
4306			 * no absent pages were found in the range specified
4307			 * just abort the UPL to get rid of it and then we're done
4308			 */
4309			ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4310			goto pagein_done;
4311		}
4312	}
4313	/*
4314	 * initialize the offset variables before we touch the UPL.
4315	 * f_offset is the position into the file, in bytes
4316	 * offset is the position into the UPL, in bytes
4317	 * pg_index is the pg# of the UPL we're operating on
4318	 * isize is the offset into the UPL of the last page that is present.
4319	 */
4320	isize = ((pg_index + 1) * PAGE_SIZE);
4321	pg_index = 0;
4322	offset = 0;
4323	f_offset = ap->a_f_offset;
4324
4325	while (isize) {
4326		int  xsize;
4327		int  num_of_pages;
4328
4329		if ( !upl_page_present(pl, pg_index)) {
4330			/*
4331			 * we asked for RET_ONLY_ABSENT, so it's possible
4332			 * to get back empty slots in the UPL.
4333			 * just skip over them
4334			 */
4335			f_offset += PAGE_SIZE;
4336			offset   += PAGE_SIZE;
4337			isize    -= PAGE_SIZE;
4338			pg_index++;
4339
4340			continue;
4341		}
4342		/*
4343		 * We know that we have at least one absent page.
4344		 * Now checking to see how many in a row we have
4345		 */
4346		num_of_pages = 1;
4347		xsize = isize - PAGE_SIZE;
4348
4349		while (xsize) {
4350			if ( !upl_page_present(pl, pg_index + num_of_pages))
4351				break;
4352			num_of_pages++;
4353			xsize -= PAGE_SIZE;
4354		}
4355		xsize = num_of_pages * PAGE_SIZE;
4356
4357#if HFS_COMPRESSION
4358		if (VNODE_IS_RSRC(vp)) {
4359			/* allow pageins of the resource fork */
4360		} else {
4361			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4362
4363			if (compressed) {
4364
4365				if (truncate_lock_held) {
4366					/*
4367					 * can't hold the truncate lock when calling into the decmpfs layer
4368					 * since it calls back into this layer... even though we're only
4369					 * holding the lock in shared mode, and the re-entrant path only
4370					 * takes the lock shared, we can deadlock if some other thread
4371					 * tries to grab the lock exclusively in between.
4372					 */
4373					hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4374					truncate_lock_held = FALSE;
4375				}
4376				ap->a_pl = upl;
4377				ap->a_pl_offset = offset;
4378				ap->a_f_offset = f_offset;
4379				ap->a_size = xsize;
4380
4381				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4382				/*
4383				 * note that decmpfs_pagein_compressed can change the state of
4384				 * 'compressed'... it will set it to 0 if the file is no longer
4385				 * compressed once the compression lock is successfully taken
4386				 * i.e. we would block on that lock while the file is being inflated
4387				 */
4388				if (compressed) {
4389					if (error == 0) {
4390						/* successful page-in, update the access time */
4391						VTOC(vp)->c_touch_acctime = TRUE;
4392
4393						/* compressed files are not hot file candidates */
4394						if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4395							fp->ff_bytesread = 0;
4396						}
4397					} else if (error == EAGAIN) {
4398						/*
4399						 * EAGAIN indicates someone else already holds the compression lock...
4400						 * to avoid deadlocking, we'll abort this range of pages with an
4401						 * indication that the pagein needs to be redriven
4402						 */
4403			        		ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4404					} else if (error == ENOSPC) {
4405
4406						if (upl_size == PAGE_SIZE)
4407							panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4408
4409						ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4410
4411						ap->a_size = PAGE_SIZE;
4412						ap->a_pl = NULL;
4413						ap->a_pl_offset = 0;
4414						ap->a_f_offset = page_needed_f_offset;
4415
4416						goto retry_pagein;
4417					}
4418					goto pagein_next_range;
4419				}
4420				else {
4421					/*
4422					 * Set file_converted only if the file became decompressed while we were
4423					 * paging in.  If it were still compressed, we would re-start the loop using the goto
4424				 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
4425					 * condition below, since we could have avoided taking the truncate lock to prevent
4426					 * a deadlock in the force unmount case.
4427					 */
4428					file_converted = TRUE;
4429				}
4430			}
4431			if (file_converted == TRUE) {
4432				/*
4433				 * the file was converted back to a regular file after we first saw it as compressed
4434				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4435				 * reset a_size so that we consider what remains of the original request
4436				 * and null out a_pl and a_pl_offset.
4437				 *
4438				 * We should only be able to get into this block if the decmpfs_pagein_compressed
4439				 * successfully decompressed the range in question for this file.
4440				 */
4441				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4442
4443				ap->a_size = isize;
4444				ap->a_pl = NULL;
4445				ap->a_pl_offset = 0;
4446
4447				/* Reset file_converted back to false so that we don't infinite-loop. */
4448				file_converted = FALSE;
4449				goto retry_pagein;
4450			}
4451		}
4452#endif
4453		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4454
4455		/*
4456		 * Keep track of blocks read.
4457		 */
4458		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4459			int bytesread;
4460			int took_cnode_lock = 0;
4461
4462			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4463				bytesread = fp->ff_size;
4464			else
4465				bytesread = xsize;
4466
4467			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4468			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4469				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4470				took_cnode_lock = 1;
4471			}
4472			/*
4473			 * If this file hasn't been seen since the start of
4474			 * the current sampling period then start over.
4475			 */
4476			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4477				struct timeval tv;
4478
4479				fp->ff_bytesread = bytesread;
4480				microtime(&tv);
4481				cp->c_atime = tv.tv_sec;
4482			} else {
4483				fp->ff_bytesread += bytesread;
4484			}
4485			cp->c_touch_acctime = TRUE;
4486			if (took_cnode_lock)
4487				hfs_unlock(cp);
4488		}
4489pagein_next_range:
4490		f_offset += xsize;
4491		offset   += xsize;
4492		isize    -= xsize;
4493		pg_index += num_of_pages;
4494
4495		error = 0;
4496	}
4497
4498pagein_done:
4499	if (truncate_lock_held == TRUE) {
4500		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4501		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4502	}
4503
4504	return (error);
4505}
4506
4507/*
4508 * Pageout for HFS filesystem.
4509 */
4510int
4511hfs_vnop_pageout(struct vnop_pageout_args *ap)
4512/*
4513	struct vnop_pageout_args {
4514	   vnode_t a_vp,
4515	   upl_t         a_pl,
4516	   vm_offset_t   a_pl_offset,
4517	   off_t         a_f_offset,
4518	   size_t        a_size,
4519	   int           a_flags
4520	   vfs_context_t a_context;
4521	};
4522*/
4523{
4524	vnode_t vp = ap->a_vp;
4525	struct cnode *cp;
4526	struct filefork *fp;
4527	int retval = 0;
4528	off_t filesize;
4529	upl_t 		upl;
4530	upl_page_info_t* pl;
4531	vm_offset_t	a_pl_offset;
4532	int		a_flags;
4533	int is_pageoutv2 = 0;
4534	kern_return_t kret;
4535
4536	cp = VTOC(vp);
4537	fp = VTOF(vp);
4538
4539	/*
4540	 * Figure out where the file ends, for pageout purposes.  If
4541	 * ff_new_size > ff_size, then we're in the middle of extending the
4542	 * file via a write, so it is safe (and necessary) that we be able
4543	 * to pageout up to that point.
4544	 */
4545	filesize = fp->ff_size;
4546	if (fp->ff_new_size > filesize)
4547		filesize = fp->ff_new_size;
4548
4549	a_flags = ap->a_flags;
4550	a_pl_offset = ap->a_pl_offset;
4551
4552	/*
4553	 * we can tell if we're getting the new or old behavior from the UPL
4554	 */
4555	if ((upl = ap->a_pl) == NULL) {
4556		int request_flags;
4557
4558		is_pageoutv2 = 1;
4559		/*
4560		 * we're in control of any UPL we commit
4561		 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4562		 */
4563		a_flags &= ~UPL_NOCOMMIT;
4564		a_pl_offset = 0;
4565
4566		/*
4567		 * For V2 semantics, we want to take the cnode truncate lock
4568		 * shared to guard against the file size changing via zero-filling.
4569		 *
4570		 * However, we have to be careful because we may be invoked
4571		 * via the ubc_msync path to write out dirty mmap'd pages
4572		 * in response to a lock event on a content-protected
4573		 * filesystem (e.g. to write out class A files).
4574		 * As a result, we want to take the truncate lock 'SHARED' with
4575		 * the mini-recursion locktype so that we don't deadlock/panic
4576		 * because we may be already holding the truncate lock exclusive to force any other
4577		 * IOs to have blocked behind us.
4578		 */
4579		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4580
4581		if (a_flags & UPL_MSYNC) {
4582			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4583		}
4584		else {
4585			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4586		}
4587
4588		kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4589
4590		if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4591			retval = EINVAL;
4592			goto pageout_done;
4593		}
4594	}
4595	/*
4596	 * from this point forward upl points at the UPL we're working with
4597	 * it was either passed in or we successfully created it
4598	 */
4599
4600	/*
4601	 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4602	 * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
4603	 * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
4604	 * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
4605	 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4606	 * lock in HFS so that we don't lock invert ourselves.
4607	 *
4608	 * Note that we can still get into this function on behalf of the default pager with
4609	 * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
4610	 * since fsync and other writing threads will grab the locks, then mark the
4611	 * relevant pages as busy.  But the pageout codepath marks the pages as busy,
4612	 * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
4613	 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4614	 * by the paging/VM system.
4615	 */
4616
4617	if (is_pageoutv2) {
4618		off_t f_offset;
4619		int offset;
4620		int isize;
4621		int pg_index;
4622		int error;
4623		int error_ret = 0;
4624
4625		isize = ap->a_size;
4626		f_offset = ap->a_f_offset;
4627
4628		/*
4629		 * Scan from the back to find the last page in the UPL, so that we
4630		 * aren't looking at a UPL that may have already been freed by the
4631		 * preceding aborts/completions.
4632		 */
4633		for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4634			if (upl_page_present(pl, --pg_index))
4635				break;
4636			if (pg_index == 0) {
4637				ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4638				goto pageout_done;
4639			}
4640		}
4641
4642		/*
4643		 * initialize the offset variables before we touch the UPL.
4644		 * a_f_offset is the position into the file, in bytes
4645		 * offset is the position into the UPL, in bytes
4646		 * pg_index is the pg# of the UPL we're operating on.
4647		 * isize is the offset into the UPL of the last non-clean page.
4648		 */
4649		isize = ((pg_index + 1) * PAGE_SIZE);
4650
4651		offset = 0;
4652		pg_index = 0;
4653
4654		while (isize) {
4655			int  xsize;
4656			int  num_of_pages;
4657
4658			if ( !upl_page_present(pl, pg_index)) {
4659				/*
4660				 * we asked for RET_ONLY_DIRTY, so it's possible
4661				 * to get back empty slots in the UPL.
4662				 * just skip over them
4663				 */
4664				f_offset += PAGE_SIZE;
4665				offset   += PAGE_SIZE;
4666				isize    -= PAGE_SIZE;
4667				pg_index++;
4668
4669				continue;
4670			}
4671			if ( !upl_dirty_page(pl, pg_index)) {
4672				panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4673			}
4674
4675			/*
4676			 * We know that we have at least one dirty page.
4677			 * Now checking to see how many in a row we have
4678			 */
4679			num_of_pages = 1;
4680			xsize = isize - PAGE_SIZE;
4681
4682			while (xsize) {
4683				if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4684					break;
4685				num_of_pages++;
4686				xsize -= PAGE_SIZE;
4687			}
4688			xsize = num_of_pages * PAGE_SIZE;
4689
4690			if (!vnode_isswap(vp)) {
4691				off_t end_of_range;
4692				int tooklock;
4693
4694				tooklock = 0;
4695
4696				if (cp->c_lockowner != current_thread()) {
4697					if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4698						/*
4699						 * we're in the v2 path, so we are the
4700						 * owner of the UPL... we may have already
4701						 * processed some of the UPL, so abort it
4702						 * from the current working offset to the
4703						 * end of the UPL
4704						 */
4705						ubc_upl_abort_range(upl,
4706								    offset,
4707								    ap->a_size - offset,
4708								    UPL_ABORT_FREE_ON_EMPTY);
4709						goto pageout_done;
4710					}
4711					tooklock = 1;
4712				}
4713				end_of_range = f_offset + xsize - 1;
4714
4715				if (end_of_range >= filesize) {
4716					end_of_range = (off_t)(filesize - 1);
4717				}
4718				if (f_offset < filesize) {
4719					rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
4720					cp->c_flag |= C_MODIFIED;  /* leof is dirty */
4721				}
4722				if (tooklock) {
4723					hfs_unlock(cp);
4724				}
4725			}
4726			if ((error = cluster_pageout(vp, upl, offset, f_offset,
4727							xsize, filesize, a_flags))) {
4728				if (error_ret == 0)
4729					error_ret = error;
4730			}
4731			f_offset += xsize;
4732			offset   += xsize;
4733			isize    -= xsize;
4734			pg_index += num_of_pages;
4735		}
4736		/* capture errnos bubbled out of cluster_pageout if they occurred */
4737		if (error_ret != 0) {
4738			retval = error_ret;
4739		}
4740	} /* end block for v2 pageout behavior */
4741	else {
4742		if (!vnode_isswap(vp)) {
4743			off_t end_of_range;
4744			int tooklock = 0;
4745
4746			if (cp->c_lockowner != current_thread()) {
4747				if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4748					if (!(a_flags & UPL_NOCOMMIT)) {
4749						ubc_upl_abort_range(upl,
4750								    a_pl_offset,
4751								    ap->a_size,
4752								    UPL_ABORT_FREE_ON_EMPTY);
4753					}
4754					goto pageout_done;
4755				}
4756				tooklock = 1;
4757			}
4758			end_of_range = ap->a_f_offset + ap->a_size - 1;
4759
4760			if (end_of_range >= filesize) {
4761				end_of_range = (off_t)(filesize - 1);
4762			}
4763			if (ap->a_f_offset < filesize) {
4764				rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
4765				cp->c_flag |= C_MODIFIED;  /* leof is dirty */
4766			}
4767
4768			if (tooklock) {
4769				hfs_unlock(cp);
4770			}
4771		}
4772		/*
4773		 * just call cluster_pageout for old pre-v2 behavior
4774		 */
4775		retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4776				ap->a_size, filesize, a_flags);
4777	}
4778
4779	/*
4780	 * If data was written, update the modification time of the file
4781	 * but only if it's mapped writable; we will have touched the
4782	 * modification time for direct writes.
4783	 */
4784	if (retval == 0 && (ubc_is_mapped_writable(vp)
4785						|| ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4786		hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4787
4788		// Check again with lock
4789		bool mapped_writable = ubc_is_mapped_writable(vp);
4790		if (mapped_writable
4791			|| ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4792			cp->c_touch_modtime = TRUE;
4793			cp->c_touch_chgtime = TRUE;
4794
4795			/*
4796			 * We only need to increment the generation counter if
4797			 * it's currently mapped writable because we incremented
4798			 * the counter in hfs_vnop_mnomap.
4799			 */
4800			if (mapped_writable)
4801				hfs_incr_gencount(VTOC(vp));
4802
4803			/*
4804			 * If setuid or setgid bits are set and this process is
4805			 * not the superuser then clear the setuid and setgid bits
4806			 * as a precaution against tampering.
4807			 */
4808			if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4809				(vfs_context_suser(ap->a_context) != 0)) {
4810				cp->c_mode &= ~(S_ISUID | S_ISGID);
4811			}
4812		}
4813
4814		hfs_unlock(cp);
4815	}
4816
4817pageout_done:
4818	if (is_pageoutv2) {
4819		/*
4820		 * Release the truncate lock.  Note that because
4821		 * we may have taken the lock recursively by
4822		 * being invoked via ubc_msync due to lockdown,
4823		 * we should release it recursively, too.
4824		 */
4825		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4826	}
4827	return (retval);
4828}
4829
4830/*
4831 * Intercept B-Tree node writes to unswap them if necessary.
4832 */
4833int
4834hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4835{
4836	int retval = 0;
4837	register struct buf *bp = ap->a_bp;
4838	register struct vnode *vp = buf_vnode(bp);
4839	BlockDescriptor block;
4840
4841	/* Trap B-Tree writes */
4842	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4843	    (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4844	    (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4845	    (vp == VTOHFS(vp)->hfc_filevp)) {
4846
4847		/*
4848		 * Swap and validate the node if it is in native byte order.
4849		 * This is always true on big endian, so we always validate
4850		 * before writing here.  On little endian, the node typically has
4851		 * been swapped and validated when it was written to the journal,
4852		 * so we won't do anything here.
4853		 */
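		/*
		 * Added note (best-effort reading of the check below): 0x000e is
		 * sizeof(BTNodeDescriptor), the offset of record 0, which is kept
		 * in the last two bytes of every B-tree node.  If that value reads
		 * correctly in host byte order, the node is still in native order
		 * and must be swapped to big endian before it goes to disk.
		 */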
4854		if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4855			/* Prepare the block pointer */
4856			block.blockHeader = bp;
4857			block.buffer = (char *)buf_dataptr(bp);
4858			block.blockNum = buf_lblkno(bp);
4859			/* not found in cache ==> came from disk */
4860			block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4861			block.blockSize = buf_count(bp);
4862
4863			/* Endian un-swap B-Tree node */
4864			retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4865			if (retval)
4866				panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4867		}
4868	}
4869
4870	/* This buffer shouldn't be locked anymore but if it is clear it */
4871	if ((buf_flags(bp) & B_LOCKED)) {
4872	        // XXXdbg
4873	        if (VTOHFS(vp)->jnl) {
4874		        panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4875		}
4876		buf_clearflags(bp, B_LOCKED);
4877	}
4878	retval = vn_bwrite (ap);
4879
4880	return (retval);
4881}
4882
4883/*
4884 * Relocate a file to a new location on disk
4885 *  cnode must be locked on entry
4886 *
4887 * Relocation occurs by cloning the file's data from its
4888 * current set of blocks to a new set of blocks. During
4889 * the relocation all of the blocks (old and new) are
4890 * owned by the file.
4891 *
4892 * -----------------
4893 * |///////////////|
4894 * -----------------
4895 * 0               N (file offset)
4896 *
4897 * -----------------     -----------------
4898 * |///////////////|     |               |     STEP 1 (acquire new blocks)
4899 * -----------------     -----------------
4900 * 0               N     N+1             2N
4901 *
4902 * -----------------     -----------------
4903 * |///////////////|     |///////////////|     STEP 2 (clone data)
4904 * -----------------     -----------------
4905 * 0               N     N+1             2N
4906 *
4907 *                       -----------------
4908 *                       |///////////////|     STEP 3 (head truncate blocks)
4909 *                       -----------------
4910 *                       0               N
4911 *
4912 * During steps 2 and 3 page-outs to file offsets less
4913 * than or equal to N are suspended.
4914 *
4915 * During step 3 page-ins to the file get suspended.
4916 */
4917int
4918hfs_relocate(struct  vnode *vp, u_int32_t  blockHint, kauth_cred_t cred,
4919	struct  proc *p)
4920{
4921	struct  cnode *cp;
4922	struct  filefork *fp;
4923	struct  hfsmount *hfsmp;
4924	u_int32_t  headblks;
4925	u_int32_t  datablks;
4926	u_int32_t  blksize;
4927	u_int32_t  growsize;
4928	u_int32_t  nextallocsave;
4929	daddr64_t  sector_a,  sector_b;
4930	int eflags;
4931	off_t  newbytes;
4932	int  retval;
4933	int lockflags = 0;
4934	int took_trunc_lock = 0;
4935	int started_tr = 0;
4936	enum vtype vnodetype;
4937
4938	vnodetype = vnode_vtype(vp);
4939	if (vnodetype != VREG) {
4940		/* Only regular files can be relocated. */
4941		return (EPERM);
4942	}
4943
4944	hfsmp = VTOHFS(vp);
4945	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
4946		return (ENOSPC);
4947	}
4948
4949	cp = VTOC(vp);
4950	fp = VTOF(vp);
4951	if (fp->ff_unallocblocks)
4952		return (EINVAL);
4953
4954#if CONFIG_PROTECT
4955	/*
4956	 * <rdar://problem/9118426>
4957	 * Disable HFS file relocation on content-protected filesystems
4958	 */
4959	if (cp_fs_protected (hfsmp->hfs_mp)) {
4960		return EINVAL;
4961	}
4962#endif
4963	/* If it's an SSD, also disable HFS relocation */
4964	if (hfsmp->hfs_flags & HFS_SSD) {
4965		return EINVAL;
4966	}
4967
4968
4969	blksize = hfsmp->blockSize;
4970	if (blockHint == 0)
4971		blockHint = hfsmp->nextAllocation;
4972
4973	if (fp->ff_size > 0x7fffffff) {
4974		return (EFBIG);
4975	}
4976
4977	//
4978	// We do not believe that this call to hfs_fsync() is
4979	// necessary and it causes a journal transaction
4980	// deadlock so we are removing it.
4981	//
4982	//if (vnodetype == VREG && !vnode_issystem(vp)) {
4983	//	retval = hfs_fsync(vp, MNT_WAIT, 0, p);
4984	//	if (retval)
4985	//		return (retval);
4986	//}
4987
4988	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
4989		hfs_unlock(cp);
4990		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4991		/* Force lock since callers expect the lock to be held. */
4992		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
4993			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4994			return (retval);
4995		}
4996		/* No need to continue if file was removed. */
4997		if (cp->c_flag & C_NOEXISTS) {
4998			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4999			return (ENOENT);
5000		}
5001		took_trunc_lock = 1;
5002	}
5003	headblks = fp->ff_blocks;
5004	datablks = howmany(fp->ff_size, blksize);
5005	growsize = datablks * blksize;
5006	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5007	if (blockHint >= hfsmp->hfs_metazone_start &&
5008	    blockHint <= hfsmp->hfs_metazone_end)
5009		eflags |= kEFMetadataMask;
5010
5011	if (hfs_start_transaction(hfsmp) != 0) {
5012		if (took_trunc_lock)
5013			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5014	    return (EINVAL);
5015	}
5016	started_tr = 1;
5017	/*
5018	 * Protect the extents b-tree and the allocation bitmap
5019	 * during MapFileBlockC and ExtendFileC operations.
5020	 */
5021	lockflags = SFL_BITMAP;
5022	if (overflow_extents(fp))
5023		lockflags |= SFL_EXTENTS;
5024	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5025
5026	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5027	if (retval) {
5028		retval = MacToVFSError(retval);
5029		goto out;
5030	}
5031
5032	/*
5033	 * STEP 1 - acquire new allocation blocks.
5034	 */
5035	nextallocsave = hfsmp->nextAllocation;
5036	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5037	if (eflags & kEFMetadataMask) {
5038		hfs_lock_mount(hfsmp);
5039		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5040		MarkVCBDirty(hfsmp);
5041		hfs_unlock_mount(hfsmp);
5042	}
5043
5044	retval = MacToVFSError(retval);
5045	if (retval == 0) {
5046		cp->c_flag |= C_MODIFIED;
5047		if (newbytes < growsize) {
5048			retval = ENOSPC;
5049			goto restore;
5050		} else if (fp->ff_blocks < (headblks + datablks)) {
5051			printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5052			retval = ENOSPC;
5053			goto restore;
5054		}
5055
5056		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5057		if (retval) {
5058			retval = MacToVFSError(retval);
5059		} else if ((sector_a + 1) == sector_b) {
5060			retval = ENOSPC;
5061			goto restore;
5062		} else if ((eflags & kEFMetadataMask) &&
5063		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5064		              hfsmp->hfs_metazone_end)) {
5065#if 0
5066			const char * filestr;
5067			char emptystr = '\0';
5068
5069			if (cp->c_desc.cd_nameptr != NULL) {
5070				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5071			} else if (vnode_name(vp) != NULL) {
5072				filestr = vnode_name(vp);
5073			} else {
5074				filestr = &emptystr;
5075			}
5076#endif
5077			retval = ENOSPC;
5078			goto restore;
5079		}
5080	}
5081	/* Done with system locks and journal for now. */
5082	hfs_systemfile_unlock(hfsmp, lockflags);
5083	lockflags = 0;
5084	hfs_end_transaction(hfsmp);
5085	started_tr = 0;
5086
5087	if (retval) {
5088		/*
5089		 * Check to see if failure is due to excessive fragmentation.
5090		 */
5091		if ((retval == ENOSPC) &&
5092		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5093			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5094		}
5095		goto out;
5096	}
5097	/*
5098	 * STEP 2 - clone file data into the new allocation blocks.
5099	 */
5100
5101	if (vnodetype == VLNK)
5102		retval = EPERM;
5103	else if (vnode_issystem(vp))
5104		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5105	else
5106		retval = hfs_clonefile(vp, headblks, datablks, blksize);
5107
5108	/* Start transaction for step 3 or for a restore. */
5109	if (hfs_start_transaction(hfsmp) != 0) {
5110		retval = EINVAL;
5111		goto out;
5112	}
5113	started_tr = 1;
5114	if (retval)
5115		goto restore;
5116
5117	/*
5118	 * STEP 3 - switch to cloned data and remove old blocks.
5119	 */
5120	lockflags = SFL_BITMAP;
5121	if (overflow_extents(fp))
5122		lockflags |= SFL_EXTENTS;
5123	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5124
5125	retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5126
5127	hfs_systemfile_unlock(hfsmp, lockflags);
5128	lockflags = 0;
5129	if (retval)
5130		goto restore;
5131out:
5132	if (took_trunc_lock)
5133		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5134
5135	if (lockflags) {
5136		hfs_systemfile_unlock(hfsmp, lockflags);
5137		lockflags = 0;
5138	}
5139
5140	/* Push cnode's new extent data to disk. */
5141	if (retval == 0) {
5142		(void) hfs_update(vp, MNT_WAIT);
5143	}
5144	if (hfsmp->jnl) {
5145		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5146			(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5147		else
5148			(void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
5149	}
5150exit:
5151	if (started_tr)
5152		hfs_end_transaction(hfsmp);
5153
5154	return (retval);
5155
5156restore:
5157	if (fp->ff_blocks == headblks) {
5158		if (took_trunc_lock)
5159			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5160		goto exit;
5161	}
5162	/*
5163	 * Give back any newly allocated space.
5164	 */
5165	if (lockflags == 0) {
5166		lockflags = SFL_BITMAP;
5167		if (overflow_extents(fp))
5168			lockflags |= SFL_EXTENTS;
5169		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5170	}
5171
5172	(void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5173						 FTOC(fp)->c_fileid, false);
5174
5175	hfs_systemfile_unlock(hfsmp, lockflags);
5176	lockflags = 0;
5177
5178	if (took_trunc_lock)
5179		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5180	goto exit;
5181}
5182
5183
5184/*
5185 * Clone a file's data within the file.
5186 *
5187 */
5188static int
5189hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5190{
5191	caddr_t  bufp;
5192	size_t  bufsize;
5193	size_t  copysize;
5194        size_t  iosize;
5195	size_t  offset;
5196	off_t	writebase;
5197	uio_t auio;
5198	int  error = 0;
5199
5200	writebase = blkstart * blksize;
5201	copysize = blkcnt * blksize;
5202	iosize = bufsize = MIN(copysize, 128 * 1024);
5203	offset = 0;
5204
5205	hfs_unlock(VTOC(vp));
5206
5207#if CONFIG_PROTECT
5208	if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5209		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5210		return (error);
5211	}
5212#endif /* CONFIG_PROTECT */
5213
5214	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5215		hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5216		return (ENOMEM);
5217	}
5218
5219	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5220
5221	while (offset < copysize) {
5222		iosize = MIN(copysize - offset, iosize);
5223
5224		uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5225		uio_addiov(auio, (uintptr_t)bufp, iosize);
5226
5227		error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5228		if (error) {
5229			printf("hfs_clonefile: cluster_read failed - %d\n", error);
5230			break;
5231		}
5232		if (uio_resid(auio) != 0) {
5233			printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5234			error = EIO;
5235			break;
5236		}
5237
5238		uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5239		uio_addiov(auio, (uintptr_t)bufp, iosize);
5240
5241		error = cluster_write(vp, auio, writebase + offset,
5242		                      writebase + offset + iosize,
5243		                      uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5244		if (error) {
5245			printf("hfs_clonefile: cluster_write failed - %d\n", error);
5246			break;
5247		}
5248		if (uio_resid(auio) != 0) {
5249			printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5250			error = EIO;
5251			break;
5252		}
5253		offset += iosize;
5254	}
5255	uio_free(auio);
5256
5257	if ((blksize & PAGE_MASK)) {
5258		/*
5259		 * since the copy may not have started on a PAGE
5260		 * boundary (or may not have ended on one), we
5261		 * may have pages left in the cache since NOCACHE
5262		 * will let partially written pages linger...
5263		 * let's just flush the entire range to make sure
5264		 * we don't have any pages left that are beyond
5265		 * (or intersect) the real LEOF of this file
5266		 */
5267		ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5268	} else {
5269		/*
5270		 * No need to call ubc_msync or hfs_invalbuf
5271		 * since the file was copied using IO_NOCACHE and
5272		 * the copy was done starting and ending on a page
5273		 * boundary in the file.
5274		 */
5275	}
5276	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5277
5278	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5279	return (error);
5280}
5281
5282/*
5283 * Clone a system (metadata) file.
5284 *
5285 */
5286static int
5287hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5288                 kauth_cred_t cred, struct proc *p)
5289{
5290	caddr_t  bufp;
5291	char * offset;
5292	size_t  bufsize;
5293	size_t  iosize;
5294	struct buf *bp = NULL;
5295	daddr64_t  blkno;
5296 	daddr64_t  blk;
5297	daddr64_t  start_blk;
5298	daddr64_t  last_blk;
5299	int  breadcnt;
5300        int  i;
5301	int  error = 0;
5302
5303
5304	iosize = GetLogicalBlockSize(vp);
5305	bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5306	breadcnt = bufsize / iosize;
5307
5308	if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5309		return (ENOMEM);
5310	}
5311	start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5312	last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
5313	blkno = 0;
5314
5315	while (blkno < last_blk) {
5316		/*
5317		 * Read up to a megabyte
5318		 */
5319		offset = bufp;
5320		for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5321			error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5322			if (error) {
5323				printf("hfs_clonesysfile: meta_bread error %d\n", error);
5324				goto out;
5325			}
5326			if (buf_count(bp) != iosize) {
5327				printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5328				goto out;
5329			}
5330			bcopy((char *)buf_dataptr(bp), offset, iosize);
5331
5332			buf_markinvalid(bp);
5333			buf_brelse(bp);
5334			bp = NULL;
5335
5336			offset += iosize;
5337		}
5338
5339		/*
5340		 * Write up to a megabyte
5341		 */
5342		offset = bufp;
5343		for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5344			bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5345			if (bp == NULL) {
5346				printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5347				error = EIO;
5348				goto out;
5349			}
5350			bcopy(offset, (char *)buf_dataptr(bp), iosize);
5351			error = (int)buf_bwrite(bp);
5352			bp = NULL;
5353			if (error)
5354				goto out;
5355			offset += iosize;
5356		}
5357	}
5358out:
5359	if (bp) {
5360		buf_brelse(bp);
5361	}
5362
5363	kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5364
5365	error = hfs_fsync(vp, MNT_WAIT, 0, p);
5366
5367	return (error);
5368}
5369