1/*
2 * Copyright (c) 1999-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1991, 1993, 1994
30 *	The Regents of the University of California.  All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *	This product includes software developed by the University of
48 *	California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *      hfs_vfsops.c
66 *  derived from	@(#)ufs_vfsops.c	8.8 (Berkeley) 5/20/95
67 *
68 *      (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved.
69 *
70 *      hfs_vfsops.c -- VFS layer for loadable HFS file system.
71 *
72 */
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/kauth.h>
76
77#include <sys/ubc.h>
78#include <sys/ubc_internal.h>
79#include <sys/vnode_internal.h>
80#include <sys/mount_internal.h>
81#include <sys/sysctl.h>
82#include <sys/malloc.h>
83#include <sys/stat.h>
84#include <sys/quota.h>
85#include <sys/disk.h>
86#include <sys/paths.h>
87#include <sys/utfconv.h>
88#include <sys/kdebug.h>
89#include <sys/fslog.h>
90#include <sys/ubc.h>
91#include <sys/buf_internal.h>
92
93/* for parsing boot-args */
94#include <pexpert/pexpert.h>
95
96
97#include <kern/locks.h>
98
99#include <vfs/vfs_journal.h>
100
101#include <miscfs/specfs/specdev.h>
102#include <hfs/hfs_mount.h>
103
104#include <libkern/crypto/md5.h>
105#include <uuid/uuid.h>
106
107#include "hfs.h"
108#include "hfs_catalog.h"
109#include "hfs_cnode.h"
110#include "hfs_dbg.h"
111#include "hfs_endian.h"
112#include "hfs_hotfiles.h"
113#include "hfs_quota.h"
114#include "hfs_btreeio.h"
115#include "hfs_kdebug.h"
116
117#include "hfscommon/headers/FileMgrInternal.h"
118#include "hfscommon/headers/BTreesInternal.h"
119
120#if CONFIG_PROTECT
121#include <sys/cprotect.h>
122#endif
123
/* Non-zero: emit verbose printf diagnostics from the mount/unmount paths. */
#define HFS_MOUNT_DEBUG 1

#if	HFS_DIAGNOSTIC
int hfs_dbg_all = 0;
int hfs_dbg_err = 0;
#endif

/* Enable/disable debugging code for live volume resizing */
int hfs_resize_debug = 0;

/* Lock groups/attributes shared by every HFS mount. */
lck_grp_attr_t *  hfs_group_attr;
lck_attr_t *  hfs_lock_attr;
lck_grp_t *  hfs_mutex_group;
lck_grp_t *  hfs_rwlock_group;
lck_grp_t *  hfs_spinlock_group;

extern struct vnodeopv_desc hfs_vnodeop_opv_desc;

#if CONFIG_HFS_STD
extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc;
static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush);
#endif

/* not static so we can re-use in hfs_readwrite.c for build_path calls */
int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);

/* Forward declarations for the file-local VFS entry points and helpers. */
static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args);
static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context);
static int hfs_flushfiles(struct mount *, int, struct proc *);
static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp);
static int hfs_init(struct vfsconf *vfsp);
static void hfs_locks_destroy(struct hfsmount *hfsmp);
static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context);
static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context);
static int hfs_start(struct mount *mp, int flags, vfs_context_t context);
static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec);
static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context);

void hfs_initialize_allocator (struct hfsmount *hfsmp);
int hfs_teardown_allocator (struct hfsmount *hfsmp);

/* Non-static VFS entry points (referenced from the vfsops table and elsewhere). */
int hfs_mount(struct mount *mp, vnode_t  devvp, user_addr_t data, vfs_context_t context);
int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
int hfs_reload(struct mount *mp);
int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context);
int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context);
int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
                      user_addr_t newp, size_t newlen, vfs_context_t context);
int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context);
176
177/*
178 * Called by vfs_mountroot when mounting HFS Plus as root.
179 */
180
181int
182hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context)
183{
184	struct hfsmount *hfsmp;
185	ExtendedVCB *vcb;
186	struct vfsstatfs *vfsp;
187	int error;
188
189	if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) {
190		if (HFS_MOUNT_DEBUG) {
191			printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n",
192					error, rvp, (rvp->v_name ? rvp->v_name : "unknown device"));
193		}
194		return (error);
195	}
196
197	/* Init hfsmp */
198	hfsmp = VFSTOHFS(mp);
199
200	hfsmp->hfs_uid = UNKNOWNUID;
201	hfsmp->hfs_gid = UNKNOWNGID;
202	hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
203	hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
204
205	/* Establish the free block reserve. */
206	vcb = HFSTOVCB(hfsmp);
207	vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100;
208	vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize);
209
210	vfsp = vfs_statfs(mp);
211	(void)hfs_statfs(mp, vfsp, NULL);
212
213	return (0);
214}
215
216
217/*
218 * VFS Operations.
219 *
220 * mount system call
221 */
222
223int
224hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
225{
226	struct proc *p = vfs_context_proc(context);
227	struct hfsmount *hfsmp = NULL;
228	struct hfs_mount_args args;
229	int retval = E_NONE;
230	u_int32_t cmdflags;
231
232	if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) {
233		if (HFS_MOUNT_DEBUG) {
234			printf("hfs_mount: copyin returned %d for fs\n", retval);
235		}
236		return (retval);
237	}
238	cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS;
239	if (cmdflags & MNT_UPDATE) {
240		hfsmp = VFSTOHFS(mp);
241
242		/* Reload incore data after an fsck. */
243		if (cmdflags & MNT_RELOAD) {
244			if (vfs_isrdonly(mp)) {
245				int error = hfs_reload(mp);
246				if (error && HFS_MOUNT_DEBUG) {
247					printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN);
248				}
249				return error;
250			}
251			else {
252				if (HFS_MOUNT_DEBUG) {
253					printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN);
254				}
255				return (EINVAL);
256			}
257		}
258
259		/* Change to a read-only file system. */
260		if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
261		    vfs_isrdonly(mp)) {
262			int flags;
263
264			/* Set flag to indicate that a downgrade to read-only
265			 * is in progress and therefore block any further
266			 * modifications to the file system.
267			 */
268			hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
269			hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
270			hfsmp->hfs_downgrading_proc = current_thread();
271			hfs_unlock_global (hfsmp);
272
273			/* use VFS_SYNC to push out System (btree) files */
274			retval = VFS_SYNC(mp, MNT_WAIT, context);
275			if (retval && ((cmdflags & MNT_FORCE) == 0)) {
276				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
277				hfsmp->hfs_downgrading_proc = NULL;
278				if (HFS_MOUNT_DEBUG) {
279					printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN);
280				}
281				goto out;
282			}
283
284			flags = WRITECLOSE;
285			if (cmdflags & MNT_FORCE)
286				flags |= FORCECLOSE;
287
288			if ((retval = hfs_flushfiles(mp, flags, p))) {
289				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
290				hfsmp->hfs_downgrading_proc = NULL;
291				if (HFS_MOUNT_DEBUG) {
292					printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN);
293				}
294				goto out;
295			}
296
297			/* mark the volume cleanly unmounted */
298			hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
299			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
300			hfsmp->hfs_flags |= HFS_READ_ONLY;
301
302			/*
303			 * Close down the journal.
304			 *
305			 * NOTE: It is critically important to close down the journal
306			 * and have it issue all pending I/O prior to calling VNOP_FSYNC below.
307			 * In a journaled environment it is expected that the journal be
308			 * the only actor permitted to issue I/O for metadata blocks in HFS.
309			 * If we were to call VNOP_FSYNC prior to closing down the journal,
310			 * we would inadvertantly issue (and wait for) the I/O we just
311			 * initiated above as part of the flushvolumeheader call.
312			 *
313			 * To avoid this, we follow the same order of operations as in
314			 * unmount and issue the journal_close prior to calling VNOP_FSYNC.
315			 */
316
317			if (hfsmp->jnl) {
318				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
319
320			    journal_close(hfsmp->jnl);
321			    hfsmp->jnl = NULL;
322
323			    // Note: we explicitly don't want to shutdown
324			    //       access to the jvp because we may need
325			    //       it later if we go back to being read-write.
326
327				hfs_unlock_global (hfsmp);
328			}
329
330
331			/*
332			 * Write out any pending I/O still outstanding against the device node
333			 * now that the journal has been closed.
334			 */
335			if (retval == 0) {
336				vnode_get(hfsmp->hfs_devvp);
337				retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
338				vnode_put(hfsmp->hfs_devvp);
339			}
340
341			if (retval) {
342				if (HFS_MOUNT_DEBUG) {
343					printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN);
344				}
345				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
346				hfsmp->hfs_downgrading_proc = NULL;
347				hfsmp->hfs_flags &= ~HFS_READ_ONLY;
348				goto out;
349			}
350
351			if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) {
352				if (hfsmp->hfs_summary_table) {
353					int err = 0;
354					/*
355					 * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress
356					 */
357					if (hfsmp->hfs_allocation_vp) {
358						err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
359					}
360					FREE (hfsmp->hfs_summary_table, M_TEMP);
361					hfsmp->hfs_summary_table = NULL;
362					hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE;
363					if (err == 0 && hfsmp->hfs_allocation_vp){
364						hfs_unlock (VTOC(hfsmp->hfs_allocation_vp));
365					}
366				}
367			}
368
369			hfsmp->hfs_downgrading_proc = NULL;
370		}
371
372		/* Change to a writable file system. */
373		if (vfs_iswriteupgrade(mp)) {
374			/*
375			 * On inconsistent disks, do not allow read-write mount
376			 * unless it is the boot volume being mounted.
377			 */
378			if (!(vfs_flags(mp) & MNT_ROOTFS) &&
379					(hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) {
380				if (HFS_MOUNT_DEBUG) {
381					printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n",  (hfsmp->vcbVN));
382				}
383				retval = EINVAL;
384				goto out;
385			}
386
387			// If the journal was shut-down previously because we were
388			// asked to be read-only, let's start it back up again now
389
390			if (   (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask)
391			    && hfsmp->jnl == NULL
392			    && hfsmp->jvp != NULL) {
393			    int jflags;
394
395			    if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) {
396					jflags = JOURNAL_RESET;
397				} else {
398					jflags = 0;
399				}
400
401				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
402
403				/* We provide the mount point twice here: The first is used as
404				 * an opaque argument to be passed back when hfs_sync_metadata
405				 * is called.  The second is provided to the throttling code to
406				 * indicate which mount's device should be used when accounting
407				 * for metadata writes.
408				 */
409				hfsmp->jnl = journal_open(hfsmp->jvp,
410						(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
411						hfsmp->jnl_size,
412						hfsmp->hfs_devvp,
413						hfsmp->hfs_logical_block_size,
414						jflags,
415						0,
416						hfs_sync_metadata, hfsmp->hfs_mp,
417						hfsmp->hfs_mp);
418
419				/*
420				 * Set up the trim callback function so that we can add
421				 * recently freed extents to the free extent cache once
422				 * the transaction that freed them is written to the
423				 * journal on disk.
424				 */
425				if (hfsmp->jnl)
426					journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);
427
428				hfs_unlock_global (hfsmp);
429
430				if (hfsmp->jnl == NULL) {
431					if (HFS_MOUNT_DEBUG) {
432						printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN));
433					}
434					retval = EINVAL;
435					goto out;
436				} else {
437					hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET;
438				}
439
440			}
441
442			/* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */
443			retval = hfs_erase_unused_nodes(hfsmp);
444			if (retval != E_NONE) {
445				if (HFS_MOUNT_DEBUG) {
446					printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN);
447				}
448				goto out;
449			}
450
451			/* If this mount point was downgraded from read-write
452			 * to read-only, clear that information as we are now
453			 * moving back to read-write.
454			 */
455			hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
456			hfsmp->hfs_downgrading_proc = NULL;
457
458			/* mark the volume dirty (clear clean unmount bit) */
459			hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;
460
461			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
462			if (retval != E_NONE) {
463				if (HFS_MOUNT_DEBUG) {
464					printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN);
465				}
466				goto out;
467			}
468
469			/* Only clear HFS_READ_ONLY after a successful write */
470			hfsmp->hfs_flags &= ~HFS_READ_ONLY;
471
472
473			if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) {
474				/* Setup private/hidden directories for hardlinks. */
475				hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
476				hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
477
478				hfs_remove_orphans(hfsmp);
479
480				/*
481				 * Allow hot file clustering if conditions allow.
482				 */
483				if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) &&
484					   ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0))	{
485					(void) hfs_recording_init(hfsmp);
486				}
487				/* Force ACLs on HFS+ file systems. */
488				if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) {
489					vfs_setextendedsecurity(HFSTOVFS(hfsmp));
490				}
491			}
492		}
493
494		/* Update file system parameters. */
495		retval = hfs_changefs(mp, &args);
496		if (retval &&  HFS_MOUNT_DEBUG) {
497			printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN);
498		}
499
500	} else /* not an update request */ {
501
502		/* Set the mount flag to indicate that we support volfs  */
503		vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS));
504
505		retval = hfs_mountfs(devvp, mp, &args, 0, context);
506		if (retval) {
507			const char *name = vnode_getname(devvp);
508			printf("hfs_mount: hfs_mountfs returned error=%d for device %s\n", retval, (name ? name : "unknown-dev"));
509			if (name) {
510				vnode_putname(name);
511			}
512			goto out;
513		}
514
515		/* After hfs_mountfs succeeds, we should have valid hfsmp */
516		hfsmp = VFSTOHFS(mp);
517
518		/*
519		 * Check to see if the file system exists on CoreStorage.
520		 *
521		 * This must be done after examining the root folder's CP EA since
522		 * hfs_vfs_root will create a vnode (which must not occur until after
523		 * we've established the CP level of the FS).
524		 */
525		if (retval == 0) {
526			errno_t err;
527			vnode_t root_vnode;
528			err = hfs_vfs_root(mp, &root_vnode, context);
529			if (err == 0) {
530				if (VNOP_IOCTL(devvp, _DKIOCCSSETFSVNODE,
531							(caddr_t)&root_vnode, 0, context) == 0) {
532					err = vnode_ref(root_vnode);
533					if (err == 0) {
534						hfsmp->hfs_flags |= HFS_CS;
535					}
536				}
537
538				err = vnode_put(root_vnode);
539				if (err) {
540					printf("hfs: could not release io count on root vnode with error: %d\n",
541							err);
542				}
543			} else {
544				printf("hfs: could not get root vnode with error: %d\n",
545						err);
546			}
547		}
548	}
549
550out:
551	if (retval == 0) {
552		(void)hfs_statfs(mp, vfs_statfs(mp), context);
553	}
554	return (retval);
555}
556
557
/*
 * Context passed from hfs_changefs() to hfs_changefs_callback()
 * for each active vnode on the mount.
 */
struct hfs_changefs_cargs {
	struct hfsmount *hfsmp;		/* mount whose parameters changed */
        int		namefix;	/* non-zero: text encoding changed, re-derive names */
        int		permfix;	/* non-zero: default uid/gid/mask changed */
        int		permswitch;	/* non-zero: MNT_UNKNOWNPERMISSIONS was toggled */
};
564
/*
 * vnode_iterate callback used by hfs_changefs().
 *
 * Re-reads the cnode's catalog record and, depending on the flags in
 * hfs_changefs_cargs, refreshes cached ownership/permissions and/or
 * replaces the cached name with one derived from the new encoding.
 * Always returns VNODE_RETURNED so iteration continues.
 */
static int
hfs_changefs_callback(struct vnode *vp, void *cargs)
{
	ExtendedVCB *vcb;
	struct cnode *cp;
	struct cat_desc cndesc;
	struct cat_attr cnattr;
	struct hfs_changefs_cargs *args;
	int lockflags;
	int error;

	args = (struct hfs_changefs_cargs *)cargs;

	cp = VTOC(vp);
	vcb = HFSTOVCB(args->hfsmp);

	/* Look up the on-disk catalog record under a shared catalog lock. */
	lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
	error = cat_lookup(args->hfsmp, &cp->c_desc, 0, 0, &cndesc, &cnattr, NULL, NULL);
	hfs_systemfile_unlock(args->hfsmp, lockflags);
	if (error) {
	        /*
		 * If we couldn't find this guy skip to the next one
		 */
	        if (args->namefix)
		        cache_purge(vp);

		return (VNODE_RETURNED);
	}
	/*
	 * Get the real uid/gid and perm mask from disk.
	 */
	if (args->permswitch || args->permfix) {
	        cp->c_uid = cnattr.ca_uid;
		cp->c_gid = cnattr.ca_gid;
		cp->c_mode = cnattr.ca_mode;
	}
	/*
	 * If we're switching name converters then...
	 *   Remove the existing entry from the namei cache.
	 *   Update name to one based on new encoder.
	 */
	if (args->namefix) {
	        cache_purge(vp);
		replace_desc(cp, &cndesc);

		/* The root folder's name is also the volume name; keep it in sync. */
		if (cndesc.cd_cnid == kHFSRootFolderID) {
		        strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1);
			cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding;
		}
	} else {
	        /* Descriptor not consumed by replace_desc; release it. */
	        cat_releasedesc(&cndesc);
	}
	return (VNODE_RETURNED);
}
619
/*
 * Change fs mount parameters (MNT_UPDATE path of hfs_mount).
 *
 * Applies the user-supplied hfs_mount_args to an already-mounted
 * volume: timezone, default uid/gid and permission masks, the
 * unknown-permissions mode, and (HFS standard only) the text encoding.
 * If any change affects cached per-vnode state, every active vnode is
 * visited via hfs_changefs_callback() to bring it up to date.
 * Returns 0 or an errno value.
 */
static int
hfs_changefs(struct mount *mp, struct hfs_mount_args *args)
{
	int retval = 0;
	int namefix, permfix, permswitch;
	struct hfsmount *hfsmp;
	ExtendedVCB *vcb;
	struct hfs_changefs_cargs cargs;
	u_int32_t mount_flags;

#if CONFIG_HFS_STD
	u_int32_t old_encoding = 0;
	hfs_to_unicode_func_t	get_unicode_func;
	unicode_to_hfs_func_t	get_hfsname_func;
#endif

	hfsmp = VFSTOHFS(mp);
	vcb = HFSTOVCB(hfsmp);
	mount_flags = (unsigned int)vfs_flags(mp);

	/* Flag that a changefs pass is underway (cleared at exit:). */
	hfsmp->hfs_flags |= HFS_IN_CHANGEFS;

	/* Non-zero when the effective MNT_UNKNOWNPERMISSIONS state is flipping. */
	permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) &&
	               ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) ||
	              (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) &&
	               (mount_flags & MNT_UNKNOWNPERMISSIONS)));

	/* The root filesystem must operate with actual permissions: */
	if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) {
		vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS));	/* Just say "No". */
		retval = EINVAL;
		goto exit;
	}
	if (mount_flags & MNT_UNKNOWNPERMISSIONS)
		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
	else
		hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS;

	namefix = permfix = 0;

	/*
	 * Tracking of hot files requires up-to-date access times.  So if
	 * access time updates are disabled, we must also disable hot files.
	 */
	if (mount_flags & MNT_NOATIME) {
		(void) hfs_recording_suspend(hfsmp);
	}

	/* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */
	if (args->hfs_timezone.tz_minuteswest != VNOVAL) {
		gTimeZone = args->hfs_timezone;
	}

	/* Change the default uid, gid and/or mask */
	if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) {
		hfsmp->hfs_uid = args->hfs_uid;
		if (vcb->vcbSigWord == kHFSPlusSigWord)
			++permfix;
	}
	if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) {
		hfsmp->hfs_gid = args->hfs_gid;
		if (vcb->vcbSigWord == kHFSPlusSigWord)
			++permfix;
	}
	if (args->hfs_mask != (mode_t)VNOVAL) {
		if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) {
			hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
			hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
			/* HFSFSMNT_NOXONFILES strips execute bits from plain files. */
			if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES))
				hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
			if (vcb->vcbSigWord == kHFSPlusSigWord)
				++permfix;
		}
	}

#if CONFIG_HFS_STD
	/* Change the hfs encoding value (hfs only) */
	if ((vcb->vcbSigWord == kHFSSigWord)	&&
	    (args->hfs_encoding != (u_int32_t)VNOVAL)              &&
	    (hfsmp->hfs_encoding != args->hfs_encoding)) {

		retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func);
		if (retval)
			goto exit;

		/*
		 * Connect the new hfs_get_unicode converter but leave
		 * the old hfs_get_hfsname converter in place so that
		 * we can lookup existing vnodes to get their correctly
		 * encoded names.
		 *
		 * When we're all finished, we can then connect the new
		 * hfs_get_hfsname converter and release our interest
		 * in the old converters.
		 */
		hfsmp->hfs_get_unicode = get_unicode_func;
		old_encoding = hfsmp->hfs_encoding;
		hfsmp->hfs_encoding = args->hfs_encoding;
		++namefix;
	}
#endif

	/* Nothing cached per-vnode is affected; skip the iteration. */
	if (!(namefix || permfix || permswitch))
		goto exit;

	/* XXX 3762912 hack to support HFS filesystem 'owner' */
	if (permfix)
		vfs_setowner(mp,
		    hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid,
		    hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid);

	/*
	 * For each active vnode fix things that changed
	 *
	 * Note that we can visit a vnode more than once
	 * and we can race with fsync.
	 *
	 * hfs_changefs_callback will be called for each vnode
	 * hung off of this mount point
	 *
	 * The vnode will be properly referenced and unreferenced
	 * around the callback
	 */
	cargs.hfsmp = hfsmp;
	cargs.namefix = namefix;
	cargs.permfix = permfix;
	cargs.permswitch = permswitch;

	vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs);

#if CONFIG_HFS_STD
	/*
	 * If we're switching name converters we can now
	 * connect the new hfs_get_hfsname converter and
	 * release our interest in the old converters.
	 */
	if (namefix) {
		/* HFS standard only */
		hfsmp->hfs_get_hfsname = get_hfsname_func;
		vcb->volumeNameEncodingHint = args->hfs_encoding;
		(void) hfs_relconverter(old_encoding);
	}
#endif

exit:
	hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS;
	return (retval);
}
769
770
/*
 * Context passed from hfs_reload() to hfs_reload_callback(); carries the
 * mount in and the first iteration error (if any) back out.
 */
struct hfs_reload_cargs {
	struct hfsmount *hfsmp;		/* mount being reloaded */
        int		error;		/* first error from cat_idlookup, or 0 */
};
775
/*
 * vnode_iterate callback used by hfs_reload().
 *
 * Invalidates the vnode's cached buffers and directory hints, then for
 * user files (non-system, non-resource-fork, user catalog node IDs)
 * re-reads the cnode's catalog data from disk by file ID.  Returns
 * VNODE_RETURNED to continue, or VNODE_RETURNED_DONE to stop the
 * iteration on a catalog lookup failure (error reported via cargs).
 */
static int
hfs_reload_callback(struct vnode *vp, void *cargs)
{
	struct cnode *cp;
	struct hfs_reload_cargs *args;
	int lockflags;

	args = (struct hfs_reload_cargs *)cargs;
	/*
	 * flush all the buffers associated with this node
	 */
	(void) buf_invalidateblks(vp, 0, 0, 0);

	cp = VTOC(vp);
	/*
	 * Remove any directory hints
	 */
	if (vnode_isdir(vp))
	        hfs_reldirhints(cp, 0);

	/*
	 * Re-read cnode data for all active vnodes (non-metadata files).
	 */
	if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) {
	        struct cat_fork *datafork;
		struct cat_desc desc;

		datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL;

		/* lookup by fileID since name could have changed */
		lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork);
		hfs_systemfile_unlock(args->hfsmp, lockflags);
		if (args->error) {
		        return (VNODE_RETURNED_DONE);
		}

		/* update cnode's catalog descriptor */
		(void) replace_desc(cp, &desc);
	}
	return (VNODE_RETURNED);
}
818
819/*
820 * Reload all incore data for a filesystem (used after running fsck on
821 * the root filesystem and finding things to fix). The filesystem must
822 * be mounted read-only.
823 *
824 * Things to do to update the mount:
825 *	invalidate all cached meta-data.
826 *	invalidate all inactive vnodes.
827 *	invalidate all cached file data.
828 *	re-read volume header from disk.
829 *	re-load meta-file info (extents, file size).
830 *	re-load B-tree header data.
831 *	re-read cnode data for all active vnodes.
832 */
833int
834hfs_reload(struct mount *mountp)
835{
836	register struct vnode *devvp;
837	struct buf *bp;
838	int error, i;
839	struct hfsmount *hfsmp;
840	struct HFSPlusVolumeHeader *vhp;
841	ExtendedVCB *vcb;
842	struct filefork *forkp;
843    	struct cat_desc cndesc;
844	struct hfs_reload_cargs args;
845	daddr64_t priIDSector;
846
847    	hfsmp = VFSTOHFS(mountp);
848	vcb = HFSTOVCB(hfsmp);
849
850	if (vcb->vcbSigWord == kHFSSigWord)
851		return (EINVAL);	/* rooting from HFS is not supported! */
852
853	/*
854	 * Invalidate all cached meta-data.
855	 */
856	devvp = hfsmp->hfs_devvp;
857	if (buf_invalidateblks(devvp, 0, 0, 0))
858		panic("hfs_reload: dirty1");
859
860	args.hfsmp = hfsmp;
861	args.error = 0;
862	/*
863	 * hfs_reload_callback will be called for each vnode
864	 * hung off of this mount point that can't be recycled...
865	 * vnode_iterate will recycle those that it can (the VNODE_RELOAD option)
866	 * the vnode will be in an 'unbusy' state (VNODE_WAIT) and
867	 * properly referenced and unreferenced around the callback
868	 */
869	vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args);
870
871	if (args.error)
872	        return (args.error);
873
874	/*
875	 * Re-read VolumeHeader from disk.
876	 */
877	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
878			HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
879
880	error = (int)buf_meta_bread(hfsmp->hfs_devvp,
881			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
882			hfsmp->hfs_physical_block_size, NOCRED, &bp);
883	if (error) {
884        	if (bp != NULL)
885        		buf_brelse(bp);
886		return (error);
887	}
888
889	vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
890
891	/* Do a quick sanity check */
892	if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
893	     SWAP_BE16(vhp->signature) != kHFSXSigWord) ||
894	    (SWAP_BE16(vhp->version) != kHFSPlusVersion &&
895	     SWAP_BE16(vhp->version) != kHFSXVersion) ||
896	    SWAP_BE32(vhp->blockSize) != vcb->blockSize) {
897		buf_brelse(bp);
898		return (EIO);
899	}
900
901	vcb->vcbLsMod		= to_bsd_time(SWAP_BE32(vhp->modifyDate));
902	vcb->vcbAtrb		= SWAP_BE32 (vhp->attributes);
903	vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
904	vcb->vcbClpSiz		= SWAP_BE32 (vhp->rsrcClumpSize);
905	vcb->vcbNxtCNID		= SWAP_BE32 (vhp->nextCatalogID);
906	vcb->vcbVolBkUp		= to_bsd_time(SWAP_BE32(vhp->backupDate));
907	vcb->vcbWrCnt		= SWAP_BE32 (vhp->writeCount);
908	vcb->vcbFilCnt		= SWAP_BE32 (vhp->fileCount);
909	vcb->vcbDirCnt		= SWAP_BE32 (vhp->folderCount);
910	HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation));
911	vcb->totalBlocks	= SWAP_BE32 (vhp->totalBlocks);
912	vcb->freeBlocks		= SWAP_BE32 (vhp->freeBlocks);
913	vcb->encodingsBitmap	= SWAP_BE64 (vhp->encodingsBitmap);
914	bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo));
915	vcb->localCreateDate	= SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */
916
917	/*
918	 * Re-load meta-file vnode data (extent info, file size, etc).
919	 */
920	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
921	for (i = 0; i < kHFSPlusExtentDensity; i++) {
922		forkp->ff_extents[i].startBlock =
923			SWAP_BE32 (vhp->extentsFile.extents[i].startBlock);
924		forkp->ff_extents[i].blockCount =
925			SWAP_BE32 (vhp->extentsFile.extents[i].blockCount);
926	}
927	forkp->ff_size      = SWAP_BE64 (vhp->extentsFile.logicalSize);
928	forkp->ff_blocks    = SWAP_BE32 (vhp->extentsFile.totalBlocks);
929	forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize);
930
931
932	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
933	for (i = 0; i < kHFSPlusExtentDensity; i++) {
934		forkp->ff_extents[i].startBlock	=
935			SWAP_BE32 (vhp->catalogFile.extents[i].startBlock);
936		forkp->ff_extents[i].blockCount	=
937			SWAP_BE32 (vhp->catalogFile.extents[i].blockCount);
938	}
939	forkp->ff_size      = SWAP_BE64 (vhp->catalogFile.logicalSize);
940	forkp->ff_blocks    = SWAP_BE32 (vhp->catalogFile.totalBlocks);
941	forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize);
942
943	if (hfsmp->hfs_attribute_vp) {
944		forkp = VTOF(hfsmp->hfs_attribute_vp);
945		for (i = 0; i < kHFSPlusExtentDensity; i++) {
946			forkp->ff_extents[i].startBlock	=
947				SWAP_BE32 (vhp->attributesFile.extents[i].startBlock);
948			forkp->ff_extents[i].blockCount	=
949				SWAP_BE32 (vhp->attributesFile.extents[i].blockCount);
950		}
951		forkp->ff_size      = SWAP_BE64 (vhp->attributesFile.logicalSize);
952		forkp->ff_blocks    = SWAP_BE32 (vhp->attributesFile.totalBlocks);
953		forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize);
954	}
955
956	forkp = VTOF((struct vnode *)vcb->allocationsRefNum);
957	for (i = 0; i < kHFSPlusExtentDensity; i++) {
958		forkp->ff_extents[i].startBlock	=
959			SWAP_BE32 (vhp->allocationFile.extents[i].startBlock);
960		forkp->ff_extents[i].blockCount	=
961			SWAP_BE32 (vhp->allocationFile.extents[i].blockCount);
962	}
963	forkp->ff_size      = SWAP_BE64 (vhp->allocationFile.logicalSize);
964	forkp->ff_blocks    = SWAP_BE32 (vhp->allocationFile.totalBlocks);
965	forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize);
966
967	buf_brelse(bp);
968	vhp = NULL;
969
970	/*
971	 * Re-load B-tree header data
972	 */
973	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
974	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
975		return (error);
976
977	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
978	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
979		return (error);
980
981	if (hfsmp->hfs_attribute_vp) {
982		forkp = VTOF(hfsmp->hfs_attribute_vp);
983		if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
984			return (error);
985	}
986
987	/* Reload the volume name */
988	if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL)))
989		return (error);
990	vcb->volumeNameEncodingHint = cndesc.cd_encoding;
991	bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen));
992	cat_releasedesc(&cndesc);
993
994	/* Re-establish private/hidden directories. */
995	hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
996	hfs_privatedir_init(hfsmp, DIR_HARDLINKS);
997
998	/* In case any volume information changed to trigger a notification */
999	hfs_generate_volume_notifications(hfsmp);
1000
1001	return (0);
1002}
1003
1004__unused
1005static uint64_t tv_to_usecs(struct timeval *tv)
1006{
1007	return tv->tv_sec * 1000000ULL + tv->tv_usec;
1008}
1009
1010// Returns TRUE if b - a >= usecs
1011static boolean_t hfs_has_elapsed (const struct timeval *a,
1012                                  const struct timeval *b,
1013                                  uint64_t usecs)
1014{
1015    struct timeval diff;
1016    timersub(b, a, &diff);
1017    return diff.tv_sec * 1000000ULL + diff.tv_usec >= usecs;
1018}
1019
/*
 * hfs_syncer - deferred metadata flush handler, run as a thread call.
 *
 * arg0 is the struct hfsmount for the volume; 'unused' is ignored.
 * Scheduled (via hfs_syncer_queue) when a sync has been requested.  Each
 * invocation either performs a synchronous flush now, or re-arms itself
 * to check again later, depending on how long the oldest request has been
 * outstanding and whether writes are still pending.  Also cooperates with
 * hfs_unmount: if hfsmp->hfs_syncer has been cleared, it backs off and
 * wakes the unmounting thread instead of flushing.
 */
static void
hfs_syncer(void *arg0, void *unused)
{
#pragma unused(unused)

    struct hfsmount *hfsmp = arg0;
    struct timeval   now;

    microuptime(&now);

    KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER | DBG_FUNC_START, hfsmp,
                          tv_to_usecs(&now),
                          tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp),
                          hfsmp->hfs_mp->mnt_pending_write_size, 0);

    hfs_syncer_lock(hfsmp);

    if (!hfsmp->hfs_syncer) {
        // hfs_unmount is waiting for us leave now and let it do the sync
        hfsmp->hfs_sync_incomplete = FALSE;
        hfs_syncer_unlock(hfsmp);
        hfs_syncer_wakeup(hfsmp);
        return;
    }

    /* Check to see whether we should flush now: either the oldest is
       > HFS_MAX_META_DELAY or HFS_META_DELAY has elapsed since the
       request and there are no pending writes. */

    boolean_t flush_now = FALSE;

    if (hfs_has_elapsed(&hfsmp->hfs_sync_req_oldest, &now, HFS_MAX_META_DELAY))
        flush_now = TRUE;
    else if (!hfsmp->hfs_mp->mnt_pending_write_size) {
        /* N.B. accessing mnt_last_write_completed_timestamp is not thread safe, but
           it won't matter for what we're using it for. */
        if (hfs_has_elapsed(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp,
                            &now,
                            HFS_META_DELAY)) {
            flush_now = TRUE;
        }
    }

    if (!flush_now) {
        // Not time yet: re-arm the thread call and try again later.
        // Grab the thread_call pointer before dropping the lock so a
        // concurrent teardown can't pull it out from under us.
        thread_call_t syncer = hfsmp->hfs_syncer;

        hfs_syncer_unlock(hfsmp);

        hfs_syncer_queue(syncer);

        return;
    }

    // We're committed to flushing; clear the pending-request timestamp.
    timerclear(&hfsmp->hfs_sync_req_oldest);

    hfs_syncer_unlock(hfsmp);

    KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_START,
                          tv_to_usecs(&now),
                          tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp),
                          tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp),
                          hfsmp->hfs_mp->mnt_pending_write_size, 0);

    /* NOTE(review): this early return leaves hfs_sync_incomplete set and
       performs no wakeup, unlike the other exit paths below — if this
       branch is ever taken while hfs_unmount is waiting, the unmount could
       stall.  Presumably it is believed unreachable (thread calls don't
       run concurrently for the same target) — confirm. */
    if (hfsmp->hfs_syncer_thread) {
        printf("hfs: syncer already running!");
		return;
	}

    hfsmp->hfs_syncer_thread = current_thread();

    hfs_start_transaction(hfsmp);   // so we hold off any new writes

    /*
     * We intentionally do a synchronous flush (of the journal or entire volume) here.
     * For journaled volumes, this means we wait until the metadata blocks are written
     * to both the journal and their final locations (in the B-trees, etc.).
     *
     * This tends to avoid interleaving the metadata writes with other writes (for
     * example, user data, or to the journal when a later transaction notices that
     * an earlier transaction has finished its async writes, and then updates the
     * journal start in the journal header).  Avoiding interleaving of writes is
     * very good for performance on simple flash devices like SD cards, thumb drives;
     * and on devices like floppies.  Since removable devices tend to be this kind of
     * simple device, doing a synchronous flush actually improves performance in
     * practice.
     *
     * NOTE: For non-journaled volumes, the call to hfs_sync will also cause dirty
     * user data to be written.
     */
    if (hfsmp->jnl) {
        hfs_journal_flush(hfsmp, TRUE);
    } else {
        hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
    }

    KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER_TIMED | DBG_FUNC_END,
                          (microuptime(&now), tv_to_usecs(&now)),
                          tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_completed_timestamp),
                          tv_to_usecs(&hfsmp->hfs_mp->mnt_last_write_issued_timestamp),
                          hfsmp->hfs_mp->mnt_pending_write_size, 0);

    hfs_end_transaction(hfsmp);

    hfsmp->hfs_syncer_thread = NULL;

    hfs_syncer_lock(hfsmp);

    // If hfs_unmount lets us and we missed a sync, schedule again
    if (hfsmp->hfs_syncer && timerisset(&hfsmp->hfs_sync_req_oldest)) {
        thread_call_t syncer = hfsmp->hfs_syncer;

        hfs_syncer_unlock(hfsmp);

        hfs_syncer_queue(syncer);
    } else {
        // Done for now; tell any waiter (hfs_unmount) the sync completed.
        hfsmp->hfs_sync_incomplete = FALSE;
        hfs_syncer_unlock(hfsmp);
        hfs_syncer_wakeup(hfsmp);
    }

    /* BE CAREFUL WHAT YOU ADD HERE: at this point hfs_unmount is free
       to continue and therefore hfsmp might be invalid. */

    KERNEL_DEBUG_CONSTANT(HFSDBG_SYNCER | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
1145
1146
1147extern int IOBSDIsMediaEjectable( const char *cdev_name );
1148
1149/*
1150 * Call into the allocator code and perform a full scan of the bitmap file.
1151 *
1152 * This allows us to TRIM unallocated ranges if needed, and also to build up
1153 * an in-memory summary table of the state of the allocated blocks.
1154 */
1155void hfs_scan_blocks (struct hfsmount *hfsmp) {
1156	/*
1157	 * Take the allocation file lock.  Journal transactions will block until
1158	 * we're done here.
1159	 */
1160
1161	int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1162
1163	/*
1164	 * We serialize here with the HFS mount lock as we're mounting.
1165	 *
1166	 * The mount can only proceed once this thread has acquired the bitmap
1167	 * lock, since we absolutely do not want someone else racing in and
1168	 * getting the bitmap lock, doing a read/write of the bitmap file,
1169	 * then us getting the bitmap lock.
1170	 *
1171	 * To prevent this, the mount thread takes the HFS mount mutex, starts us
1172	 * up, then immediately msleeps on the scan_var variable in the mount
1173	 * point as a condition variable.  This serialization is safe since
1174	 * if we race in and try to proceed while they're still holding the lock,
1175	 * we'll block trying to acquire the global lock.  Since the mount thread
1176	 * acquires the HFS mutex before starting this function in a new thread,
1177	 * any lock acquisition on our part must be linearizably AFTER the mount thread's.
1178	 *
1179	 * Note that the HFS mount mutex is always taken last, and always for only
1180	 * a short time.  In this case, we just take it long enough to mark the
1181	 * scan-in-flight bit.
1182	 */
1183	(void) hfs_lock_mount (hfsmp);
1184	hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_INFLIGHT;
1185	wakeup((caddr_t) &hfsmp->scan_var);
1186	hfs_unlock_mount (hfsmp);
1187
1188	/* Initialize the summary table */
1189	if (hfs_init_summary (hfsmp)) {
1190		printf("hfs: could not initialize summary table for %s\n", hfsmp->vcbVN);
1191	}
1192
1193	/*
1194	 * ScanUnmapBlocks assumes that the bitmap lock is held when you
1195	 * call the function. We don't care if there were any errors issuing unmaps.
1196	 *
1197	 * It will also attempt to build up the summary table for subsequent
1198	 * allocator use, as configured.
1199	 */
1200	(void) ScanUnmapBlocks(hfsmp);
1201
1202	hfs_systemfile_unlock(hfsmp, flags);
1203}
1204
1205static int hfs_root_unmounted_cleanly = 0;
1206
1207SYSCTL_DECL(_vfs_generic);
1208SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
1209
1210/*
1211 * Common code for mount and mountroot
1212 */
1213int
1214hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
1215            int journal_replay_only, vfs_context_t context)
1216{
1217	struct proc *p = vfs_context_proc(context);
1218	int retval = E_NONE;
1219	struct hfsmount	*hfsmp = NULL;
1220	struct buf *bp;
1221	dev_t dev;
1222	HFSMasterDirectoryBlock *mdbp = NULL;
1223	int ronly;
1224#if QUOTA
1225	int i;
1226#endif
1227	int mntwrapper;
1228	kauth_cred_t cred;
1229	u_int64_t disksize;
1230	daddr64_t log_blkcnt;
1231	u_int32_t log_blksize;
1232	u_int32_t phys_blksize;
1233	u_int32_t minblksize;
1234	u_int32_t iswritable;
1235	daddr64_t mdb_offset;
1236	int isvirtual = 0;
1237	int isroot = 0;
1238	u_int32_t device_features = 0;
1239	int isssd;
1240
1241	if (args == NULL) {
1242		/* only hfs_mountroot passes us NULL as the 'args' argument */
1243		isroot = 1;
1244	}
1245
1246	ronly = vfs_isrdonly(mp);
1247	dev = vnode_specrdev(devvp);
1248	cred = p ? vfs_context_ucred(context) : NOCRED;
1249	mntwrapper = 0;
1250
1251	bp = NULL;
1252	hfsmp = NULL;
1253	mdbp = NULL;
1254	minblksize = kHFSBlockSize;
1255
1256	/* Advisory locking should be handled at the VFS layer */
1257	vfs_setlocklocal(mp);
1258
1259	/* Get the logical block size (treated as physical block size everywhere) */
1260	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) {
1261		if (HFS_MOUNT_DEBUG) {
1262			printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n");
1263		}
1264		retval = ENXIO;
1265		goto error_exit;
1266	}
1267	if (log_blksize == 0 || log_blksize > 1024*1024*1024) {
1268		printf("hfs: logical block size 0x%x looks bad.  Not mounting.\n", log_blksize);
1269		retval = ENXIO;
1270		goto error_exit;
1271	}
1272
1273	/* Get the physical block size. */
1274	retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context);
1275	if (retval) {
1276		if ((retval != ENOTSUP) && (retval != ENOTTY)) {
1277			if (HFS_MOUNT_DEBUG) {
1278				printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n");
1279			}
1280			retval = ENXIO;
1281			goto error_exit;
1282		}
1283		/* If device does not support this ioctl, assume that physical
1284		 * block size is same as logical block size
1285		 */
1286		phys_blksize = log_blksize;
1287	}
1288	if (phys_blksize == 0 || phys_blksize > MAXBSIZE) {
1289		printf("hfs: physical block size 0x%x looks bad.  Not mounting.\n", phys_blksize);
1290		retval = ENXIO;
1291		goto error_exit;
1292	}
1293
1294	/* Switch to 512 byte sectors (temporarily) */
1295	if (log_blksize > 512) {
1296		u_int32_t size512 = 512;
1297
1298		if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) {
1299			if (HFS_MOUNT_DEBUG) {
1300				printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n");
1301			}
1302			retval = ENXIO;
1303			goto error_exit;
1304		}
1305	}
1306	/* Get the number of 512 byte physical blocks. */
1307	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1308		/* resetting block size may fail if getting block count did */
1309		(void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context);
1310		if (HFS_MOUNT_DEBUG) {
1311			printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n");
1312		}
1313		retval = ENXIO;
1314		goto error_exit;
1315	}
1316	/* Compute an accurate disk size (i.e. within 512 bytes) */
1317	disksize = (u_int64_t)log_blkcnt * (u_int64_t)512;
1318
1319	/*
1320	 * On Tiger it is not necessary to switch the device
1321	 * block size to be 4k if there are more than 31-bits
1322	 * worth of blocks but to insure compatibility with
1323	 * pre-Tiger systems we have to do it.
1324	 *
1325	 * If the device size is not a multiple of 4K (8 * 512), then
1326	 * switching the logical block size isn't going to help because
1327	 * we will be unable to write the alternate volume header.
1328	 * In this case, just leave the logical block size unchanged.
1329	 */
1330	if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
1331		minblksize = log_blksize = 4096;
1332		if (phys_blksize < log_blksize)
1333			phys_blksize = log_blksize;
1334	}
1335
1336	/*
1337	 * The cluster layer is not currently prepared to deal with a logical
1338	 * block size larger than the system's page size.  (It can handle
1339	 * blocks per page, but not multiple pages per block.)  So limit the
1340	 * logical block size to the page size.
1341	 */
1342	if (log_blksize > PAGE_SIZE) {
1343		log_blksize = PAGE_SIZE;
1344	}
1345
1346	/* Now switch to our preferred physical block size. */
1347	if (log_blksize > 512) {
1348		if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1349			if (HFS_MOUNT_DEBUG) {
1350				printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n");
1351			}
1352			retval = ENXIO;
1353			goto error_exit;
1354		}
1355		/* Get the count of physical blocks. */
1356		if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1357			if (HFS_MOUNT_DEBUG) {
1358				printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n");
1359			}
1360			retval = ENXIO;
1361			goto error_exit;
1362		}
1363	}
1364	/*
1365	 * At this point:
1366	 *   minblksize is the minimum physical block size
1367	 *   log_blksize has our preferred physical block size
1368	 *   log_blkcnt has the total number of physical blocks
1369	 */
1370
1371	mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize);
1372	if ((retval = (int)buf_meta_bread(devvp,
1373				HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)),
1374				phys_blksize, cred, &bp))) {
1375		if (HFS_MOUNT_DEBUG) {
1376			printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval);
1377		}
1378		goto error_exit;
1379	}
1380	MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
1381	if (mdbp == NULL) {
1382		retval = ENOMEM;
1383		if (HFS_MOUNT_DEBUG) {
1384			printf("hfs_mountfs: MALLOC failed\n");
1385		}
1386		goto error_exit;
1387	}
1388	bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize);
1389	buf_brelse(bp);
1390	bp = NULL;
1391
1392	MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
1393	if (hfsmp == NULL) {
1394		if (HFS_MOUNT_DEBUG) {
1395			printf("hfs_mountfs: MALLOC (2) failed\n");
1396		}
1397		retval = ENOMEM;
1398		goto error_exit;
1399	}
1400	bzero(hfsmp, sizeof(struct hfsmount));
1401
1402	hfs_chashinit_finish(hfsmp);
1403
1404	/* Init the ID lookup hashtable */
1405	hfs_idhash_init (hfsmp);
1406
1407	/*
1408	 * See if the disk supports unmap (trim).
1409	 *
1410	 * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field
1411	 * returned by vfs_ioattr.  We need to call VNOP_IOCTL ourselves.
1412	 */
1413	if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) {
1414		if (device_features & DK_FEATURE_UNMAP) {
1415			hfsmp->hfs_flags |= HFS_UNMAP;
1416		}
1417	}
1418
1419	/*
1420	 * See if the disk is a solid state device, too.  We need this to decide what to do about
1421	 * hotfiles.
1422	 */
1423	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) {
1424		if (isssd) {
1425			hfsmp->hfs_flags |= HFS_SSD;
1426		}
1427	}
1428
1429
1430	/*
1431	 *  Init the volume information structure
1432	 */
1433
1434	lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
1435	lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
1436	lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
1437	lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr);
1438	lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr);
1439
1440	vfs_setfsprivate(mp, hfsmp);
1441	hfsmp->hfs_mp = mp;			/* Make VFSTOHFS work */
1442	hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
1443	hfsmp->hfs_devvp = devvp;
1444	vnode_ref(devvp);  /* Hold a ref on the device, dropped when hfsmp is freed. */
1445	hfsmp->hfs_logical_block_size = log_blksize;
1446	hfsmp->hfs_logical_block_count = log_blkcnt;
1447	hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1448	hfsmp->hfs_physical_block_size = phys_blksize;
1449	hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize);
1450	hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1451	if (ronly)
1452		hfsmp->hfs_flags |= HFS_READ_ONLY;
1453	if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
1454		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
1455
1456#if QUOTA
1457	for (i = 0; i < MAXQUOTAS; i++)
1458		dqfileinit(&hfsmp->hfs_qfiles[i]);
1459#endif
1460
1461	if (args) {
1462		hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
1463		if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
1464		hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid;
1465		if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID;
1466		vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);				/* tell the VFS */
1467		if (args->hfs_mask != (mode_t)VNOVAL) {
1468			hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
1469			if (args->flags & HFSFSMNT_NOXONFILES) {
1470				hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
1471			} else {
1472				hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
1473			}
1474		} else {
1475			hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;		/* 0777: rwx---rwx */
1476			hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;	/* 0666: no --x by default? */
1477		}
1478		if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER))
1479			mntwrapper = 1;
1480	} else {
1481		/* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */
1482		if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) {
1483			hfsmp->hfs_uid = UNKNOWNUID;
1484			hfsmp->hfs_gid = UNKNOWNGID;
1485			vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);			/* tell the VFS */
1486			hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;		/* 0777: rwx---rwx */
1487			hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;	/* 0666: no --x by default? */
1488		}
1489	}
1490
1491	/* Find out if disk media is writable. */
1492	if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) {
1493		if (iswritable)
1494			hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1495		else
1496			hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1497	}
1498
1499	// record the current time at which we're mounting this volume
1500	struct timeval tv;
1501	microtime(&tv);
1502	hfsmp->hfs_mount_time = tv.tv_sec;
1503
1504	/* Mount a standard HFS disk */
1505	if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) &&
1506	    (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) {
1507#if CONFIG_HFS_STD
1508		/* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */
1509		if (vfs_isrdwr(mp)) {
1510			retval = EROFS;
1511			goto error_exit;
1512		}
1513
1514		printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n");
1515
1516		/* Treat it as if it's read-only and not writeable */
1517		hfsmp->hfs_flags |= HFS_READ_ONLY;
1518		hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1519
1520	   	/* If only journal replay is requested, exit immediately */
1521		if (journal_replay_only) {
1522			retval = 0;
1523			goto error_exit;
1524		}
1525
1526	        if ((vfs_flags(mp) & MNT_ROOTFS)) {
1527			retval = EINVAL;  /* Cannot root from HFS standard disks */
1528			goto error_exit;
1529		}
1530		/* HFS disks can only use 512 byte physical blocks */
1531		if (log_blksize > kHFSBlockSize) {
1532			log_blksize = kHFSBlockSize;
1533			if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1534				retval = ENXIO;
1535				goto error_exit;
1536			}
1537			if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1538				retval = ENXIO;
1539				goto error_exit;
1540			}
1541			hfsmp->hfs_logical_block_size = log_blksize;
1542			hfsmp->hfs_logical_block_count = log_blkcnt;
1543			hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1544			hfsmp->hfs_physical_block_size = log_blksize;
1545			hfsmp->hfs_log_per_phys = 1;
1546		}
1547		if (args) {
1548			hfsmp->hfs_encoding = args->hfs_encoding;
1549			HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding;
1550
1551			/* establish the timezone */
1552			gTimeZone = args->hfs_timezone;
1553		}
1554
1555		retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode,
1556					&hfsmp->hfs_get_hfsname);
1557		if (retval)
1558			goto error_exit;
1559
1560		retval = hfs_MountHFSVolume(hfsmp, mdbp, p);
1561		if (retval)
1562			(void) hfs_relconverter(hfsmp->hfs_encoding);
1563#else
1564		/* On platforms where HFS Standard is not supported, deny the mount altogether */
1565		retval = EINVAL;
1566		goto error_exit;
1567#endif
1568
1569	}
1570	else { /* Mount an HFS Plus disk */
1571		HFSPlusVolumeHeader *vhp;
1572		off_t embeddedOffset;
1573		int   jnl_disable = 0;
1574
1575		/* Get the embedded Volume Header */
1576		if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
1577			embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize;
1578			embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) *
1579			                  (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1580
1581			/*
1582			 * If the embedded volume doesn't start on a block
1583			 * boundary, then switch the device to a 512-byte
1584			 * block size so everything will line up on a block
1585			 * boundary.
1586			 */
1587			if ((embeddedOffset % log_blksize) != 0) {
1588				printf("hfs_mountfs: embedded volume offset not"
1589				    " a multiple of physical block size (%d);"
1590				    " switching to 512\n", log_blksize);
1591				log_blksize = 512;
1592				if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE,
1593				    (caddr_t)&log_blksize, FWRITE, context)) {
1594
1595					if (HFS_MOUNT_DEBUG) {
1596						printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n");
1597					}
1598					retval = ENXIO;
1599					goto error_exit;
1600				}
1601				if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT,
1602				    (caddr_t)&log_blkcnt, 0, context)) {
1603					if (HFS_MOUNT_DEBUG) {
1604						printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n");
1605					}
1606					retval = ENXIO;
1607					goto error_exit;
1608				}
1609				/* Note: relative block count adjustment */
1610				hfsmp->hfs_logical_block_count *=
1611				    hfsmp->hfs_logical_block_size / log_blksize;
1612
1613				/* Update logical /physical block size */
1614				hfsmp->hfs_logical_block_size = log_blksize;
1615				hfsmp->hfs_physical_block_size = log_blksize;
1616
1617				phys_blksize = log_blksize;
1618				hfsmp->hfs_log_per_phys = 1;
1619			}
1620
1621			disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
1622			           (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1623
1624			hfsmp->hfs_logical_block_count = disksize / log_blksize;
1625
1626			hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1627
1628			mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1629			retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1630					phys_blksize, cred, &bp);
1631			if (retval) {
1632				if (HFS_MOUNT_DEBUG) {
1633					printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval);
1634				}
1635				goto error_exit;
1636			}
1637			bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512);
1638			buf_brelse(bp);
1639			bp = NULL;
1640			vhp = (HFSPlusVolumeHeader*) mdbp;
1641
1642		}
1643		else { /* pure HFS+ */
1644			embeddedOffset = 0;
1645			vhp = (HFSPlusVolumeHeader*) mdbp;
1646		}
1647
1648		if (isroot) {
1649			hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0);
1650		}
1651
1652		/*
1653		 * On inconsistent disks, do not allow read-write mount
1654		 * unless it is the boot volume being mounted.  We also
1655		 * always want to replay the journal if the journal_replay_only
1656		 * flag is set because that will (most likely) get the
1657		 * disk into a consistent state before fsck_hfs starts
1658		 * looking at it.
1659		 */
1660		if (  !(vfs_flags(mp) & MNT_ROOTFS)
1661		   && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask)
1662		   && !journal_replay_only
1663		   && !(hfsmp->hfs_flags & HFS_READ_ONLY)) {
1664
1665			if (HFS_MOUNT_DEBUG) {
1666				printf("hfs_mountfs: failed to mount non-root inconsistent disk\n");
1667			}
1668			retval = EINVAL;
1669			goto error_exit;
1670		}
1671
1672
1673		// XXXdbg
1674		//
1675		hfsmp->jnl = NULL;
1676		hfsmp->jvp = NULL;
1677		if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) &&
1678		    args->journal_disable) {
1679		    jnl_disable = 1;
1680		}
1681
1682		//
1683		// We only initialize the journal here if the last person
1684		// to mount this volume was journaling aware.  Otherwise
1685		// we delay journal initialization until later at the end
1686		// of hfs_MountHFSPlusVolume() because the last person who
1687		// mounted it could have messed things up behind our back
1688		// (so we need to go find the .journal file, make sure it's
1689		// the right size, re-sync up if it was moved, etc).
1690		//
1691		if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
1692			&& (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
1693			&& !jnl_disable) {
1694
1695			// if we're able to init the journal, mark the mount
1696			// point as journaled.
1697			//
1698			if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) {
1699				vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1700			} else {
1701				if (retval == EROFS) {
1702					// EROFS is a special error code that means the volume has an external
1703					// journal which we couldn't find.  in that case we do not want to
1704					// rewrite the volume header - we'll just refuse to mount the volume.
1705					if (HFS_MOUNT_DEBUG) {
1706						printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n");
1707					}
1708					retval = EINVAL;
1709					goto error_exit;
1710				}
1711
1712				// if the journal failed to open, then set the lastMountedVersion
1713				// to be "FSK!" which fsck_hfs will see and force the fsck instead
1714				// of just bailing out because the volume is journaled.
1715				if (!ronly) {
1716					if (HFS_MOUNT_DEBUG) {
1717						printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n");
1718					}
1719
1720					HFSPlusVolumeHeader *jvhp;
1721
1722				    hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1723
1724				    if (mdb_offset == 0) {
1725					mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1726				    }
1727
1728				    bp = NULL;
1729				    retval = (int)buf_meta_bread(devvp,
1730						    HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1731						    phys_blksize, cred, &bp);
1732				    if (retval == 0) {
1733					jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1734
1735					if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1736						printf ("hfs(1): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1737					    jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1738					    buf_bwrite(bp);
1739					} else {
1740					    buf_brelse(bp);
1741					}
1742					bp = NULL;
1743				    } else if (bp) {
1744					buf_brelse(bp);
1745					// clear this so the error exit path won't try to use it
1746					bp = NULL;
1747				    }
1748				}
1749
1750				// if this isn't the root device just bail out.
1751				// If it is the root device we just continue on
1752				// in the hopes that fsck_hfs will be able to
1753				// fix any damage that exists on the volume.
1754				if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1755					if (HFS_MOUNT_DEBUG) {
1756						printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n");
1757					}
1758				    retval = EINVAL;
1759				    goto error_exit;
1760				}
1761			}
1762		}
1763		// XXXdbg
1764
1765		/* Either the journal is replayed successfully, or there
1766		 * was nothing to replay, or no journal exists.  In any case,
1767		 * return success.
1768		 */
1769		if (journal_replay_only) {
1770			retval = 0;
1771			goto error_exit;
1772		}
1773
1774		(void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
1775
1776		retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1777		/*
1778		 * If the backend didn't like our physical blocksize
1779		 * then retry with physical blocksize of 512.
1780		 */
1781		if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) {
1782			printf("hfs_mountfs: could not use physical block size "
1783					"(%d) switching to 512\n", log_blksize);
1784			log_blksize = 512;
1785			if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1786				if (HFS_MOUNT_DEBUG) {
1787					printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n");
1788				}
1789				retval = ENXIO;
1790				goto error_exit;
1791			}
1792			if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1793				if (HFS_MOUNT_DEBUG) {
1794					printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n");
1795				}
1796				retval = ENXIO;
1797				goto error_exit;
1798			}
1799			devvp->v_specsize = log_blksize;
1800			/* Note: relative block count adjustment (in case this is an embedded volume). */
1801			hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
1802			hfsmp->hfs_logical_block_size = log_blksize;
1803			hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
1804
1805			hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1806
1807			if (hfsmp->jnl && hfsmp->jvp == devvp) {
1808			    // close and re-open this with the new block size
1809			    journal_close(hfsmp->jnl);
1810			    hfsmp->jnl = NULL;
1811			    if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
1812					vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1813				} else {
1814					// if the journal failed to open, then set the lastMountedVersion
1815					// to be "FSK!" which fsck_hfs will see and force the fsck instead
1816					// of just bailing out because the volume is journaled.
1817					if (!ronly) {
1818						if (HFS_MOUNT_DEBUG) {
1819							printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n");
1820						}
1821				    	HFSPlusVolumeHeader *jvhp;
1822
1823				    	hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1824
1825				    	if (mdb_offset == 0) {
1826							mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1827				    	}
1828
1829				   	 	bp = NULL;
1830				    	retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1831							phys_blksize, cred, &bp);
1832				    	if (retval == 0) {
1833							jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1834
1835							if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1836								printf ("hfs(2): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1837					    		jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1838					    		buf_bwrite(bp);
1839							} else {
1840					    		buf_brelse(bp);
1841							}
1842							bp = NULL;
1843				    	} else if (bp) {
1844							buf_brelse(bp);
1845							// clear this so the error exit path won't try to use it
1846							bp = NULL;
1847				    	}
1848					}
1849
1850					// if this isn't the root device just bail out.
1851					// If it is the root device we just continue on
1852					// in the hopes that fsck_hfs will be able to
1853					// fix any damage that exists on the volume.
1854					if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1855						if (HFS_MOUNT_DEBUG) {
1856							printf("hfs_mountfs: hfs_early_journal_init (2) failed \n");
1857						}
1858				    	retval = EINVAL;
1859				    	goto error_exit;
1860					}
1861				}
1862			}
1863
1864			/* Try again with a smaller block size... */
1865			retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1866			if (retval && HFS_MOUNT_DEBUG) {
1867				printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval);
1868			}
1869		}
1870		if (retval)
1871			(void) hfs_relconverter(0);
1872	}
1873
1874	// save off a snapshot of the mtime from the previous mount
1875	// (for matador).
1876	hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime;
1877
1878	if ( retval ) {
1879		if (HFS_MOUNT_DEBUG) {
1880			printf("hfs_mountfs: encountered failure %d \n", retval);
1881		}
1882		goto error_exit;
1883	}
1884
1885	mp->mnt_vfsstat.f_fsid.val[0] = dev;
1886	mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp);
1887	vfs_setmaxsymlen(mp, 0);
1888
1889	mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR;
1890#if NAMEDSTREAMS
1891	mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1892#endif
1893	if ((hfsmp->hfs_flags & HFS_STANDARD) == 0 ) {
1894		/* Tell VFS that we support directory hard links. */
1895		mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS;
1896	}
1897#if CONFIG_HFS_STD
1898	else {
1899		/* HFS standard doesn't support extended readdir! */
1900		mount_set_noreaddirext (mp);
1901	}
1902#endif
1903
1904	if (args) {
1905		/*
1906		 * Set the free space warning levels for a non-root volume:
1907		 *
1908		 * Set the "danger" limit to 1% of the volume size or 100MB, whichever
1909		 * is less.  Set the "warning" limit to 2% of the volume size or 150MB,
1910		 * whichever is less.  And last, set the "desired" freespace level to
1911		 * to 3% of the volume size or 200MB, whichever is less.
1912		 */
1913		hfsmp->hfs_freespace_notify_dangerlimit =
1914			MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1915				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION);
1916		hfsmp->hfs_freespace_notify_warninglimit =
1917			MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1918				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION);
1919		hfsmp->hfs_freespace_notify_desiredlevel =
1920			MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1921				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION);
1922	} else {
1923		/*
1924		 * Set the free space warning levels for the root volume:
1925		 *
1926		 * Set the "danger" limit to 5% of the volume size or 512MB, whichever
1927		 * is less.  Set the "warning" limit to 10% of the volume size or 1GB,
1928		 * whichever is less.  And last, set the "desired" freespace level to
1929		 * to 11% of the volume size or 1.25GB, whichever is less.
1930		 */
1931		hfsmp->hfs_freespace_notify_dangerlimit =
1932			MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1933				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION);
1934		hfsmp->hfs_freespace_notify_warninglimit =
1935			MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1936				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION);
1937		hfsmp->hfs_freespace_notify_desiredlevel =
1938			MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1939				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION);
1940	};
1941
1942	/* Check if the file system exists on virtual device, like disk image */
1943	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) {
1944		if (isvirtual) {
1945			hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE;
1946		}
1947	}
1948
1949	/* do not allow ejectability checks on the root device */
1950	if (isroot == 0) {
1951		if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
1952				IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
1953			hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
1954			if (hfsmp->hfs_syncer == NULL) {
1955				printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
1956						mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
1957			}
1958		}
1959	}
1960
1961	printf("hfs: mounted %s on device %s\n", (hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"),
1962            (devvp->v_name ? devvp->v_name : (isroot ? "root_device": "unknown device")));
1963
1964	/*
1965	 * Start looking for free space to drop below this level and generate a
1966	 * warning immediately if needed:
1967	 */
1968	hfsmp->hfs_notification_conditions = 0;
1969	hfs_generate_volume_notifications(hfsmp);
1970
1971	if (ronly == 0) {
1972		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1973	}
1974	FREE(mdbp, M_TEMP);
1975	return (0);
1976
1977error_exit:
1978	if (bp)
1979		buf_brelse(bp);
1980	if (mdbp)
1981		FREE(mdbp, M_TEMP);
1982
1983	if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
1984		vnode_clearmountedon(hfsmp->jvp);
1985		(void)VNOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, vfs_context_kernel());
1986		hfsmp->jvp = NULL;
1987	}
1988	if (hfsmp) {
1989		if (hfsmp->hfs_devvp) {
1990			vnode_rele(hfsmp->hfs_devvp);
1991		}
1992		hfs_locks_destroy(hfsmp);
1993		hfs_delete_chash(hfsmp);
1994		hfs_idhash_destroy (hfsmp);
1995
1996		FREE(hfsmp, M_HFSMNT);
1997		vfs_setfsprivate(mp, NULL);
1998	}
1999        return (retval);
2000}
2001
2002
2003/*
2004 * Make a filesystem operational.
2005 * Nothing to do at the moment.
2006 */
2007/* ARGSUSED */
2008static int
2009hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context)
2010{
2011	return (0);
2012}
2013
2014
2015/*
2016 * unmount system call
2017 */
/*
 * hfs_unmount - VFS unmount entry point.
 *
 * Flushes user files, tears down the background syncer, frees the
 * in-memory summary table, syncs the metadata B-tree files inside a
 * journal transaction, flushes and closes the journal, releases the
 * journal/device vnodes and finally frees the hfsmount.  With
 * MNT_FORCE set, errors from individual flush steps are ignored and
 * the unmount proceeds anyway.
 */
int
hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	struct hfsmount *hfsmp = VFSTOHFS(mp);
	int retval = E_NONE;
	int flags;		/* flags handed to hfs_flushfiles() */
	int force;		/* non-zero for MNT_FORCE: ignore flush errors */
	int started_tr = 0;	/* set once a journal transaction is open; closed on err_exit */

	flags = 0;
	force = 0;
	if (mntflags & MNT_FORCE) {
		flags |= FORCECLOSE;
		force = 1;
	}

	printf("hfs: unmount initiated on %s on device %s\n",
			(hfsmp->vcbVN ? (const char*) hfsmp->vcbVN : "unknown"),
			(hfsmp->hfs_devvp ? ((hfsmp->hfs_devvp->v_name ? hfsmp->hfs_devvp->v_name : "unknown device")) : "unknown device"));

	/* Push back all dirty user files first. */
	if ((retval = hfs_flushfiles(mp, flags, p)) && !force)
 		return (retval);

	if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
		(void) hfs_recording_suspend(hfsmp);

    // Tidy up the syncer
	if (hfsmp->hfs_syncer)
	{
        hfs_syncer_lock(hfsmp);

        /* First, make sure everything else knows we don't want any more
           requests queued. */
        thread_call_t syncer = hfsmp->hfs_syncer;
        hfsmp->hfs_syncer = NULL;

        hfs_syncer_unlock(hfsmp);

        // Now deal with requests that are outstanding
        if (hfsmp->hfs_sync_incomplete) {
            if (thread_call_cancel(syncer)) {
                // We managed to cancel the timer so we're done
                hfsmp->hfs_sync_incomplete = FALSE;
            } else {
                // Syncer must be running right now so we have to wait
                hfs_syncer_lock(hfsmp);
                while (hfsmp->hfs_sync_incomplete)
                    hfs_syncer_wait(hfsmp);
                hfs_syncer_unlock(hfsmp);
            }
        }

        // Now we're safe to free the syncer
		thread_call_free(syncer);
	}

	/* Free the in-memory summary table, if one was built. */
	if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) {
		if (hfsmp->hfs_summary_table) {
			int err = 0;
			/*
		 	 * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress
			 */
			if (hfsmp->hfs_allocation_vp) {
				err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			}
			FREE (hfsmp->hfs_summary_table, M_TEMP);
			hfsmp->hfs_summary_table = NULL;
			hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE;

			/* Only unlock if the lock above actually succeeded. */
			if (err == 0 && hfsmp->hfs_allocation_vp){
				hfs_unlock (VTOC(hfsmp->hfs_allocation_vp));
			}

		}
	}

	/*
	 * Flush out the b-trees, volume bitmap and Volume Header
	 */
	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
		retval = hfs_start_transaction(hfsmp);
		if (retval == 0) {
		    started_tr = 1;
		} else if (!force) {
		    goto err_exit;
		}

		if (hfsmp->hfs_startup_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_startup_vp));
			if (retval && !force)
				goto err_exit;
		}

		if (hfsmp->hfs_attribute_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_attribute_vp));
			if (retval && !force)
				goto err_exit;
		}

		(void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p);
		hfs_unlock(VTOC(hfsmp->hfs_catalog_vp));
		if (retval && !force)
			goto err_exit;

		(void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p);
		hfs_unlock(VTOC(hfsmp->hfs_extents_vp));
		if (retval && !force)
			goto err_exit;

		if (hfsmp->hfs_allocation_vp) {
			(void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p);
			hfs_unlock(VTOC(hfsmp->hfs_allocation_vp));
			if (retval && !force)
				goto err_exit;
		}

		/* Sync the hot-files B-tree, if present. */
		if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) {
			retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p);
			if (retval && !force)
				goto err_exit;
		}

		/* If runtime corruption was detected, indicate that the volume
		 * was not unmounted cleanly.
		 */
		if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
		} else {
			HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
		}

		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
			int i;
			u_int32_t min_start = hfsmp->totalBlocks;

			// set the nextAllocation pointer to the smallest free block number
			// we've seen so on the next mount we won't rescan unnecessarily
			lck_spin_lock(&hfsmp->vcbFreeExtLock);
			for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
				if (hfsmp->vcbFreeExt[i].startBlock < min_start) {
					min_start = hfsmp->vcbFreeExt[i].startBlock;
				}
			}
			lck_spin_unlock(&hfsmp->vcbFreeExtLock);
			if (min_start < hfsmp->nextAllocation) {
				hfsmp->nextAllocation = min_start;
			}
		}

		retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
		if (retval) {
			/* Header didn't make it out; don't claim a clean unmount. */
			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
			if (!force)
				goto err_exit;	/* could not flush everything */
		}

		if (started_tr) {
		    hfs_end_transaction(hfsmp);
		    started_tr = 0;
		}
	}

	if (hfsmp->jnl) {
		hfs_journal_flush(hfsmp, FALSE);
	}

	/*
	 *	Invalidate our caches and release metadata vnodes
	 */
	(void) hfsUnmount(hfsmp, p);

#if CONFIG_HFS_STD
	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
		(void) hfs_relconverter(hfsmp->hfs_encoding);
	}
#endif

	// XXXdbg
	if (hfsmp->jnl) {
	    journal_close(hfsmp->jnl);
	    hfsmp->jnl = NULL;
	}

	VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);

	/* Close and drop the journal vnode if it lives on a separate device. */
	if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
	    vnode_clearmountedon(hfsmp->jvp);
	    retval = VNOP_CLOSE(hfsmp->jvp,
	                       hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE,
			       vfs_context_kernel());
	    vnode_put(hfsmp->jvp);
	    hfsmp->jvp = NULL;
	}
	// XXXdbg

	/*
	 * Last chance to dump unreferenced system files.
	 */
	(void) vflush(mp, NULLVP, FORCECLOSE);

#if HFS_SPARSE_DEV
	/* Drop our reference on the backing fs (if any). */
	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) {
		struct vnode * tmpvp;

		hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
		tmpvp = hfsmp->hfs_backingfs_rootvp;
		hfsmp->hfs_backingfs_rootvp = NULLVP;
		vnode_rele(tmpvp);
	}
#endif /* HFS_SPARSE_DEV */

	vnode_rele(hfsmp->hfs_devvp);

	/* Free the locks, hash tables and the mount structure itself. */
	hfs_locks_destroy(hfsmp);
	hfs_delete_chash(hfsmp);
	hfs_idhash_destroy(hfsmp);
	FREE(hfsmp, M_HFSMNT);

	return (0);

  err_exit:
	if (started_tr) {
		hfs_end_transaction(hfsmp);
	}
	return retval;
}
2253
2254
2255/*
2256 * Return the root of a filesystem.
2257 */
2258static int
2259hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context)
2260{
2261	return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0);
2262}
2263
2264
2265/*
2266 * Do operations associated with quotas
2267 */
2268#if !QUOTA
2269static int
2270hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context)
2271{
2272	return (ENOTSUP);
2273}
2274#else
2275static int
2276hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context)
2277{
2278	struct proc *p = vfs_context_proc(context);
2279	int cmd, type, error;
2280
2281	if (uid == ~0U)
2282		uid = kauth_cred_getuid(vfs_context_ucred(context));
2283	cmd = cmds >> SUBCMDSHIFT;
2284
2285	switch (cmd) {
2286	case Q_SYNC:
2287	case Q_QUOTASTAT:
2288		break;
2289	case Q_GETQUOTA:
2290		if (uid == kauth_cred_getuid(vfs_context_ucred(context)))
2291			break;
2292		/* fall through */
2293	default:
2294		if ( (error = vfs_context_suser(context)) )
2295			return (error);
2296	}
2297
2298	type = cmds & SUBCMDMASK;
2299	if ((u_int)type >= MAXQUOTAS)
2300		return (EINVAL);
2301	if (vfs_busy(mp, LK_NOWAIT))
2302		return (0);
2303
2304	switch (cmd) {
2305
2306	case Q_QUOTAON:
2307		error = hfs_quotaon(p, mp, type, datap);
2308		break;
2309
2310	case Q_QUOTAOFF:
2311		error = hfs_quotaoff(p, mp, type);
2312		break;
2313
2314	case Q_SETQUOTA:
2315		error = hfs_setquota(mp, uid, type, datap);
2316		break;
2317
2318	case Q_SETUSE:
2319		error = hfs_setuse(mp, uid, type, datap);
2320		break;
2321
2322	case Q_GETQUOTA:
2323		error = hfs_getquota(mp, uid, type, datap);
2324		break;
2325
2326	case Q_SYNC:
2327		error = hfs_qsync(mp);
2328		break;
2329
2330	case Q_QUOTASTAT:
2331		error = hfs_quotastat(mp, type, datap);
2332		break;
2333
2334	default:
2335		error = EINVAL;
2336		break;
2337	}
2338	vfs_unbusy(mp);
2339
2340	return (error);
2341}
2342#endif /* QUOTA */
2343
2344/* Subtype is composite of bits */
2345#define HFS_SUBTYPE_JOURNALED      0x01
2346#define HFS_SUBTYPE_CASESENSITIVE  0x02
2347/* bits 2 - 6 reserved */
2348#define HFS_SUBTYPE_STANDARDHFS    0x80
2349
2350/*
2351 * Get file system statistics.
2352 */
2353int
2354hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context)
2355{
2356	ExtendedVCB *vcb = VFSTOVCB(mp);
2357	struct hfsmount *hfsmp = VFSTOHFS(mp);
2358	u_int32_t freeCNIDs;
2359	u_int16_t subtype = 0;
2360
2361	freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID;
2362
2363	sbp->f_bsize = (u_int32_t)vcb->blockSize;
2364	sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0);
2365	sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks);
2366	sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0));
2367	sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1));
2368	sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2));  /* max files is constrained by total blocks */
2369	sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail)));
2370
2371	/*
2372	 * Subtypes (flavors) for HFS
2373	 *   0:   Mac OS Extended
2374	 *   1:   Mac OS Extended (Journaled)
2375	 *   2:   Mac OS Extended (Case Sensitive)
2376	 *   3:   Mac OS Extended (Case Sensitive, Journaled)
2377	 *   4 - 127:   Reserved
2378	 * 128:   Mac OS Standard
2379	 *
2380	 */
2381	if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
2382		/* HFS+ & variants */
2383		if (hfsmp->jnl) {
2384			subtype |= HFS_SUBTYPE_JOURNALED;
2385		}
2386		if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) {
2387			subtype |= HFS_SUBTYPE_CASESENSITIVE;
2388		}
2389	}
2390#if CONFIG_HFS_STD
2391	else {
2392		/* HFS standard */
2393		subtype = HFS_SUBTYPE_STANDARDHFS;
2394	}
2395#endif
2396	sbp->f_fssubtype = subtype;
2397
2398	return (0);
2399}
2400
2401
2402//
2403// XXXdbg -- this is a callback to be used by the journal to
2404//           get meta data blocks flushed out to disk.
2405//
2406// XXXdbg -- be smarter and don't flush *every* block on each
2407//           call.  try to only flush some so we don't wind up
2408//           being too synchronous.
2409//
__private_extern__
void
hfs_sync_metadata(void *arg)
{
	struct mount *mp = (struct mount *)arg;
	struct hfsmount *hfsmp;
	ExtendedVCB *vcb;
	buf_t	bp;
	int  retval;
	daddr64_t priIDSector;	/* logical block of the primary volume header */
	hfsmp = VFSTOHFS(mp);
	vcb = HFSTOVCB(hfsmp);

	// now make sure the super block is flushed
	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
				  HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));

	retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
			hfsmp->hfs_physical_block_size, NOCRED, &bp);
	if ((retval != 0 ) && (retval != ENXIO)) {
		// ENXIO is deliberately not logged (presumably the device has
		// gone away, e.g. ejected media -- NOTE(review): confirm).
		printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n",
		       (int)priIDSector, retval);
	}

	// Write the buffer only if it holds a delayed write that is not
	// locked into the cache; otherwise just release it back.  On a read
	// failure the buffer (if any) is released too.
	if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
	    buf_bwrite(bp);
	} else if (bp) {
	    buf_brelse(bp);
	}

	// the alternate super block...
	// XXXdbg - we probably don't need to do this each and every time.
	//          hfs_btreeio.c:FlushAlternate() should flag when it was
	//          written...
	if (hfsmp->hfs_alt_id_sector) {
		retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
				hfsmp->hfs_physical_block_size, NOCRED, &bp);
		// Same delayed-write rule as for the primary header above.
		if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
		    buf_bwrite(bp);
		} else if (bp) {
		    buf_brelse(bp);
		}
	}
}
2456
2457
/* Per-iteration context handed to hfs_sync_callback() by hfs_sync(). */
struct hfs_sync_cargs {
        kauth_cred_t cred;	/* credential captured by hfs_sync() (kauth_cred_get()) */
        struct proc  *p;	/* calling process, passed through to hfs_fsync() */
        int    waitfor;		/* sync mode (e.g. MNT_WAIT) passed to hfs_fsync() */
        int    error;		/* last hfs_fsync() error seen by the callback */
};
2464
2465
2466static int
2467hfs_sync_callback(struct vnode *vp, void *cargs)
2468{
2469	struct cnode *cp;
2470	struct hfs_sync_cargs *args;
2471	int error;
2472
2473	args = (struct hfs_sync_cargs *)cargs;
2474
2475	if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) {
2476		return (VNODE_RETURNED);
2477	}
2478	cp = VTOC(vp);
2479
2480	if ((cp->c_flag & C_MODIFIED) ||
2481	    (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) ||
2482	    vnode_hasdirtyblks(vp)) {
2483	        error = hfs_fsync(vp, args->waitfor, 0, args->p);
2484
2485		if (error)
2486		        args->error = error;
2487	}
2488	hfs_unlock(cp);
2489	return (VNODE_RETURNED);
2490}
2491
2492
2493
2494/*
2495 * Go through the disk queues to initiate sandbagged IO;
2496 * go through the inodes to write those that have been modified;
2497 * initiate the writing of the super block if it has been modified.
2498 *
2499 * Note: we are always called with the filesystem marked `MPBUSY'.
2500 */
2501int
2502hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
2503{
2504	struct proc *p = vfs_context_proc(context);
2505	struct cnode *cp;
2506	struct hfsmount *hfsmp;
2507	ExtendedVCB *vcb;
2508	struct vnode *meta_vp[4];
2509	int i;
2510	int error, allerror = 0;
2511	struct hfs_sync_cargs args;
2512
2513	hfsmp = VFSTOHFS(mp);
2514
2515	/*
2516	 * hfs_changefs might be manipulating vnodes so back off
2517	 */
2518	if (hfsmp->hfs_flags & HFS_IN_CHANGEFS)
2519		return (0);
2520
2521	if (hfsmp->hfs_flags & HFS_READ_ONLY)
2522		return (EROFS);
2523
2524	/* skip over frozen volumes */
2525	if (!lck_rw_try_lock_shared(&hfsmp->hfs_insync))
2526		return 0;
2527
2528	args.cred = kauth_cred_get();
2529	args.waitfor = waitfor;
2530	args.p = p;
2531	args.error = 0;
2532	/*
2533	 * hfs_sync_callback will be called for each vnode
2534	 * hung off of this mount point... the vnode will be
2535	 * properly referenced and unreferenced around the callback
2536	 */
2537	vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
2538
2539	if (args.error)
2540	        allerror = args.error;
2541
2542	vcb = HFSTOVCB(hfsmp);
2543
2544	meta_vp[0] = vcb->extentsRefNum;
2545	meta_vp[1] = vcb->catalogRefNum;
2546	meta_vp[2] = vcb->allocationsRefNum;  /* This is NULL for standard HFS */
2547	meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
2548
2549	/* Now sync our three metadata files */
2550	for (i = 0; i < 4; ++i) {
2551		struct vnode *btvp;
2552
2553		btvp = meta_vp[i];;
2554		if ((btvp==0) || (vnode_mount(btvp) != mp))
2555			continue;
2556
2557		/* XXX use hfs_systemfile_lock instead ? */
2558		(void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2559		cp = VTOC(btvp);
2560
2561		if (((cp->c_flag &  C_MODIFIED) == 0) &&
2562		    (cp->c_touch_acctime == 0) &&
2563		    (cp->c_touch_chgtime == 0) &&
2564		    (cp->c_touch_modtime == 0) &&
2565		    vnode_hasdirtyblks(btvp) == 0) {
2566			hfs_unlock(VTOC(btvp));
2567			continue;
2568		}
2569		error = vnode_get(btvp);
2570		if (error) {
2571			hfs_unlock(VTOC(btvp));
2572			continue;
2573		}
2574		if ((error = hfs_fsync(btvp, waitfor, 0, p)))
2575			allerror = error;
2576
2577		hfs_unlock(cp);
2578		vnode_put(btvp);
2579	};
2580
2581
2582#if CONFIG_HFS_STD
2583	/*
2584	 * Force stale file system control information to be flushed.
2585	 */
2586	if (vcb->vcbSigWord == kHFSSigWord) {
2587		if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) {
2588			allerror = error;
2589		}
2590	}
2591#endif
2592
2593#if QUOTA
2594	hfs_qsync(mp);
2595#endif /* QUOTA */
2596
2597	hfs_hotfilesync(hfsmp, vfs_context_kernel());
2598
2599	/*
2600	 * Write back modified superblock.
2601	 */
2602	if (IsVCBDirty(vcb)) {
2603		error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
2604		if (error)
2605			allerror = error;
2606	}
2607
2608	if (hfsmp->jnl) {
2609	    hfs_journal_flush(hfsmp, FALSE);
2610	}
2611
2612	lck_rw_unlock_shared(&hfsmp->hfs_insync);
2613	return (allerror);
2614}
2615
2616
2617/*
2618 * File handle to vnode
2619 *
2620 * Have to be really careful about stale file handles:
2621 * - check that the cnode id is valid
2622 * - call hfs_vget() to get the locked cnode
2623 * - check for an unallocated cnode (i_mode == 0)
2624 * - check that the given client host has export rights and return
2625 *   those rights via. exflagsp and credanonp
2626 */
2627static int
2628hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context)
2629{
2630	struct hfsfid *hfsfhp;
2631	struct vnode *nvp;
2632	int result;
2633
2634	*vpp = NULL;
2635	hfsfhp = (struct hfsfid *)fhp;
2636
2637	if (fhlen < (int)sizeof(struct hfsfid))
2638		return (EINVAL);
2639
2640	result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0);
2641	if (result) {
2642		if (result == ENOENT)
2643			result = ESTALE;
2644		return result;
2645	}
2646
2647	/*
2648	 * We used to use the create time as the gen id of the file handle,
2649	 * but it is not static enough because it can change at any point
2650	 * via system calls.  We still don't have another volume ID or other
2651	 * unique identifier to use for a generation ID across reboots that
2652	 * persists until the file is removed.  Using only the CNID exposes
2653	 * us to the potential wrap-around case, but as of 2/2008, it would take
2654	 * over 2 months to wrap around if the machine did nothing but allocate
2655	 * CNIDs.  Using some kind of wrap counter would only be effective if
2656	 * each file had the wrap counter associated with it.  For now,
2657	 * we use only the CNID to identify the file as it's good enough.
2658	 */
2659
2660	*vpp = nvp;
2661
2662	hfs_unlock(VTOC(nvp));
2663	return (0);
2664}
2665
2666
2667/*
2668 * Vnode pointer to File handle
2669 */
2670/* ARGSUSED */
2671static int
2672hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context)
2673{
2674	struct cnode *cp;
2675	struct hfsfid *hfsfhp;
2676
2677	if (ISHFS(VTOVCB(vp)))
2678		return (ENOTSUP);	/* hfs standard is not exportable */
2679
2680	if (*fhlenp < (int)sizeof(struct hfsfid))
2681		return (EOVERFLOW);
2682
2683	cp = VTOC(vp);
2684	hfsfhp = (struct hfsfid *)fhp;
2685	/* only the CNID is used to identify the file now */
2686	hfsfhp->hfsfid_cnid = htonl(cp->c_fileid);
2687	hfsfhp->hfsfid_gen = htonl(cp->c_fileid);
2688	*fhlenp = sizeof(struct hfsfid);
2689
2690	return (0);
2691}
2692
2693
2694/*
2695 * Initialize HFS filesystems, done only once per boot.
2696 *
2697 * HFS is not a kext-based file system.  This makes it difficult to find
2698 * out when the last HFS file system was unmounted and call hfs_uninit()
2699 * to deallocate data structures allocated in hfs_init().  Therefore we
2700 * never deallocate memory allocated by lock attribute and group initializations
2701 * in this function.
2702 */
2703static int
2704hfs_init(__unused struct vfsconf *vfsp)
2705{
2706	static int done = 0;
2707
2708	if (done)
2709		return (0);
2710	done = 1;
2711	hfs_chashinit();
2712	hfs_converterinit();
2713
2714	BTReserveSetup();
2715
2716	hfs_lock_attr    = lck_attr_alloc_init();
2717	hfs_group_attr   = lck_grp_attr_alloc_init();
2718	hfs_mutex_group  = lck_grp_alloc_init("hfs-mutex", hfs_group_attr);
2719	hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr);
2720	hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr);
2721
2722#if HFS_COMPRESSION
2723	decmpfs_init();
2724#endif
2725
2726	return (0);
2727}
2728
2729
2730/*
2731 * Destroy all locks, mutexes and spinlocks in hfsmp on unmount or failed mount
2732 */
2733static void
2734hfs_locks_destroy(struct hfsmount *hfsmp)
2735{
2736
2737	lck_mtx_destroy(&hfsmp->hfs_mutex, hfs_mutex_group);
2738	lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group);
2739	lck_rw_destroy(&hfsmp->hfs_global_lock, hfs_rwlock_group);
2740	lck_rw_destroy(&hfsmp->hfs_insync, hfs_rwlock_group);
2741	lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group);
2742
2743	return;
2744}
2745
2746
2747static int
2748hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp)
2749{
2750	struct hfsmount * hfsmp;
2751	char fstypename[MFSNAMELEN];
2752
2753	if (vp == NULL)
2754		return (EINVAL);
2755
2756	if (!vnode_isvroot(vp))
2757		return (EINVAL);
2758
2759	vnode_vfsname(vp, fstypename);
2760	if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0)
2761		return (EINVAL);
2762
2763	hfsmp = VTOHFS(vp);
2764
2765	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2766		return (EINVAL);
2767
2768	*hfsmpp = hfsmp;
2769
2770	return (0);
2771}
2772
2773// XXXdbg
2774#include <sys/filedesc.h>
2775
2776/*
2777 * HFS filesystem related variables.
2778 */
2779int
2780hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp,
2781			user_addr_t newp, size_t newlen, vfs_context_t context)
2782{
2783	struct proc *p = vfs_context_proc(context);
2784	int error;
2785	struct hfsmount *hfsmp;
2786
2787	/* all sysctl names at this level are terminal */
2788
2789	if (name[0] == HFS_ENCODINGBIAS) {
2790		int bias;
2791
2792		bias = hfs_getencodingbias();
2793		error = sysctl_int(oldp, oldlenp, newp, newlen, &bias);
2794		if (error == 0 && newp)
2795			hfs_setencodingbias(bias);
2796		return (error);
2797
2798	} else if (name[0] == HFS_EXTEND_FS) {
2799		u_int64_t  newsize;
2800		vnode_t vp = vfs_context_cwd(context);
2801
2802		if (newp == USER_ADDR_NULL || vp == NULLVP)
2803			return (EINVAL);
2804		if ((error = hfs_getmountpoint(vp, &hfsmp)))
2805			return (error);
2806		error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize);
2807		if (error)
2808			return (error);
2809
2810		error = hfs_extendfs(hfsmp, newsize, context);
2811		return (error);
2812
2813	} else if (name[0] == HFS_ENCODINGHINT) {
2814		size_t bufsize;
2815		size_t bytes;
2816		u_int32_t hint;
2817		u_int16_t *unicode_name = NULL;
2818		char *filename = NULL;
2819
2820		if ((newlen <= 0) || (newlen > MAXPATHLEN))
2821			return (EINVAL);
2822
2823		bufsize = MAX(newlen * 3, MAXPATHLEN);
2824		MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK);
2825		if (filename == NULL) {
2826			error = ENOMEM;
2827			goto encodinghint_exit;
2828		}
2829		MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK);
2830		if (filename == NULL) {
2831			error = ENOMEM;
2832			goto encodinghint_exit;
2833		}
2834
2835		error = copyin(newp, (caddr_t)filename, newlen);
2836		if (error == 0) {
2837			error = utf8_decodestr((u_int8_t *)filename, newlen - 1, unicode_name,
2838			                       &bytes, bufsize, 0, UTF_DECOMPOSED);
2839			if (error == 0) {
2840				hint = hfs_pickencoding(unicode_name, bytes / 2);
2841				error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint);
2842			}
2843		}
2844
2845encodinghint_exit:
2846		if (unicode_name)
2847			FREE(unicode_name, M_TEMP);
2848		if (filename)
2849			FREE(filename, M_TEMP);
2850		return (error);
2851
2852	} else if (name[0] == HFS_ENABLE_JOURNALING) {
2853		// make the file system journaled...
2854		vnode_t vp = vfs_context_cwd(context);
2855		vnode_t jvp;
2856		ExtendedVCB *vcb;
2857		struct cat_attr jnl_attr;
2858	    struct cat_attr	jinfo_attr;
2859		struct cat_fork jnl_fork;
2860		struct cat_fork jinfo_fork;
2861		buf_t jib_buf;
2862		uint64_t jib_blkno;
2863		uint32_t tmpblkno;
2864		uint64_t journal_byte_offset;
2865		uint64_t journal_size;
2866		vnode_t jib_vp = NULLVP;
2867		struct JournalInfoBlock local_jib;
2868		int err = 0;
2869		void *jnl = NULL;
2870		int lockflags;
2871
2872		/* Only root can enable journaling */
2873		if (!kauth_cred_issuser(kauth_cred_get())) {
2874			return (EPERM);
2875		}
2876		if (vp == NULLVP)
2877		        return EINVAL;
2878
2879		hfsmp = VTOHFS(vp);
2880		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2881			return EROFS;
2882		}
2883		if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2884			printf("hfs: can't make a plain hfs volume journaled.\n");
2885			return EINVAL;
2886		}
2887
2888		if (hfsmp->jnl) {
2889		    printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp));
2890		    return EAGAIN;
2891		}
2892		vcb = HFSTOVCB(hfsmp);
2893
2894		/* Set up local copies of the initialization info */
2895		tmpblkno = (uint32_t) name[1];
2896		jib_blkno = (uint64_t) tmpblkno;
2897		journal_byte_offset = (uint64_t) name[2];
2898		journal_byte_offset *= hfsmp->blockSize;
2899		journal_byte_offset += hfsmp->hfsPlusIOPosOffset;
2900		journal_size = (uint64_t)((unsigned)name[3]);
2901
2902		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2903		if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
2904			BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
2905
2906			printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
2907			hfs_systemfile_unlock(hfsmp, lockflags);
2908			return EINVAL;
2909		}
2910		hfs_systemfile_unlock(hfsmp, lockflags);
2911
2912		// make sure these both exist!
2913		if (   GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
2914			|| GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) {
2915
2916			return EINVAL;
2917		}
2918
2919		/*
2920		 * At this point, we have a copy of the metadata that lives in the catalog for the
2921		 * journal info block.  Compare that the journal info block's single extent matches
2922		 * that which was passed into this sysctl.
2923		 *
2924		 * If it is different, deny the journal enable call.
2925		 */
2926		if (jinfo_fork.cf_blocks > 1) {
2927			/* too many blocks */
2928			return EINVAL;
2929		}
2930
2931		if (jinfo_fork.cf_extents[0].startBlock != jib_blkno) {
2932			/* Wrong block */
2933			return EINVAL;
2934		}
2935
2936		/*
2937		 * We want to immediately purge the vnode for the JIB.
2938		 *
2939		 * Because it was written to from userland, there's probably
2940		 * a vnode somewhere in the vnode cache (possibly with UBC backed blocks).
2941		 * So we bring the vnode into core, then immediately do whatever
2942		 * we can to flush/vclean it out.  This is because those blocks will be
2943		 * interpreted as user data, which may be treated separately on some platforms
2944		 * than metadata.  If the vnode is gone, then there cannot be backing blocks
2945		 * in the UBC.
2946		 */
2947		if (hfs_vget (hfsmp, jinfo_attr.ca_fileid, &jib_vp, 1, 0)) {
2948			return EINVAL;
2949		}
2950		/*
2951		 * Now we have a vnode for the JIB. recycle it. Because we hold an iocount
2952		 * on the vnode, we'll just mark it for termination when the last iocount
2953		 * (hopefully ours), is dropped.
2954		 */
2955		vnode_recycle (jib_vp);
2956		err = vnode_put (jib_vp);
2957		if (err) {
2958			return EINVAL;
2959		}
2960
2961		/* Initialize the local copy of the JIB (just like hfs.util) */
2962		memset (&local_jib, 'Z', sizeof(struct JournalInfoBlock));
2963		local_jib.flags = SWAP_BE32(kJIJournalInFSMask);
2964		/* Note that the JIB's offset is in bytes */
2965		local_jib.offset = SWAP_BE64(journal_byte_offset);
2966		local_jib.size = SWAP_BE64(journal_size);
2967
2968		/*
2969		 * Now write out the local JIB.  This essentially overwrites the userland
2970		 * copy of the JIB.  Read it as BLK_META to treat it as a metadata read/write.
2971		 */
2972		jib_buf = buf_getblk (hfsmp->hfs_devvp,
2973				jib_blkno * (hfsmp->blockSize / hfsmp->hfs_logical_block_size),
2974				hfsmp->blockSize, 0, 0, BLK_META);
2975		char* buf_ptr = (char*) buf_dataptr (jib_buf);
2976
2977		/* Zero out the portion of the block that won't contain JIB data */
2978		memset (buf_ptr, 0, hfsmp->blockSize);
2979
2980		bcopy(&local_jib, buf_ptr, sizeof(local_jib));
2981		if (buf_bwrite (jib_buf)) {
2982			return EIO;
2983		}
2984
2985		/* Force a flush track cache */
2986		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
2987
2988
2989		/* Now proceed with full volume sync */
2990		hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context);
2991
2992		printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
2993			   (off_t)name[2], (off_t)name[3]);
2994
2995		//
2996		// XXXdbg - note that currently (Sept, 08) hfs_util does not support
2997		//          enabling the journal on a separate device so it is safe
2998		//          to just copy hfs_devvp here.  If hfs_util gets the ability
2999		//          to dynamically enable the journal on a separate device then
3000		//          we will have to do the same thing as hfs_early_journal_init()
3001		//          to locate and open the journal device.
3002		//
3003		jvp = hfsmp->hfs_devvp;
3004		jnl = journal_create(jvp, journal_byte_offset, journal_size,
3005							 hfsmp->hfs_devvp,
3006							 hfsmp->hfs_logical_block_size,
3007							 0,
3008							 0,
3009							 hfs_sync_metadata, hfsmp->hfs_mp,
3010							 hfsmp->hfs_mp);
3011
3012		/*
3013		 * Set up the trim callback function so that we can add
3014		 * recently freed extents to the free extent cache once
3015		 * the transaction that freed them is written to the
3016		 * journal on disk.
3017		 */
3018		if (jnl)
3019			journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp);
3020
3021		if (jnl == NULL) {
3022			printf("hfs: FAILED to create the journal!\n");
3023			if (jvp && jvp != hfsmp->hfs_devvp) {
3024				vnode_clearmountedon(jvp);
3025				VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
3026			}
3027			jvp = NULL;
3028
3029			return EINVAL;
3030		}
3031
3032		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3033
3034		/*
3035		 * Flush all dirty metadata buffers.
3036		 */
3037		buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl");
3038		buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl");
3039		buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl");
3040		buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl");
3041		if (hfsmp->hfs_attribute_vp)
3042			buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl");
3043
3044		HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
3045		HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
3046		hfsmp->jvp = jvp;
3047		hfsmp->jnl = jnl;
3048
3049		// save this off for the hack-y check in hfs_remove()
3050		hfsmp->jnl_start        = (u_int32_t)name[2];
3051		hfsmp->jnl_size         = (off_t)((unsigned)name[3]);
3052		hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
3053		hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
3054
3055		vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3056
3057		hfs_unlock_global (hfsmp);
3058		hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3059
3060		{
3061			fsid_t fsid;
3062
3063			fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3064			fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3065			vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3066		}
3067		return 0;
3068	} else if (name[0] == HFS_DISABLE_JOURNALING) {
3069		// clear the journaling bit
3070		vnode_t vp = vfs_context_cwd(context);
3071
3072		/* Only root can disable journaling */
3073		if (!kauth_cred_issuser(kauth_cred_get())) {
3074			return (EPERM);
3075		}
3076		if (vp == NULLVP)
3077		        return EINVAL;
3078
3079		hfsmp = VTOHFS(vp);
3080
3081		/*
3082		 * Disabling journaling is disallowed on volumes with directory hard links
3083		 * because we have not tested the relevant code path.
3084		 */
3085		if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){
3086			printf("hfs: cannot disable journaling on volumes with directory hardlinks\n");
3087			return EPERM;
3088		}
3089
3090		printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp));
3091
3092		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3093
3094		// Lights out for you buddy!
3095		journal_close(hfsmp->jnl);
3096		hfsmp->jnl = NULL;
3097
3098		if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
3099			vnode_clearmountedon(hfsmp->jvp);
3100			VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
3101			vnode_put(hfsmp->jvp);
3102		}
3103		hfsmp->jvp = NULL;
3104		vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3105		hfsmp->jnl_start        = 0;
3106		hfsmp->hfs_jnlinfoblkid = 0;
3107		hfsmp->hfs_jnlfileid    = 0;
3108
3109		HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
3110
3111		hfs_unlock_global (hfsmp);
3112
3113		hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3114
3115		{
3116			fsid_t fsid;
3117
3118			fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3119			fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3120			vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3121		}
3122		return 0;
3123	} else if (name[0] == HFS_GET_JOURNAL_INFO) {
3124		vnode_t vp = vfs_context_cwd(context);
3125		off_t jnl_start, jnl_size;
3126
3127		if (vp == NULLVP)
3128		        return EINVAL;
3129
3130		/* 64-bit processes won't work with this sysctl -- can't fit a pointer into an int! */
3131		if (proc_is64bit(current_proc()))
3132			return EINVAL;
3133
3134		hfsmp = VTOHFS(vp);
3135	    if (hfsmp->jnl == NULL) {
3136			jnl_start = 0;
3137			jnl_size  = 0;
3138	    } else {
3139			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
3140			jnl_size  = (off_t)hfsmp->jnl_size;
3141	    }
3142
3143	    if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) {
3144			return error;
3145		}
3146	    if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) {
3147			return error;
3148		}
3149
3150		return 0;
3151	} else if (name[0] == HFS_SET_PKG_EXTENSIONS) {
3152
3153	    return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
3154
3155	} else if (name[0] == VFS_CTL_QUERY) {
3156    	struct sysctl_req *req;
3157    	union union_vfsidctl vc;
3158    	struct mount *mp;
3159 	    struct vfsquery vq;
3160
3161		req = CAST_DOWN(struct sysctl_req *, oldp);	/* we're new style vfs sysctl. */
3162
3163        error = SYSCTL_IN(req, &vc, proc_is64bit(p)? sizeof(vc.vc64):sizeof(vc.vc32));
3164		if (error) return (error);
3165
3166		mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */
3167        if (mp == NULL) return (ENOENT);
3168
3169		hfsmp = VFSTOHFS(mp);
3170		bzero(&vq, sizeof(vq));
3171		vq.vq_flags = hfsmp->hfs_notification_conditions;
3172		return SYSCTL_OUT(req, &vq, sizeof(vq));;
3173	} else if (name[0] == HFS_REPLAY_JOURNAL) {
3174		vnode_t devvp = NULL;
3175		int device_fd;
3176		if (namelen != 2) {
3177			return (EINVAL);
3178		}
3179		device_fd = name[1];
3180		error = file_vnode(device_fd, &devvp);
3181		if (error) {
3182			return error;
3183		}
3184		error = vnode_getwithref(devvp);
3185		if (error) {
3186			file_drop(device_fd);
3187			return error;
3188		}
3189		error = hfs_journal_replay(devvp, context);
3190		file_drop(device_fd);
3191		vnode_put(devvp);
3192		return error;
3193	} else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) {
3194		hfs_resize_debug = 1;
3195		printf ("hfs_sysctl: Enabled volume resize debugging.\n");
3196		return 0;
3197	}
3198
3199	return (ENOTSUP);
3200}
3201
3202/*
3203 * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support
3204 * the build_path ioctl.  We use it to leverage the code below that updates
3205 * the origin list cache if necessary
3206 */
3207
3208int
3209hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context)
3210{
3211	int error;
3212	int lockflags;
3213	struct hfsmount *hfsmp;
3214
3215	hfsmp = VFSTOHFS(mp);
3216
3217	error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0);
3218	if (error)
3219		return (error);
3220
3221	/*
3222	 * ADLs may need to have their origin state updated
3223	 * since build_path needs a valid parent.  The same is true
3224	 * for hardlinked files as well.  There isn't a race window here
3225	 * in re-acquiring the cnode lock since we aren't pulling any data
3226	 * out of the cnode; instead, we're going to the catalog.
3227	 */
3228	if ((VTOC(*vpp)->c_flag & C_HARDLINK) &&
3229	    (hfs_lock(VTOC(*vpp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0)) {
3230		cnode_t *cp = VTOC(*vpp);
3231		struct cat_desc cdesc;
3232
3233		if (!hfs_haslinkorigin(cp)) {
3234			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
3235			error = cat_findname(hfsmp, (cnid_t)ino, &cdesc);
3236			hfs_systemfile_unlock(hfsmp, lockflags);
3237			if (error == 0) {
3238				if ((cdesc.cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
3239					(cdesc.cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid)) {
3240					hfs_savelinkorigin(cp, cdesc.cd_parentcnid);
3241				}
3242				cat_releasedesc(&cdesc);
3243			}
3244		}
3245		hfs_unlock(cp);
3246	}
3247	return (0);
3248}
3249
3250
3251/*
3252 * Look up an HFS object by ID.
3253 *
3254 * The object is returned with an iocount reference and the cnode locked.
3255 *
3256 * If the object is a file then it will represent the data fork.
3257 */
int
hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted)
{
	struct vnode *vp = NULLVP;
	struct cat_desc cndesc;
	struct cat_attr cnattr;
	struct cat_fork cnfork;
	u_int32_t linkref = 0;		/* non-zero => cnid named a raw hardlink inode */
	int error;

	/* Check for cnids that shouldn't be exported. */
	if ((cnid < kHFSFirstUserCatalogNodeID) &&
	    (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) {
		return (ENOENT);
	}
	/* Don't export our private directories. */
	if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid ||
	    cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) {
		return (ENOENT);
	}
	/*
	 * Check the hash first
	 */
	vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted);
	if (vp) {
		/* Cache hit: vnode already carries an iocount (and lock, unless skiplock). */
		*vpp = vp;
		return(0);
	}

	bzero(&cndesc, sizeof(cndesc));
	bzero(&cnattr, sizeof(cnattr));
	bzero(&cnfork, sizeof(cnfork));

	/*
	 * Not in hash, lookup in catalog
	 */
	if (cnid == kHFSRootParentID) {
		/*
		 * The root's parent has no catalog record; synthesize a
		 * descriptor/attr pair for it in-line.
		 */
		static char hfs_rootname[] = "/";

		cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0];
		cndesc.cd_namelen = 1;
		cndesc.cd_parentcnid = kHFSRootParentID;
		cndesc.cd_cnid = kHFSRootFolderID;
		cndesc.cd_flags = CD_ISDIR;

		cnattr.ca_fileid = kHFSRootFolderID;
		cnattr.ca_linkcount = 1;
		cnattr.ca_entries = 1;
		cnattr.ca_dircount = 1;
		cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
	} else {
		int lockflags;
		cnid_t pid;
		const char *nameptr;

		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork);
		hfs_systemfile_unlock(hfsmp, lockflags);

		if (error) {
			*vpp = NULL;
			return (error);
		}

		/*
		 * Check for a raw hardlink inode and save its linkref.
		 */
		pid = cndesc.cd_parentcnid;
		nameptr = (const char *)cndesc.cd_nameptr;

		/* "iNode<n>" in the file-hardlink private dir: a file hardlink inode. */
		if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		    (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10);

		/* "dir_<n>" in the dir-hardlink private dir: a directory hardlink inode. */
		} else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10);

		/* "temp<n>": an open-unlinked file awaiting deletion -- not exportable. */
		} else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) {
			*vpp = NULL;
			cat_releasedesc(&cndesc);
			return (ENOENT);  /* open unlinked file */
		}
	}

	/*
	 * Finish initializing cnode descriptor for hardlinks.
	 *
	 * We need a valid name and parent for reverse lookups.
	 */
	if (linkref) {
		cnid_t lastid;
		struct cat_desc linkdesc;
		int linkerr = 0;

		cnattr.ca_linkref = linkref;
		bzero (&linkdesc, sizeof (linkdesc));

		/*
		 * If the caller supplied the raw inode value, then we don't know exactly
		 * which hardlink they wanted. It's likely that they acquired the raw inode
		 * value BEFORE the item became a hardlink, in which case, they probably
		 * want the oldest link.  So request the oldest link from the catalog.
		 *
		 * Unfortunately, this requires that we iterate through all N hardlinks. On the plus
		 * side, since we know that we want the last linkID, we can also have this one
		 * call give us back the name of the last ID, since it's going to have it in-hand...
		 */
		linkerr = hfs_lookup_lastlink (hfsmp, linkref, &lastid, &linkdesc);
		if ((linkerr == 0) && (lastid != 0)) {
			/*
			 * Release any lingering buffers attached to our local descriptor.
			 * Then copy the name and other business into the cndesc
			 */
			cat_releasedesc (&cndesc);
			bcopy (&linkdesc, &cndesc, sizeof(linkdesc));
		}
		/* If it failed, the linkref code will just use whatever it had in-hand below. */
	}

	if (linkref) {
		int newvnode_flags = 0;

		/* Hardlink inode: build the vnode and mark it as multi-path. */
		error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);
		if (error == 0) {
			VTOC(vp)->c_flag |= C_HARDLINK;
			vnode_setmultipath(vp);
		}
	} else {
		struct componentname cn;
		int newvnode_flags = 0;

		/* Supply hfs_getnewvnode with a component name. */
		MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = ISLASTCN | HASBUF;
		cn.cn_context = NULL;
		cn.cn_pnlen = MAXPATHLEN;
		cn.cn_nameptr = cn.cn_pnbuf;
		cn.cn_namelen = cndesc.cd_namelen;
		cn.cn_hash = 0;
		cn.cn_consume = 0;
		bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1);

		error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);

		/* Record a link origin for any hardlink reached by name. */
		if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) {
			hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid);
		}
		FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI);
	}
	cat_releasedesc(&cndesc);

	*vpp = vp;
	/* Caller asked for an unlocked vnode: drop the cnode lock before returning. */
	if (vp && skiplock) {
		hfs_unlock(VTOC(vp));
	}
	return (error);
}
3420
3421
3422/*
3423 * Flush out all the files in a filesystem.
3424 */
3425static int
3426#if QUOTA
3427hfs_flushfiles(struct mount *mp, int flags, struct proc *p)
3428#else
3429hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p)
3430#endif /* QUOTA */
3431{
3432	struct hfsmount *hfsmp;
3433	struct vnode *skipvp = NULLVP;
3434	int error;
3435	int accounted_root_usecounts;
3436#if QUOTA
3437	int i;
3438#endif
3439
3440	hfsmp = VFSTOHFS(mp);
3441
3442	accounted_root_usecounts = 0;
3443#if QUOTA
3444	/*
3445	 * The open quota files have an indirect reference on
3446	 * the root directory vnode.  We must account for this
3447	 * extra reference when doing the intial vflush.
3448	 */
3449	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3450		/* Find out how many quota files we have open. */
3451		for (i = 0; i < MAXQUOTAS; i++) {
3452			if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP)
3453				++accounted_root_usecounts;
3454		}
3455	}
3456#endif /* QUOTA */
3457	if (hfsmp->hfs_flags & HFS_CS) {
3458		++accounted_root_usecounts;
3459	}
3460
3461	if (accounted_root_usecounts > 0) {
3462		/* Obtain the root vnode so we can skip over it. */
3463		skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0);
3464	}
3465
3466	error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags);
3467	if (error != 0)
3468		return(error);
3469
3470	error = vflush(mp, skipvp, SKIPSYSTEM | flags);
3471
3472	if (skipvp) {
3473		/*
3474		 * See if there are additional references on the
3475		 * root vp besides the ones obtained from the open
3476		 * quota files and CoreStorage.
3477		 */
3478		if ((error == 0) &&
3479		    (vnode_isinuse(skipvp,  accounted_root_usecounts))) {
3480			error = EBUSY;  /* root directory is still open */
3481		}
3482		hfs_unlock(VTOC(skipvp));
3483		/* release the iocount from the hfs_chash_getvnode call above. */
3484		vnode_put(skipvp);
3485	}
3486	if (error && (flags & FORCECLOSE) == 0)
3487		return (error);
3488
3489#if QUOTA
3490	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
3491		for (i = 0; i < MAXQUOTAS; i++) {
3492			if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP)
3493				continue;
3494			hfs_quotaoff(p, mp, i);
3495		}
3496	}
3497#endif /* QUOTA */
3498	if (hfsmp->hfs_flags & HFS_CS) {
3499		error = VNOP_IOCTL(hfsmp->hfs_devvp, _DKIOCCSSETFSVNODE,
3500		    (caddr_t)NULL, 0, vfs_context_kernel());
3501		vnode_rele(skipvp);
3502		printf("hfs_flushfiles: VNOP_IOCTL(_DKIOCCSSETFSVNODE) failed with error code %d\n",
3503		    error);
3504
3505		/* ignore the CS error and proceed with the unmount. */
3506		error = 0;
3507	}
3508	if (skipvp) {
3509		error = vflush(mp, NULLVP, SKIPSYSTEM | flags);
3510	}
3511
3512	return (error);
3513}
3514
3515/*
3516 * Update volume encoding bitmap (HFS Plus only)
3517 *
3518 * Mark a legacy text encoding as in-use (as needed)
3519 * in the volume header of this HFS+ filesystem.
3520 */
3521__private_extern__
3522void
3523hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding)
3524{
3525#define  kIndexMacUkrainian	48  /* MacUkrainian encoding is 152 */
3526#define  kIndexMacFarsi		49  /* MacFarsi encoding is 140 */
3527
3528	u_int32_t	index;
3529
3530	switch (encoding) {
3531	case kTextEncodingMacUkrainian:
3532		index = kIndexMacUkrainian;
3533		break;
3534	case kTextEncodingMacFarsi:
3535		index = kIndexMacFarsi;
3536		break;
3537	default:
3538		index = encoding;
3539		break;
3540	}
3541
3542	/* Only mark the encoding as in-use if it wasn't already set */
3543	if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) {
3544		hfs_lock_mount (hfsmp);
3545		hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index);
3546		MarkVCBDirty(hfsmp);
3547		hfs_unlock_mount(hfsmp);
3548	}
3549}
3550
3551/*
3552 * Update volume stats
3553 *
3554 * On journal volumes this will cause a volume header flush
3555 */
3556int
3557hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
3558{
3559	struct timeval tv;
3560
3561	microtime(&tv);
3562
3563	hfs_lock_mount (hfsmp);
3564
3565	MarkVCBDirty(hfsmp);
3566	hfsmp->hfs_mtime = tv.tv_sec;
3567
3568	switch (op) {
3569	case VOL_UPDATE:
3570		break;
3571	case VOL_MKDIR:
3572		if (hfsmp->hfs_dircount != 0xFFFFFFFF)
3573			++hfsmp->hfs_dircount;
3574		if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3575			++hfsmp->vcbNmRtDirs;
3576		break;
3577	case VOL_RMDIR:
3578		if (hfsmp->hfs_dircount != 0)
3579			--hfsmp->hfs_dircount;
3580		if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3581			--hfsmp->vcbNmRtDirs;
3582		break;
3583	case VOL_MKFILE:
3584		if (hfsmp->hfs_filecount != 0xFFFFFFFF)
3585			++hfsmp->hfs_filecount;
3586		if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3587			++hfsmp->vcbNmFls;
3588		break;
3589	case VOL_RMFILE:
3590		if (hfsmp->hfs_filecount != 0)
3591			--hfsmp->hfs_filecount;
3592		if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3593			--hfsmp->vcbNmFls;
3594		break;
3595	}
3596
3597	hfs_unlock_mount (hfsmp);
3598
3599	if (hfsmp->jnl) {
3600		hfs_flushvolumeheader(hfsmp, 0, 0);
3601	}
3602
3603	return (0);
3604}
3605
3606
3607#if CONFIG_HFS_STD
/*
 * Flush the in-memory volume state of an HFS Standard volume out to its
 * on-disk Master Directory Block (and, optionally, the alternate MDB).
 *
 * All multi-byte MDB fields are stored big-endian on disk, hence the
 * SWAP_BE* conversions throughout.  Dates in the MDB are local time,
 * not UTC.  Returns 0 on success or an errno from the buffer I/O.
 */
static int
hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
{
	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
	struct filefork *fp;
	HFSMasterDirectoryBlock	*mdb;
	struct buf *bp = NULL;
	int retval;
	int sector_size;
	ByteCount namelen;

	/* Read the sector containing the primary MDB. */
	sector_size = hfsmp->hfs_logical_block_size;
	retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp);
	if (retval) {
		if (bp)
			buf_brelse(bp);
		return retval;
	}

	/* Hold the mount lock while copying in-memory state into the MDB. */
	hfs_lock_mount (hfsmp);

	mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size));

	mdb->drCrDate	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime)));
	mdb->drLsMod	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
	mdb->drAtrb	= SWAP_BE16 (vcb->vcbAtrb);
	mdb->drNmFls	= SWAP_BE16 (vcb->vcbNmFls);
	mdb->drAllocPtr	= SWAP_BE16 (vcb->nextAllocation);
	mdb->drClpSiz	= SWAP_BE32 (vcb->vcbClpSiz);
	mdb->drNxtCNID	= SWAP_BE32 (vcb->vcbNxtCNID);
	mdb->drFreeBks	= SWAP_BE16 (vcb->freeBlocks);

	/* Volume name: convert UTF-8 back to the volume's HFS encoding. */
	namelen = strlen((char *)vcb->vcbVN);
	retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN);
	/* Retry with MacRoman in case that's how it was exported. */
	if (retval)
		retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN);

	mdb->drVolBkUp	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp)));
	mdb->drWrCnt	= SWAP_BE32 (vcb->vcbWrCnt);
	mdb->drNmRtDirs	= SWAP_BE16 (vcb->vcbNmRtDirs);
	mdb->drFilCnt	= SWAP_BE32 (vcb->vcbFilCnt);
	mdb->drDirCnt	= SWAP_BE32 (vcb->vcbDirCnt);

	bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo));

	/* Extents overflow file: first three extents, size, clump size. */
	fp = VTOF(vcb->extentsRefNum);
	mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drXTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drXTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	/* Catalog file: same treatment as the extents file above. */
	fp = VTOF(vcb->catalogRefNum);
	mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drCTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drCTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	MarkVCBClean( vcb );

	hfs_unlock_mount (hfsmp);

	/* If requested, flush out the alternate MDB */
	if (altflush) {
		struct buf *alt_bp = NULL;

		if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sector_size, NOCRED, &alt_bp) == 0) {
			bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sector_size), kMDBSize);

			(void) VNOP_BWRITE(alt_bp);
		} else if (alt_bp)
			buf_brelse(alt_bp);
	}

	/* MNT_WAIT callers get a synchronous write; others an async one. */
	if (waitfor != MNT_WAIT)
		buf_bawrite(bp);
	else
		retval = VNOP_BWRITE(bp);

	return (retval);
}
3699#endif
3700
3701/*
3702 *  Flush any dirty in-memory mount data to the on-disk
3703 *  volume header.
3704 *
3705 *  Note: the on-disk volume signature is intentionally
3706 *  not flushed since the on-disk "H+" and "HX" signatures
3707 *  are always stored in-memory as "H+".
3708 */
3709int
3710hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
3711{
3712	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3713	struct filefork *fp;
3714	HFSPlusVolumeHeader *volumeHeader, *altVH;
3715	int retval;
3716	struct buf *bp, *alt_bp;
3717	int i;
3718	daddr64_t priIDSector;
3719	int critical;
3720	u_int16_t  signature;
3721	u_int16_t  hfsversion;
3722
3723	if (hfsmp->hfs_flags & HFS_READ_ONLY) {
3724		return(0);
3725	}
3726#if CONFIG_HFS_STD
3727	if (hfsmp->hfs_flags & HFS_STANDARD) {
3728		return hfs_flushMDB(hfsmp, waitfor, altflush);
3729	}
3730#endif
3731	critical = altflush;
3732	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
3733				  HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
3734
3735	if (hfs_start_transaction(hfsmp) != 0) {
3736	    return EINVAL;
3737	}
3738
3739	bp = NULL;
3740	alt_bp = NULL;
3741
3742	retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3743			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
3744			hfsmp->hfs_physical_block_size, NOCRED, &bp);
3745	if (retval) {
3746		printf("hfs: err %d reading VH blk (vol=%s)\n", retval, vcb->vcbVN);
3747		goto err_exit;
3748	}
3749
3750	volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
3751			HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3752
3753	/*
3754	 * Sanity check what we just read.  If it's bad, try the alternate
3755	 * instead.
3756	 */
3757	signature = SWAP_BE16 (volumeHeader->signature);
3758	hfsversion   = SWAP_BE16 (volumeHeader->version);
3759	if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3760	    (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
3761	    (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
3762		printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d%s\n",
3763		      vcb->vcbVN, signature, hfsversion,
3764		      SWAP_BE32 (volumeHeader->blockSize),
3765		      hfsmp->hfs_alt_id_sector ? "; trying alternate" : "");
3766		hfs_mark_volume_inconsistent(hfsmp);
3767
3768		if (hfsmp->hfs_alt_id_sector) {
3769			retval = buf_meta_bread(hfsmp->hfs_devvp,
3770			    HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3771			    hfsmp->hfs_physical_block_size, NOCRED, &alt_bp);
3772			if (retval) {
3773				printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN);
3774				goto err_exit;
3775			}
3776
3777			altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) +
3778				HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size));
3779			signature = SWAP_BE16(altVH->signature);
3780			hfsversion = SWAP_BE16(altVH->version);
3781
3782			if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3783			    (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) ||
3784			    (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) {
3785				printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n",
3786				    vcb->vcbVN, signature, hfsversion,
3787				    SWAP_BE32(altVH->blockSize));
3788				retval = EIO;
3789				goto err_exit;
3790			}
3791
3792			/* The alternate is plausible, so use it. */
3793			bcopy(altVH, volumeHeader, kMDBSize);
3794			buf_brelse(alt_bp);
3795			alt_bp = NULL;
3796		} else {
3797			/* No alternate VH, nothing more we can do. */
3798			retval = EIO;
3799			goto err_exit;
3800		}
3801	}
3802
3803	if (hfsmp->jnl) {
3804		journal_modify_block_start(hfsmp->jnl, bp);
3805	}
3806
3807	/*
3808	 * For embedded HFS+ volumes, update create date if it changed
3809	 * (ie from a setattrlist call)
3810	 */
3811	if ((vcb->hfsPlusIOPosOffset != 0) &&
3812	    (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) {
3813		struct buf *bp2;
3814		HFSMasterDirectoryBlock	*mdb;
3815
3816		retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3817				HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys),
3818				hfsmp->hfs_physical_block_size, NOCRED, &bp2);
3819		if (retval) {
3820			if (bp2)
3821				buf_brelse(bp2);
3822			retval = 0;
3823		} else {
3824			mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) +
3825				HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3826
3827			if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
3828			  {
3829				if (hfsmp->jnl) {
3830				    journal_modify_block_start(hfsmp->jnl, bp2);
3831				}
3832
3833				mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);	/* pick up the new create date */
3834
3835				if (hfsmp->jnl) {
3836					journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL);
3837				} else {
3838					(void) VNOP_BWRITE(bp2);		/* write out the changes */
3839				}
3840			  }
3841			else
3842			  {
3843				buf_brelse(bp2);						/* just release it */
3844			  }
3845		  }
3846	}
3847
3848	hfs_lock_mount (hfsmp);
3849
3850	/* Note: only update the lower 16 bits worth of attributes */
3851	volumeHeader->attributes       = SWAP_BE32 (vcb->vcbAtrb);
3852	volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock);
3853	if (hfsmp->jnl) {
3854		volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
3855	} else {
3856		volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
3857	}
3858	volumeHeader->createDate	= SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
3859	volumeHeader->modifyDate	= SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
3860	volumeHeader->backupDate	= SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
3861	volumeHeader->fileCount		= SWAP_BE32 (vcb->vcbFilCnt);
3862	volumeHeader->folderCount	= SWAP_BE32 (vcb->vcbDirCnt);
3863	volumeHeader->totalBlocks	= SWAP_BE32 (vcb->totalBlocks);
3864	volumeHeader->freeBlocks	= SWAP_BE32 (vcb->freeBlocks);
3865	volumeHeader->nextAllocation	= SWAP_BE32 (vcb->nextAllocation);
3866	volumeHeader->rsrcClumpSize	= SWAP_BE32 (vcb->vcbClpSiz);
3867	volumeHeader->dataClumpSize	= SWAP_BE32 (vcb->vcbClpSiz);
3868	volumeHeader->nextCatalogID	= SWAP_BE32 (vcb->vcbNxtCNID);
3869	volumeHeader->writeCount	= SWAP_BE32 (vcb->vcbWrCnt);
3870	volumeHeader->encodingsBitmap	= SWAP_BE64 (vcb->encodingsBitmap);
3871
3872	if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) {
3873		bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo));
3874		critical = 1;
3875	}
3876
3877	/*
3878	 * System files are only dirty when altflush is set.
3879	 */
3880	if (altflush == 0) {
3881		goto done;
3882	}
3883
3884	/* Sync Extents over-flow file meta data */
3885	fp = VTOF(vcb->extentsRefNum);
3886	if (FTOC(fp)->c_flag & C_MODIFIED) {
3887		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3888			volumeHeader->extentsFile.extents[i].startBlock	=
3889				SWAP_BE32 (fp->ff_extents[i].startBlock);
3890			volumeHeader->extentsFile.extents[i].blockCount	=
3891				SWAP_BE32 (fp->ff_extents[i].blockCount);
3892		}
3893		volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size);
3894		volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3895		volumeHeader->extentsFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3896		FTOC(fp)->c_flag &= ~C_MODIFIED;
3897	}
3898
3899	/* Sync Catalog file meta data */
3900	fp = VTOF(vcb->catalogRefNum);
3901	if (FTOC(fp)->c_flag & C_MODIFIED) {
3902		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3903			volumeHeader->catalogFile.extents[i].startBlock	=
3904				SWAP_BE32 (fp->ff_extents[i].startBlock);
3905			volumeHeader->catalogFile.extents[i].blockCount	=
3906				SWAP_BE32 (fp->ff_extents[i].blockCount);
3907		}
3908		volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size);
3909		volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3910		volumeHeader->catalogFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3911		FTOC(fp)->c_flag &= ~C_MODIFIED;
3912	}
3913
3914	/* Sync Allocation file meta data */
3915	fp = VTOF(vcb->allocationsRefNum);
3916	if (FTOC(fp)->c_flag & C_MODIFIED) {
3917		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3918			volumeHeader->allocationFile.extents[i].startBlock =
3919				SWAP_BE32 (fp->ff_extents[i].startBlock);
3920			volumeHeader->allocationFile.extents[i].blockCount =
3921				SWAP_BE32 (fp->ff_extents[i].blockCount);
3922		}
3923		volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size);
3924		volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3925		volumeHeader->allocationFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3926		FTOC(fp)->c_flag &= ~C_MODIFIED;
3927	}
3928
3929	/* Sync Attribute file meta data */
3930	if (hfsmp->hfs_attribute_vp) {
3931		fp = VTOF(hfsmp->hfs_attribute_vp);
3932		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3933			volumeHeader->attributesFile.extents[i].startBlock =
3934				SWAP_BE32 (fp->ff_extents[i].startBlock);
3935			volumeHeader->attributesFile.extents[i].blockCount =
3936				SWAP_BE32 (fp->ff_extents[i].blockCount);
3937		}
3938		FTOC(fp)->c_flag &= ~C_MODIFIED;
3939		volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size);
3940		volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3941		volumeHeader->attributesFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3942	}
3943
3944	/* Sync Startup file meta data */
3945	if (hfsmp->hfs_startup_vp) {
3946		fp = VTOF(hfsmp->hfs_startup_vp);
3947		if (FTOC(fp)->c_flag & C_MODIFIED) {
3948			for (i = 0; i < kHFSPlusExtentDensity; i++) {
3949				volumeHeader->startupFile.extents[i].startBlock =
3950					SWAP_BE32 (fp->ff_extents[i].startBlock);
3951				volumeHeader->startupFile.extents[i].blockCount =
3952					SWAP_BE32 (fp->ff_extents[i].blockCount);
3953			}
3954			volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size);
3955			volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3956			volumeHeader->startupFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3957			FTOC(fp)->c_flag &= ~C_MODIFIED;
3958		}
3959	}
3960
3961done:
3962	MarkVCBClean(hfsmp);
3963	hfs_unlock_mount (hfsmp);
3964
3965	/* If requested, flush out the alternate volume header */
3966	if (altflush && hfsmp->hfs_alt_id_sector) {
3967		if (buf_meta_bread(hfsmp->hfs_devvp,
3968				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3969				hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
3970			if (hfsmp->jnl) {
3971				journal_modify_block_start(hfsmp->jnl, alt_bp);
3972			}
3973
3974			bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
3975					HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
3976					kMDBSize);
3977
3978			if (hfsmp->jnl) {
3979				journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
3980			} else {
3981				(void) VNOP_BWRITE(alt_bp);
3982			}
3983		} else if (alt_bp)
3984			buf_brelse(alt_bp);
3985	}
3986
3987	if (hfsmp->jnl) {
3988		journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
3989	} else {
3990		if (waitfor != MNT_WAIT)
3991			buf_bawrite(bp);
3992		else {
3993		    retval = VNOP_BWRITE(bp);
3994		    /* When critical data changes, flush the device cache */
3995		    if (critical && (retval == 0)) {
3996			(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
3997					 NULL, FWRITE, NULL);
3998		    }
3999		}
4000	}
4001	hfs_end_transaction(hfsmp);
4002
4003	return (retval);
4004
4005err_exit:
4006	if (alt_bp)
4007		buf_brelse(alt_bp);
4008	if (bp)
4009		buf_brelse(bp);
4010	hfs_end_transaction(hfsmp);
4011	return retval;
4012}
4013
4014
4015/*
4016 * Extend a file system.
4017 */
4018int
4019hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4020{
4021	struct proc *p = vfs_context_proc(context);
4022	kauth_cred_t cred = vfs_context_ucred(context);
4023	struct  vnode *vp;
4024	struct  vnode *devvp;
4025	struct  buf *bp;
4026	struct  filefork *fp = NULL;
4027	ExtendedVCB  *vcb;
4028	struct  cat_fork forkdata;
4029	u_int64_t  oldsize;
4030	u_int64_t  newblkcnt;
4031	u_int64_t  prev_phys_block_count;
4032	u_int32_t  addblks;
4033	u_int64_t  sector_count;
4034	u_int32_t  sector_size;
4035	u_int32_t  phys_sector_size;
4036	u_int32_t  overage_blocks;
4037	daddr64_t  prev_alt_sector;
4038	daddr_t	   bitmapblks;
4039	int  lockflags = 0;
4040	int  error;
4041	int64_t oldBitmapSize;
4042	Boolean  usedExtendFileC = false;
4043	int transaction_begun = 0;
4044
4045	devvp = hfsmp->hfs_devvp;
4046	vcb = HFSTOVCB(hfsmp);
4047
4048	/*
4049	 * - HFS Plus file systems only.
4050	 * - Journaling must be enabled.
4051	 * - No embedded volumes.
4052	 */
4053	if ((vcb->vcbSigWord == kHFSSigWord) ||
4054	     (hfsmp->jnl == NULL) ||
4055	     (vcb->hfsPlusIOPosOffset != 0)) {
4056		return (EPERM);
4057	}
4058	/*
4059	 * If extending file system by non-root, then verify
4060	 * ownership and check permissions.
4061	 */
4062	if (suser(cred, NULL)) {
4063		error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);
4064
4065		if (error)
4066			return (error);
4067		error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
4068		if (error == 0) {
4069			error = hfs_write_access(vp, cred, p, false);
4070		}
4071		hfs_unlock(VTOC(vp));
4072		vnode_put(vp);
4073		if (error)
4074			return (error);
4075
4076		error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
4077		if (error)
4078			return (error);
4079	}
4080	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
4081		return (ENXIO);
4082	}
4083	if (sector_size != hfsmp->hfs_logical_block_size) {
4084		return (ENXIO);
4085	}
4086	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
4087		return (ENXIO);
4088	}
4089	if ((sector_size * sector_count) < newsize) {
4090		printf("hfs_extendfs: not enough space on device (vol=%s)\n", hfsmp->vcbVN);
4091		return (ENOSPC);
4092	}
4093	error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
4094	if (error) {
4095		if ((error != ENOTSUP) && (error != ENOTTY)) {
4096			return (ENXIO);
4097		}
4098		/* If ioctl is not supported, force physical and logical sector size to be same */
4099		phys_sector_size = sector_size;
4100	}
4101	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4102
4103	/*
4104	 * Validate new size.
4105	 */
4106	if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
4107		printf("hfs_extendfs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4108		return (EINVAL);
4109	}
4110	newblkcnt = newsize / vcb->blockSize;
4111	if (newblkcnt > (u_int64_t)0xFFFFFFFF) {
4112		printf ("hfs_extendfs: current blockSize=%u too small for newsize=%qu\n", hfsmp->blockSize, newsize);
4113		return (EOVERFLOW);
4114	}
4115
4116	addblks = newblkcnt - vcb->totalBlocks;
4117
4118	if (hfs_resize_debug) {
4119		printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
4120		printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks);
4121	}
4122	printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);
4123
4124	hfs_lock_mount (hfsmp);
4125	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4126		hfs_unlock_mount(hfsmp);
4127		error = EALREADY;
4128		goto out;
4129	}
4130	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4131	hfs_unlock_mount (hfsmp);
4132
4133	/* Start with a clean journal. */
4134	hfs_journal_flush(hfsmp, TRUE);
4135
4136	/*
4137	 * Enclose changes inside a transaction.
4138	 */
4139	if (hfs_start_transaction(hfsmp) != 0) {
4140		error = EINVAL;
4141		goto out;
4142	}
4143	transaction_begun = 1;
4144
4145
4146	/* Update the hfsmp fields for the physical information about the device */
4147	prev_phys_block_count = hfsmp->hfs_logical_block_count;
4148	prev_alt_sector = hfsmp->hfs_alt_id_sector;
4149
4150	hfsmp->hfs_logical_block_count = sector_count;
4151	/*
4152	 * Note that the new AltVH location must be based on the device's EOF rather than the new
4153	 * filesystem's EOF, so we use logical_block_count here rather than newsize.
4154	 */
4155	hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) +
4156	                          HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count);
4157	hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;
4158
4159
4160	/*
4161	 * Note: we take the attributes lock in case we have an attribute data vnode
4162	 * which needs to change size.
4163	 */
4164	lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4165	vp = vcb->allocationsRefNum;
4166	fp = VTOF(vp);
4167	bcopy(&fp->ff_data, &forkdata, sizeof(forkdata));
4168
4169	/*
4170	 * Calculate additional space required (if any) by allocation bitmap.
4171	 */
4172	oldBitmapSize = fp->ff_size;
4173	bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize;
4174	if (bitmapblks > (daddr_t)fp->ff_blocks)
4175		bitmapblks -= fp->ff_blocks;
4176	else
4177		bitmapblks = 0;
4178
4179	/*
4180	 * The allocation bitmap can contain unused bits that are beyond end of
4181	 * current volume's allocation blocks.  Usually they are supposed to be
4182	 * zero'ed out but there can be cases where they might be marked as used.
4183	 * After extending the file system, those bits can represent valid
4184	 * allocation blocks, so we mark all the bits from the end of current
4185	 * volume to end of allocation bitmap as "free".
4186	 *
4187	 * Figure out the number of overage blocks before proceeding though,
4188	 * so we don't add more bytes to our I/O than necessary.
4189	 * First figure out the total number of blocks representable by the
4190	 * end of the bitmap file vs. the total number of blocks in the new FS.
4191	 * Then subtract away the number of blocks in the current FS.  This is how much
4192	 * we can mark as free right now without having to grow the bitmap file.
4193	 */
4194	overage_blocks = fp->ff_blocks * vcb->blockSize * 8;
4195	overage_blocks = MIN (overage_blocks, newblkcnt);
4196   	overage_blocks -= vcb->totalBlocks;
4197
4198	BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks);
4199
4200	if (bitmapblks > 0) {
4201		daddr64_t blkno;
4202		daddr_t blkcnt;
4203		off_t bytesAdded;
4204
4205		/*
4206		 * Get the bitmap's current size (in allocation blocks) so we know
4207		 * where to start zero filling once the new space is added.  We've
4208		 * got to do this before the bitmap is grown.
4209		 */
4210		blkno  = (daddr64_t)fp->ff_blocks;
4211
4212		/*
4213		 * Try to grow the allocation file in the normal way, using allocation
4214		 * blocks already existing in the file system.  This way, we might be
4215		 * able to grow the bitmap contiguously, or at least in the metadata
4216		 * zone.
4217		 */
4218		error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0,
4219				kEFAllMask | kEFNoClumpMask | kEFReserveMask
4220				| kEFMetadataMask | kEFContigMask, &bytesAdded);
4221
4222		if (error == 0) {
4223			usedExtendFileC = true;
4224		} else {
4225			/*
4226			 * If the above allocation failed, fall back to allocating the new
4227			 * extent of the bitmap from the space we're going to add.  Since those
4228			 * blocks don't yet belong to the file system, we have to update the
4229			 * extent list directly, and manually adjust the file size.
4230			 */
4231			bytesAdded = 0;
4232			error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks);
4233			if (error) {
4234				printf("hfs_extendfs: error %d adding extents\n", error);
4235				goto out;
4236			}
4237			fp->ff_blocks += bitmapblks;
4238			VTOC(vp)->c_blocks = fp->ff_blocks;
4239			VTOC(vp)->c_flag |= C_MODIFIED;
4240		}
4241
4242		/*
4243		 * Update the allocation file's size to include the newly allocated
4244		 * blocks.  Note that ExtendFileC doesn't do this, which is why this
4245		 * statement is outside the above "if" statement.
4246		 */
4247		fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4248
4249		/*
4250		 * Zero out the new bitmap blocks.
4251		 */
4252		{
4253
4254			bp = NULL;
4255			blkcnt = bitmapblks;
4256			while (blkcnt > 0) {
4257				error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp);
4258				if (error) {
4259					if (bp) {
4260						buf_brelse(bp);
4261					}
4262					break;
4263				}
4264				bzero((char *)buf_dataptr(bp), vcb->blockSize);
4265				buf_markaged(bp);
4266				error = (int)buf_bwrite(bp);
4267				if (error)
4268					break;
4269				--blkcnt;
4270				++blkno;
4271			}
4272		}
4273		if (error) {
4274			printf("hfs_extendfs: error %d clearing blocks\n", error);
4275			goto out;
4276		}
4277		/*
4278		 * Mark the new bitmap space as allocated.
4279		 *
4280		 * Note that ExtendFileC will have marked any blocks it allocated, so
4281		 * this is only needed if we used AddFileExtent.  Also note that this
4282		 * has to come *after* the zero filling of new blocks in the case where
4283		 * we used AddFileExtent (since the part of the bitmap we're touching
4284		 * is in those newly allocated blocks).
4285		 */
4286		if (!usedExtendFileC) {
4287			error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks);
4288			if (error) {
4289				printf("hfs_extendfs: error %d setting bitmap\n", error);
4290				goto out;
4291			}
4292			vcb->freeBlocks -= bitmapblks;
4293		}
4294	}
4295	/*
4296	 * Mark the new alternate VH as allocated.
4297	 */
4298	if (vcb->blockSize == 512)
4299		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2);
4300	else
4301		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1);
4302	if (error) {
4303		printf("hfs_extendfs: error %d setting bitmap (VH)\n", error);
4304		goto out;
4305	}
4306	/*
4307	 * Mark the old alternate VH as free.
4308	 */
4309	if (vcb->blockSize == 512)
4310		(void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2);
4311	else
4312		(void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1);
4313	/*
4314	 * Adjust file system variables for new space.
4315	 */
4316	vcb->totalBlocks += addblks;
4317	vcb->freeBlocks += addblks;
4318	MarkVCBDirty(vcb);
4319	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4320	if (error) {
4321		printf("hfs_extendfs: couldn't flush volume headers (%d)", error);
4322		/*
4323		 * Restore to old state.
4324		 */
4325		if (usedExtendFileC) {
4326			(void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp),
4327								 FTOC(fp)->c_fileid, false);
4328		} else {
4329			fp->ff_blocks -= bitmapblks;
4330			fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
4331			/*
4332			 * No need to mark the excess blocks free since those bitmap blocks
4333			 * are no longer part of the bitmap.  But we do need to undo the
4334			 * effect of the "vcb->freeBlocks -= bitmapblks" above.
4335			 */
4336			vcb->freeBlocks += bitmapblks;
4337		}
4338		vcb->totalBlocks -= addblks;
4339		vcb->freeBlocks -= addblks;
4340		hfsmp->hfs_logical_block_count = prev_phys_block_count;
4341		hfsmp->hfs_alt_id_sector = prev_alt_sector;
4342		MarkVCBDirty(vcb);
4343		if (vcb->blockSize == 512) {
4344			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) {
4345				hfs_mark_volume_inconsistent(hfsmp);
4346			}
4347		} else {
4348			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) {
4349				hfs_mark_volume_inconsistent(hfsmp);
4350			}
4351		}
4352		goto out;
4353	}
4354	/*
4355	 * Invalidate the old alternate volume header.
4356	 */
4357	bp = NULL;
4358	if (prev_alt_sector) {
4359		if (buf_meta_bread(hfsmp->hfs_devvp,
4360				HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys),
4361				hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
4362			journal_modify_block_start(hfsmp->jnl, bp);
4363
4364			bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize);
4365
4366			journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
4367		} else if (bp) {
4368			buf_brelse(bp);
4369		}
4370	}
4371
4372	/*
4373	 * Update the metadata zone size based on current volume size
4374	 */
4375	hfs_metadatazone_init(hfsmp, false);
4376
4377	/*
4378	 * Adjust the size of hfsmp->hfs_attrdata_vp
4379	 */
4380	if (hfsmp->hfs_attrdata_vp) {
4381		struct cnode *attr_cp;
4382		struct filefork *attr_fp;
4383
4384		if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4385			attr_cp = VTOC(hfsmp->hfs_attrdata_vp);
4386			attr_fp = VTOF(hfsmp->hfs_attrdata_vp);
4387
4388			attr_cp->c_blocks = newblkcnt;
4389			attr_fp->ff_blocks = newblkcnt;
4390			attr_fp->ff_extents[0].blockCount = newblkcnt;
4391			attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4392			ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size);
4393			vnode_put(hfsmp->hfs_attrdata_vp);
4394		}
4395	}
4396
4397	/*
4398	 * Update the R/B Tree if necessary.  Since we don't have to drop the systemfile
4399	 * locks in the middle of these operations like we do in the truncate case
4400	 * where we have to relocate files, we can only update the red-black tree
4401	 * if there were actual changes made to the bitmap.  Also, we can't really scan the
4402	 * new portion of the bitmap before it has been allocated. The BlockMarkAllocated
4403	 * routines are smart enough to avoid the r/b tree if the portion they are manipulating is
4404	 * not currently controlled by the tree.
4405	 *
4406	 * We only update hfsmp->allocLimit if totalBlocks actually increased.
4407	 */
4408	if (error == 0) {
4409		UpdateAllocLimit(hfsmp, hfsmp->totalBlocks);
4410	}
4411
4412	/* Release all locks and sync up journal content before
4413	 * checking and extending, if required, the journal
4414	 */
4415	if (lockflags) {
4416		hfs_systemfile_unlock(hfsmp, lockflags);
4417		lockflags = 0;
4418	}
4419	if (transaction_begun) {
4420		hfs_end_transaction(hfsmp);
4421		hfs_journal_flush(hfsmp, TRUE);
4422		transaction_begun = 0;
4423	}
4424
4425	/* Increase the journal size, if required. */
4426	error = hfs_extend_journal(hfsmp, sector_size, sector_count, context);
4427	if (error) {
4428		printf ("hfs_extendfs: Could not extend journal size\n");
4429		goto out_noalloc;
4430	}
4431
4432	/* Log successful extending */
4433	printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n",
4434	       hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));
4435
4436out:
4437	if (error && fp) {
4438		/* Restore allocation fork. */
4439		bcopy(&forkdata, &fp->ff_data, sizeof(forkdata));
4440		VTOC(vp)->c_blocks = fp->ff_blocks;
4441
4442	}
4443
4444out_noalloc:
4445	hfs_lock_mount (hfsmp);
4446	hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4447	hfs_unlock_mount (hfsmp);
4448	if (lockflags) {
4449		hfs_systemfile_unlock(hfsmp, lockflags);
4450	}
4451	if (transaction_begun) {
4452		hfs_end_transaction(hfsmp);
4453		hfs_journal_flush(hfsmp, FALSE);
4454		/* Just to be sure, sync all data to the disk */
4455		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4456	}
4457	if (error) {
4458		printf ("hfs_extentfs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN);
4459	}
4460
4461	return MacToVFSError(error);
4462}
4463
4464#define HFS_MIN_SIZE  (32LL * 1024LL * 1024LL)
4465
4466/*
4467 * Truncate a file system (while still mounted).
4468 */
4469int
4470hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4471{
4472	struct  buf *bp = NULL;
4473	u_int64_t oldsize;
4474	u_int32_t newblkcnt;
4475	u_int32_t reclaimblks = 0;
4476	int lockflags = 0;
4477	int transaction_begun = 0;
4478	Boolean updateFreeBlocks = false;
4479	Boolean disable_sparse = false;
4480	int error = 0;
4481
4482	hfs_lock_mount (hfsmp);
4483	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4484		hfs_unlock_mount (hfsmp);
4485		return (EALREADY);
4486	}
4487	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4488	hfsmp->hfs_resize_blocksmoved = 0;
4489	hfsmp->hfs_resize_totalblocks = 0;
4490	hfsmp->hfs_resize_progress = 0;
4491	hfs_unlock_mount (hfsmp);
4492
4493	/*
4494	 * - Journaled HFS Plus volumes only.
4495	 * - No embedded volumes.
4496	 */
4497	if ((hfsmp->jnl == NULL) ||
4498	    (hfsmp->hfsPlusIOPosOffset != 0)) {
4499		error = EPERM;
4500		goto out;
4501	}
4502	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4503	newblkcnt = newsize / hfsmp->blockSize;
4504	reclaimblks = hfsmp->totalBlocks - newblkcnt;
4505
4506	if (hfs_resize_debug) {
4507		printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
4508		printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
4509	}
4510
4511	/* Make sure new size is valid. */
4512	if ((newsize < HFS_MIN_SIZE) ||
4513	    (newsize >= oldsize) ||
4514	    (newsize % hfsmp->hfs_logical_block_size) ||
4515	    (newsize % hfsmp->hfs_physical_block_size)) {
4516		printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4517		error = EINVAL;
4518		goto out;
4519	}
4520
4521	/*
4522	 * Make sure that the file system has enough free blocks reclaim.
4523	 *
4524	 * Before resize, the disk is divided into four zones -
4525	 * 	A. Allocated_Stationary - These are allocated blocks that exist
4526	 * 	   before the new end of disk.  These blocks will not be
4527	 * 	   relocated or modified during resize.
4528	 * 	B. Free_Stationary - These are free blocks that exist before the
4529	 * 	   new end of disk.  These blocks can be used for any new
4530	 * 	   allocations during resize, including allocation for relocating
4531	 * 	   data from the area of disk being reclaimed.
4532	 * 	C. Allocated_To-Reclaim - These are allocated blocks that exist
4533	 *         beyond the new end of disk.  These blocks need to be reclaimed
4534	 *         during resize by allocating equal number of blocks in Free
4535	 *         Stationary zone and copying the data.
4536	 *      D. Free_To-Reclaim - These are free blocks that exist beyond the
4537	 *         new end of disk.  Nothing special needs to be done to reclaim
4538	 *         them.
4539	 *
4540	 * Total number of blocks on the disk before resize:
4541	 * ------------------------------------------------
4542	 * 	Total Blocks = Allocated_Stationary + Free_Stationary +
4543	 * 	               Allocated_To-Reclaim + Free_To-Reclaim
4544	 *
4545	 * Total number of blocks that need to be reclaimed:
4546	 * ------------------------------------------------
4547	 *	Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim
4548	 *
4549	 * Note that the check below also makes sure that we have enough space
4550	 * to relocate data from Allocated_To-Reclaim to Free_Stationary.
4551	 * Therefore we do not need to check total number of blocks to relocate
4552	 * later in the code.
4553	 *
4554	 * The condition below gets converted to:
4555	 *
4556	 * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim
4557	 *
4558	 * which is equivalent to:
4559	 *
4560	 *              Allocated To-Reclaim >= Free Stationary
4561	 */
4562	if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
4563		printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
4564		error = ENOSPC;
4565		goto out;
4566	}
4567
4568	/* Start with a clean journal. */
4569	hfs_journal_flush(hfsmp, TRUE);
4570
4571	if (hfs_start_transaction(hfsmp) != 0) {
4572		error = EINVAL;
4573		goto out;
4574	}
4575	transaction_begun = 1;
4576
4577	/* Take the bitmap lock to update the alloc limit field */
4578	lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4579
4580	/*
4581	 * Prevent new allocations from using the part we're trying to truncate.
4582	 *
4583	 * NOTE: allocLimit is set to the allocation block number where the new
4584	 * alternate volume header will be.  That way there will be no files to
4585	 * interfere with allocating the new alternate volume header, and no files
4586	 * in the allocation blocks beyond (i.e. the blocks we're trying to
4587	 * truncate away.
4588	 *
4589	 * Also shrink the red-black tree if needed.
4590	 */
4591	if (hfsmp->blockSize == 512) {
4592		error = UpdateAllocLimit (hfsmp, newblkcnt - 2);
4593	}
4594	else {
4595		error = UpdateAllocLimit (hfsmp, newblkcnt - 1);
4596	}
4597
4598	/* Sparse devices use first fit allocation which is not ideal
4599	 * for volume resize which requires best fit allocation.  If a
4600	 * sparse device is being truncated, disable the sparse device
4601	 * property temporarily for the duration of resize.  Also reset
4602	 * the free extent cache so that it is rebuilt as sorted by
4603	 * totalBlocks instead of startBlock.
4604	 *
4605	 * Note that this will affect all allocations on the volume and
4606	 * ideal fix would be just to modify resize-related allocations,
4607	 * but it will result in complexity like handling of two free
4608	 * extent caches sorted differently, etc.  So we stick to this
4609	 * solution for now.
4610	 */
4611	hfs_lock_mount (hfsmp);
4612	if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
4613		hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
4614		ResetVCBFreeExtCache(hfsmp);
4615		disable_sparse = true;
4616	}
4617
4618	/*
4619	 * Update the volume free block count to reflect the total number
4620	 * of free blocks that will exist after a successful resize.
4621	 * Relocation of extents will result in no net change in the total
4622	 * free space on the disk.  Therefore the code that allocates
4623	 * space for new extent and deallocates the old extent explicitly
4624	 * prevents updating the volume free block count.  It will also
4625	 * prevent false disk full error when the number of blocks in
4626	 * an extent being relocated is more than the free blocks that
4627	 * will exist after the volume is resized.
4628	 */
4629	hfsmp->freeBlocks -= reclaimblks;
4630	updateFreeBlocks = true;
4631	hfs_unlock_mount(hfsmp);
4632
4633	if (lockflags) {
4634		hfs_systemfile_unlock(hfsmp, lockflags);
4635		lockflags = 0;
4636	}
4637
4638	/*
4639	 * Update the metadata zone size to match the new volume size,
4640	 * and if it too less, metadata zone might be disabled.
4641	 */
4642	hfs_metadatazone_init(hfsmp, false);
4643
4644	/*
4645	 * If some files have blocks at or beyond the location of the
4646	 * new alternate volume header, recalculate free blocks and
4647	 * reclaim blocks.  Otherwise just update free blocks count.
4648	 *
4649	 * The current allocLimit is set to the location of new alternate
4650	 * volume header, and reclaimblks are the total number of blocks
4651	 * that need to be reclaimed.  So the check below is really
4652	 * ignoring the blocks allocated for old alternate volume header.
4653	 */
4654	if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
4655		/*
4656		 * hfs_reclaimspace will use separate transactions when
4657		 * relocating files (so we don't overwhelm the journal).
4658		 */
4659		hfs_end_transaction(hfsmp);
4660		transaction_begun = 0;
4661
4662		/* Attempt to reclaim some space. */
4663		error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
4664		if (error != 0) {
4665			printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
4666			error = ENOSPC;
4667			goto out;
4668		}
4669		if (hfs_start_transaction(hfsmp) != 0) {
4670			error = EINVAL;
4671			goto out;
4672		}
4673		transaction_begun = 1;
4674
4675		/* Check if we're clear now. */
4676		error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
4677		if (error != 0) {
4678			printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
4679			error = EAGAIN;  /* tell client to try again */
4680			goto out;
4681		}
4682	}
4683
4684	/*
4685	 * Note: we take the attributes lock in case we have an attribute data vnode
4686	 * which needs to change size.
4687	 */
4688	lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4689
4690	/*
4691	 * Allocate last 1KB for alternate volume header.
4692	 */
4693	error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1);
4694	if (error) {
4695		printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error);
4696		goto out;
4697	}
4698
4699	/*
4700	 * Mark the old alternate volume header as free.
4701	 * We don't bother shrinking allocation bitmap file.
4702	 */
4703	if (hfsmp->blockSize == 512)
4704		(void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2);
4705	else
4706		(void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1);
4707
4708	/*
4709	 * Invalidate the existing alternate volume header.
4710	 *
4711	 * Don't include this in a transaction (don't call journal_modify_block)
4712	 * since this block will be outside of the truncated file system!
4713	 */
4714	if (hfsmp->hfs_alt_id_sector) {
4715		error = buf_meta_bread(hfsmp->hfs_devvp,
4716				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
4717				hfsmp->hfs_physical_block_size, NOCRED, &bp);
4718		if (error == 0) {
4719			bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
4720			(void) VNOP_BWRITE(bp);
4721		} else {
4722			if (bp) {
4723				buf_brelse(bp);
4724			}
4725		}
4726		bp = NULL;
4727	}
4728
4729	/* Log successful shrinking. */
4730	printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n",
4731	       hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks);
4732
4733	/*
4734	 * Adjust file system variables and flush them to disk.
4735	 */
4736	hfsmp->totalBlocks = newblkcnt;
4737	hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
4738	hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
4739
4740	/*
4741	 * Note that although the logical block size is updated here, it is only done for
4742	 * the benefit of the partition management software.  The logical block count change
4743	 * has not yet actually been propagated to the disk device yet.
4744	 */
4745
4746	hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count);
4747	MarkVCBDirty(hfsmp);
4748	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4749	if (error)
4750		panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error);
4751
4752	/*
4753	 * Adjust the size of hfsmp->hfs_attrdata_vp
4754	 */
4755	if (hfsmp->hfs_attrdata_vp) {
4756		struct cnode *cp;
4757		struct filefork *fp;
4758
4759		if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4760			cp = VTOC(hfsmp->hfs_attrdata_vp);
4761			fp = VTOF(hfsmp->hfs_attrdata_vp);
4762
4763			cp->c_blocks = newblkcnt;
4764			fp->ff_blocks = newblkcnt;
4765			fp->ff_extents[0].blockCount = newblkcnt;
4766			fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4767			ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size);
4768			vnode_put(hfsmp->hfs_attrdata_vp);
4769		}
4770	}
4771
4772out:
4773	/*
4774	 * Update the allocLimit to acknowledge the last one or two blocks now.
4775	 * Add it to the tree as well if necessary.
4776	 */
4777	UpdateAllocLimit (hfsmp, hfsmp->totalBlocks);
4778
4779	hfs_lock_mount (hfsmp);
4780	if (disable_sparse == true) {
4781		/* Now that resize is completed, set the volume to be sparse
4782		 * device again so that all further allocations will be first
4783		 * fit instead of best fit.  Reset free extent cache so that
4784		 * it is rebuilt.
4785		 */
4786		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
4787		ResetVCBFreeExtCache(hfsmp);
4788	}
4789
4790	if (error && (updateFreeBlocks == true)) {
4791		hfsmp->freeBlocks += reclaimblks;
4792	}
4793
4794	if (hfsmp->nextAllocation >= hfsmp->allocLimit) {
4795		hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
4796	}
4797	hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4798	hfs_unlock_mount (hfsmp);
4799
4800	/* On error, reset the metadata zone for original volume size */
4801	if (error && (updateFreeBlocks == true)) {
4802		hfs_metadatazone_init(hfsmp, false);
4803	}
4804
4805	if (lockflags) {
4806		hfs_systemfile_unlock(hfsmp, lockflags);
4807	}
4808	if (transaction_begun) {
4809		hfs_end_transaction(hfsmp);
4810		hfs_journal_flush(hfsmp, FALSE);
4811		/* Just to be sure, sync all data to the disk */
4812		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4813	}
4814
4815	if (error) {
4816		printf ("hfs_truncatefs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN);
4817	}
4818
4819	return MacToVFSError(error);
4820}
4821
4822
4823/*
4824 * Invalidate the physical block numbers associated with buffer cache blocks
4825 * in the given extent of the given vnode.
4826 */
4827struct hfs_inval_blk_no {
4828	daddr64_t sectorStart;
4829	daddr64_t sectorCount;
4830};
4831static int
4832hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in)
4833{
4834	daddr64_t blkno;
4835	struct hfs_inval_blk_no *args;
4836
4837	blkno = buf_blkno(bp);
4838	args = args_in;
4839
4840	if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount)
4841		buf_setblkno(bp, buf_lblkno(bp));
4842
4843	return BUF_RETURNED;
4844}
4845static void
4846hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount)
4847{
4848	struct hfs_inval_blk_no args;
4849	args.sectorStart = sectorStart;
4850	args.sectorCount = sectorCount;
4851
4852	buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args);
4853}
4854
4855
4856/*
4857 * Copy the contents of an extent to a new location.  Also invalidates the
4858 * physical block number of any buffer cache block in the copied extent
4859 * (so that if the block is written, it will go through VNOP_BLOCKMAP to
4860 * determine the new physical block number).
4861 *
4862 * At this point, for regular files, we hold the truncate lock exclusive
4863 * and the cnode lock exclusive.
4864 */
4865static int
4866hfs_copy_extent(
4867	struct hfsmount *hfsmp,
4868	struct vnode *vp,		/* The file whose extent is being copied. */
4869	u_int32_t oldStart,		/* The start of the source extent. */
4870	u_int32_t newStart,		/* The start of the destination extent. */
4871	u_int32_t blockCount,	/* The number of allocation blocks to copy. */
4872	vfs_context_t context)
4873{
4874	int err = 0;
4875	size_t bufferSize;
4876	void *buffer = NULL;
4877	struct vfsioattr ioattr;
4878	buf_t bp = NULL;
4879	off_t resid;
4880	size_t ioSize;
4881	u_int32_t ioSizeSectors;	/* Device sectors in this I/O */
4882	daddr64_t srcSector, destSector;
4883	u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4884#if CONFIG_PROTECT
4885	int cpenabled = 0;
4886#endif
4887
4888	/*
4889	 * Sanity check that we have locked the vnode of the file we're copying.
4890	 *
4891	 * But since hfs_systemfile_lock() doesn't actually take the lock on
4892	 * the allocation file if a journal is active, ignore the check if the
4893	 * file being copied is the allocation file.
4894	 */
4895	struct cnode *cp = VTOC(vp);
4896	if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
4897		panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);
4898
4899#if CONFIG_PROTECT
4900	/*
4901	 * Prepare the CP blob and get it ready for use, if necessary.
4902	 *
4903	 * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs),
4904	 * because they are implicitly protected via the media key on iOS.  As such, they
4905	 * must not be relocated except with the media key.  So it is OK to not pass down
4906	 * a special cpentry to the IOMedia/LwVM code for handling.
4907	 */
4908	if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) {
4909		int cp_err = 0;
4910		/*
4911		 * Ideally, the file whose extents we are about to manipulate is using the
4912		 * newer offset-based IVs so that we can manipulate it regardless of the
4913		 * current lock state.  However, we must maintain support for older-style
4914		 * EAs.
4915		 *
4916		 * For the older EA case, the IV was tied to the device LBA for file content.
4917		 * This means that encrypted data cannot be moved from one location to another
4918		 * in the filesystem without garbling the IV data.  As a result, we need to
4919		 * access the file's plaintext because we cannot do our AES-symmetry trick
4920		 * here.  This requires that we attempt a key-unwrap here (via cp_handle_relocate)
4921		 * to make forward progress.  If the keys are unavailable then we will
4922		 * simply stop the resize in its tracks here since we cannot move
4923		 * this extent at this time.
4924		 */
4925		if ((cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) == 0) {
4926			cp_err = cp_handle_relocate(cp, hfsmp);
4927		}
4928
4929		if (cp_err) {
4930			printf ("hfs_copy_extent: cp_handle_relocate failed (%d) \n", cp_err);
4931			return cp_err;
4932		}
4933
4934		cpenabled = 1;
4935	}
4936#endif
4937
4938
4939	/*
4940	 * Determine the I/O size to use
4941	 *
4942	 * NOTE: Many external drives will result in an ioSize of 128KB.
4943	 * TODO: Should we use a larger buffer, doing several consecutive
4944	 * reads, then several consecutive writes?
4945	 */
4946	vfs_ioattr(hfsmp->hfs_mp, &ioattr);
4947	bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt);
4948	if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize))
4949		return ENOMEM;
4950
4951	/* Get a buffer for doing the I/O */
4952	bp = buf_alloc(hfsmp->hfs_devvp);
4953	buf_setdataptr(bp, (uintptr_t)buffer);
4954
4955	resid = (off_t) blockCount * (off_t) hfsmp->blockSize;
4956	srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4957	destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
4958	while (resid > 0) {
4959		ioSize = MIN(bufferSize, (size_t) resid);
4960		ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size;
4961
4962		/* Prepare the buffer for reading */
4963		buf_reset(bp, B_READ);
4964		buf_setsize(bp, ioSize);
4965		buf_setcount(bp, ioSize);
4966		buf_setblkno(bp, srcSector);
4967		buf_setlblkno(bp, srcSector);
4968
4969		/*
4970		 * Note that because this is an I/O to the device vp
4971		 * it is correct to have lblkno and blkno both point to the
4972		 * start sector being read from.  If it were being issued against the
4973		 * underlying file then that would be different.
4974		 */
4975
4976		/* Attach the new CP blob  to the buffer if needed */
4977#if CONFIG_PROTECT
4978		if (cpenabled) {
4979			if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
4980				/* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */
4981				cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT;
4982				buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
4983			}
4984			else {
4985				/*
4986				 * Use the cnode's cp key.  This file is tied to the
4987				 * LBAs of the physical blocks that it occupies.
4988				 */
4989				buf_setcpaddr (bp, cp->c_cpentry);
4990			}
4991
4992			/* Initialize the content protection file offset to start at 0 */
4993			buf_setcpoff (bp, 0);
4994		}
4995#endif
4996
4997		/* Do the read */
4998		err = VNOP_STRATEGY(bp);
4999		if (!err)
5000			err = buf_biowait(bp);
5001		if (err) {
5002#if CONFIG_PROTECT
5003			/* Turn the flag off in error cases. */
5004			if (cpenabled) {
5005				cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
5006			}
5007#endif
5008			printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err);
5009			break;
5010		}
5011
5012		/* Prepare the buffer for writing */
5013		buf_reset(bp, B_WRITE);
5014		buf_setsize(bp, ioSize);
5015		buf_setcount(bp, ioSize);
5016		buf_setblkno(bp, destSector);
5017		buf_setlblkno(bp, destSector);
5018		if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
5019			buf_markfua(bp);
5020
5021#if CONFIG_PROTECT
5022		/* Attach the CP to the buffer if needed */
5023		if (cpenabled) {
5024			if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
5025				buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
5026			}
5027			else {
5028				/*
5029				 * Use the cnode's CP key.  This file is still tied
5030				 * to the LBAs of the physical blocks that it occupies.
5031				 */
5032				buf_setcpaddr (bp, cp->c_cpentry);
5033			}
5034			/*
5035			 * The last STRATEGY call may have updated the cp file offset behind our
5036			 * back, so we cannot trust it.  Re-initialize the content protection
5037			 * file offset back to 0 before initiating the write portion of this I/O.
5038			 */
5039			buf_setcpoff (bp, 0);
5040		}
5041#endif
5042
5043		/* Do the write */
5044		vnode_startwrite(hfsmp->hfs_devvp);
5045		err = VNOP_STRATEGY(bp);
5046		if (!err) {
5047			err = buf_biowait(bp);
5048		}
5049#if CONFIG_PROTECT
5050		/* Turn the flag off regardless once the strategy call finishes. */
5051		if (cpenabled) {
5052			cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
5053		}
5054#endif
5055		if (err) {
5056			printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err);
5057			break;
5058		}
5059
5060		resid -= ioSize;
5061		srcSector += ioSizeSectors;
5062		destSector += ioSizeSectors;
5063	}
5064	if (bp)
5065		buf_free(bp);
5066	if (buffer)
5067		kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);
5068
5069	/* Make sure all writes have been flushed to disk. */
5070	if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
5071		err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
5072		if (err) {
5073			printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
5074			err = 0;	/* Don't fail the copy. */
5075		}
5076	}
5077
5078	if (!err)
5079		hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock);
5080
5081	return err;
5082}
5083
5084
/* Structure to store state of reclaiming extents from a
 * given file.  hfs_reclaim_file()/hfs_reclaim_xattr()
 * initializes the values in this structure which are then
 * used by code that reclaims and splits the extents.
 */
struct hfs_reclaim_extent_info {
	struct vnode *vp;                    /* Vnode of the file whose extents are being reclaimed */
	u_int32_t fileID;                    /* Catalog node ID of the file (or xattr owner) */
	u_int8_t forkType;                   /* Data or resource fork for the extent key */
	u_int8_t is_dirlink;                 /* Extent belongs to directory hard link */
	u_int8_t is_sysfile;                 /* Extent belongs to system file */
	u_int8_t is_xattr;                   /* Extent belongs to extent-based xattr */
	u_int8_t extent_index;               /* Index of the current extent entry within the record */
	int lockflags;                       /* Locks that reclaim and split code should grab before modifying the extent record */
	u_int32_t blocks_relocated;          /* Total blocks relocated for this file till now */
	u_int32_t recStartBlock;             /* File allocation block number (FABN) for current extent record */
	u_int32_t cur_blockCount;            /* Number of allocation blocks that have been checked for reclaim */
	struct filefork *catalog_fp;         /* If non-NULL, extent is from catalog record */
	union record {
		HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */
		HFSPlusAttrRecord xattr;     /* Attribute record for large EAs */
	} record;
	HFSPlusExtentDescriptor *extents;    /* Pointer to current extent record being processed.
					      * For catalog extent record, points to the correct
					      * extent information in filefork.  For overflow extent
					      * record, or xattr record, points to extent record
					      * in the structure above
					      */
	struct cat_desc *dirlink_desc;       /* Catalog descriptor, used only for directory hard links */
	struct cat_attr *dirlink_attr;       /* Catalog attributes, used only for directory hard links */
	struct filefork *dirlink_fork;	      /* For directory hard links, fp points actually to this */
	struct BTreeIterator *iterator;       /* Shared read/write iterator, hfs_reclaim_file/xattr()
                                               * use it for reading and hfs_reclaim_extent()/hfs_split_extent()
					       * use it for writing updated extent record
					       */
	struct FSBufferDescriptor btdata;     /* Shared btdata for reading/writing extent record, same as iterator above */
	u_int16_t recordlen;                  /* Length of the current btree record */
	int overflow_count;                   /* For debugging, counter for overflow extent record */
	FCB *fcb;                             /* Pointer to the current btree being traversed */
};
5125
5126/*
5127 * Split the current extent into two extents, with first extent
5128 * to contain given number of allocation blocks.  Splitting of
5129 * extent creates one new extent entry which can result in
5130 * shifting of many entries through all the extent records of a
5131 * file, and/or creating a new extent record in the overflow
5132 * extent btree.
5133 *
5134 * Example:
5135 * The diagram below represents two consecutive extent records,
5136 * for simplicity, lets call them record X and X+1 respectively.
5137 * Interesting extent entries have been denoted by letters.
5138 * If the letter is unchanged before and after split, it means
5139 * that the extent entry was not modified during the split.
5140 * A '.' means that the entry remains unchanged after the split
5141 * and is not relevant for our example.  A '0' means that the
5142 * extent entry is empty.
5143 *
5144 * If there isn't sufficient contiguous free space to relocate
5145 * an extent (extent "C" below), we will have to break the one
5146 * extent into multiple smaller extents, and relocate each of
5147 * the smaller extents individually.  The way we do this is by
5148 * finding the largest contiguous free space that is currently
5149 * available (N allocation blocks), and then convert extent "C"
5150 * into two extents, C1 and C2, that occupy exactly the same
5151 * allocation blocks as extent C.  Extent C1 is the first
5152 * N allocation blocks of extent C, and extent C2 is the remainder
5153 * of extent C.  Then we can relocate extent C1 since we know
5154 * we have enough contiguous free space to relocate it in its
5155 * entirety.  We then repeat the process starting with extent C2.
5156 *
5157 * In record X, only the entries following entry C are shifted, and
5158 * the original entry C is replaced with two entries C1 and C2 which
5159 * are actually two extent entries for contiguous allocation blocks.
5160 *
5161 * Note that the entry E from record X is shifted into record X+1 as
5162 * the new first entry.  Since the first entry of record X+1 is updated,
5163 * the FABN will also get updated with the blockCount of entry E.
5164 * This also results in shifting of all extent entries in record X+1.
5165 * Note that the number of empty entries after the split has been
5166 * changed from 3 to 2.
5167 *
5168 * Before:
5169 *               record X                           record X+1
5170 *  ---------------------===---------     ---------------------------------
5171 *  | A | . | . | . | B | C | D | E |     | F | . | . | . | G | 0 | 0 | 0 |
5172 *  ---------------------===---------     ---------------------------------
5173 *
5174 * After:
5175 *  ---------------------=======-----     ---------------------------------
5176 *  | A | . | . | . | B | C1| C2| D |     | E | F | . | . | . | G | 0 | 0 |
5177 *  ---------------------=======-----     ---------------------------------
5178 *
5179 *  C1.startBlock = C.startBlock
5180 *  C1.blockCount = N
5181 *
5182 *  C2.startBlock = C.startBlock + N
5183 *  C2.blockCount = C.blockCount - N
5184 *
5185 *                                        FABN = old FABN - E.blockCount
5186 *
5187 * Inputs:
5188 *	extent_info -   This is the structure that contains state about
5189 *	                the current file, extent, and extent record that
5190 *	                is being relocated.  This structure is shared
5191 *	                among code that traverses through all the extents
5192 *	                of the file, code that relocates extents, and
5193 *	                code that splits the extent.
5194 *	newBlockCount - The blockCount of the extent to be split after
5195 *	                successfully split operation.
5196 * Output:
5197 * 	Zero on success, non-zero on failure.
5198 */
5199static int
5200hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount)
5201{
5202	int error = 0;
5203	int index = extent_info->extent_index;
5204	int i;
5205	HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */
5206	HFSPlusExtentDescriptor last_extent;
5207	HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */
5208	HFSPlusExtentRecord *extents_rec = NULL;
5209	HFSPlusExtentKey *extents_key = NULL;
5210	HFSPlusAttrRecord *xattr_rec = NULL;
5211	HFSPlusAttrKey *xattr_key = NULL;
5212	struct BTreeIterator iterator;
5213	struct FSBufferDescriptor btdata;
5214	uint16_t reclen;
5215	uint32_t read_recStartBlock;	/* Starting allocation block number to read old extent record */
5216	uint32_t write_recStartBlock;	/* Starting allocation block number to insert newly updated extent record */
5217	Boolean create_record = false;
5218	Boolean is_xattr;
5219	struct cnode *cp;
5220
5221	is_xattr = extent_info->is_xattr;
5222	extents = extent_info->extents;
5223	cp = VTOC(extent_info->vp);
5224
5225	if (newBlockCount == 0) {
5226		if (hfs_resize_debug) {
5227			printf ("hfs_split_extent: No splitting required for newBlockCount=0\n");
5228		}
5229		return error;
5230	}
5231
5232	if (hfs_resize_debug) {
5233		printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount);
5234	}
5235
5236	/* Extents overflow btree can not have more than 8 extents.
5237	 * No split allowed if the 8th extent is already used.
5238	 */
5239	if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) {
5240		printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n");
5241		error = ENOSPC;
5242		goto out;
5243	}
5244
5245	/* Determine the starting allocation block number for the following
5246	 * overflow extent record, if any, before the current record
5247	 * gets modified.
5248	 */
5249	read_recStartBlock = extent_info->recStartBlock;
5250	for (i = 0; i < kHFSPlusExtentDensity; i++) {
5251		if (extents[i].blockCount == 0) {
5252			break;
5253		}
5254		read_recStartBlock += extents[i].blockCount;
5255	}
5256
5257	/* Shift and split */
5258	if (index == kHFSPlusExtentDensity-1) {
5259		/* The new extent created after split will go into following overflow extent record */
5260		shift_extent.startBlock = extents[index].startBlock + newBlockCount;
5261		shift_extent.blockCount = extents[index].blockCount - newBlockCount;
5262
5263		/* Last extent in the record will be split, so nothing to shift */
5264	} else {
5265		/* Splitting of extents can result in at most of one
5266		 * extent entry to be shifted into following overflow extent
5267		 * record.  So, store the last extent entry for later.
5268		 */
5269		shift_extent = extents[kHFSPlusExtentDensity-1];
5270		if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) {
5271			printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount);
5272		}
5273
5274		/* Start shifting extent information from the end of the extent
5275		 * record to the index where we want to insert the new extent.
5276		 * Note that kHFSPlusExtentDensity-1 is already saved above, and
5277		 * does not need to be shifted.  The extent entry that is being
5278		 * split does not get shifted.
5279		 */
5280		for (i = kHFSPlusExtentDensity-2; i > index; i--) {
5281			if (hfs_resize_debug) {
5282				if (extents[i].blockCount) {
5283					printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount);
5284				}
5285			}
5286			extents[i+1] = extents[i];
5287		}
5288	}
5289
5290	if (index == kHFSPlusExtentDensity-1) {
5291		/* The second half of the extent being split will be the overflow
5292		 * entry that will go into following overflow extent record.  The
5293		 * value has been stored in 'shift_extent' above, so there is
5294		 * nothing to be done here.
5295		 */
5296	} else {
5297		/* Update the values in the second half of the extent being split
5298		 * before updating the first half of the split.  Note that the
5299		 * extent to split or first half of the split is at index 'index'
5300		 * and a new extent or second half of the split will be inserted at
5301		 * 'index+1' or into following overflow extent record.
5302		 */
5303		extents[index+1].startBlock = extents[index].startBlock + newBlockCount;
5304		extents[index+1].blockCount = extents[index].blockCount - newBlockCount;
5305	}
5306	/* Update the extent being split, only the block count will change */
5307	extents[index].blockCount = newBlockCount;
5308
5309	if (hfs_resize_debug) {
5310		printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount);
5311		if (index != kHFSPlusExtentDensity-1) {
5312			printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount);
5313		} else {
5314			printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount);
5315		}
5316	}
5317
5318	/* Write out information about the newly split extent to the disk */
5319	if (extent_info->catalog_fp) {
5320		/* (extent_info->catalog_fp != NULL) means the newly split
5321		 * extent exists in the catalog record.  This means that
5322		 * the cnode was updated.  Therefore, to write out the changes,
5323		 * mark the cnode as modified.   We cannot call hfs_update()
5324		 * in this function because the caller hfs_reclaim_extent()
5325		 * is holding the catalog lock currently.
5326		 */
5327		cp->c_flag |= C_MODIFIED;
5328	} else {
5329		/* The newly split extent is for large EAs or is in overflow
5330		 * extent record, so update it directly in the btree using the
5331		 * iterator information from the shared extent_info structure
5332	 	 */
5333		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5334				&(extent_info->btdata), extent_info->recordlen);
5335		if (error) {
5336			printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error);
5337			goto out;
5338		}
5339	}
5340
5341	/* No extent entry to be shifted into another extent overflow record */
5342	if (shift_extent.blockCount == 0) {
5343		if (hfs_resize_debug) {
5344			printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n");
5345		}
5346		error = 0;
5347		goto out;
5348	}
5349
5350	/* The overflow extent entry has to be shifted into an extent
5351	 * overflow record.  This means that we might have to shift
5352	 * extent entries from all subsequent overflow records by one.
5353	 * We start iteration from the first record to the last record,
5354	 * and shift the extent entry from one record to another.
5355	 * We might have to create a new extent record for the last
5356	 * extent entry for the file.
5357	 */
5358
5359	/* Initialize iterator to search the next record */
5360	bzero(&iterator, sizeof(iterator));
5361	if (is_xattr) {
5362		/* Copy the key from the iterator that was used to update the modified attribute record. */
5363		xattr_key = (HFSPlusAttrKey *)&(iterator.key);
5364		bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey));
5365		/* Note: xattr_key->startBlock will be initialized later in the iteration loop */
5366
5367		MALLOC(xattr_rec, HFSPlusAttrRecord *,
5368				sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK);
5369		if (xattr_rec == NULL) {
5370			error = ENOMEM;
5371			goto out;
5372		}
5373		btdata.bufferAddress = xattr_rec;
5374		btdata.itemSize = sizeof(HFSPlusAttrRecord);
5375		btdata.itemCount = 1;
5376		extents = xattr_rec->overflowExtents.extents;
5377	} else {
5378		/* Initialize the extent key for the current file */
5379		extents_key = (HFSPlusExtentKey *) &(iterator.key);
5380		extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5381		extents_key->forkType = extent_info->forkType;
5382		extents_key->fileID = extent_info->fileID;
5383		/* Note: extents_key->startBlock will be initialized later in the iteration loop */
5384
5385		MALLOC(extents_rec, HFSPlusExtentRecord *,
5386				sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK);
5387		if (extents_rec == NULL) {
5388			error = ENOMEM;
5389			goto out;
5390		}
5391		btdata.bufferAddress = extents_rec;
5392		btdata.itemSize = sizeof(HFSPlusExtentRecord);
5393		btdata.itemCount = 1;
5394		extents = extents_rec[0];
5395	}
5396
5397	/* The overflow extent entry has to be shifted into an extent
5398	 * overflow record.  This means that we might have to shift
5399	 * extent entries from all subsequent overflow records by one.
5400	 * We start iteration from the first record to the last record,
5401	 * examine one extent record in each iteration and shift one
5402	 * extent entry from one record to another.  We might have to
5403	 * create a new extent record for the last extent entry for the
5404	 * file.
5405	 *
5406	 * If shift_extent.blockCount is non-zero, it means that there is
5407	 * an extent entry that needs to be shifted into the next
5408	 * overflow extent record.  We keep on going till there are no such
5409	 * entries left to be shifted.  This will also change the starting
5410	 * allocation block number of the extent record which is part of
5411	 * the key for the extent record in each iteration.  Note that
5412	 * because the extent record key is changing while we are searching,
5413	 * the record can not be updated directly, instead it has to be
5414	 * deleted and inserted again.
5415	 */
5416	while (shift_extent.blockCount) {
5417		if (hfs_resize_debug) {
5418			printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock);
5419		}
5420
5421		/* Search if there is any existing overflow extent record
5422		 * that matches the current file and the logical start block
5423		 * number.
5424		 *
5425		 * For this, the logical start block number in the key is
5426		 * the value calculated based on the logical start block
5427		 * number of the current extent record and the total number
5428		 * of blocks existing in the current extent record.
5429		 */
5430		if (is_xattr) {
5431			xattr_key->startBlock = read_recStartBlock;
5432		} else {
5433			extents_key->startBlock = read_recStartBlock;
5434		}
5435		error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator);
5436		if (error) {
5437			if (error != btNotFound) {
5438				printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5439				goto out;
5440			}
5441			/* No matching record was found, so create a new extent record.
5442			 * Note:  Since no record was found, we can't rely on the
5443			 * btree key in the iterator any longer.  This will be initialized
5444			 * later before we insert the record.
5445			 */
5446			create_record = true;
5447		}
5448
5449		/* The extra extent entry from the previous record is being inserted
5450		 * as the first entry in the current extent record.  This will change
5451		 * the file allocation block number (FABN) of the current extent
5452		 * record, which is the startBlock value from the extent record key.
5453		 * Since one extra entry is being inserted in the record, the new
5454		 * FABN for the record will less than old FABN by the number of blocks
5455		 * in the new extent entry being inserted at the start.  We have to
5456		 * do this before we update read_recStartBlock to point at the
5457		 * startBlock of the following record.
5458		 */
5459		write_recStartBlock = read_recStartBlock - shift_extent.blockCount;
5460		if (hfs_resize_debug) {
5461			if (create_record) {
5462				printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock);
5463			}
5464		}
5465
5466		/* Now update the read_recStartBlock to account for total number
5467		 * of blocks in this extent record.  It will now point to the
5468		 * starting allocation block number for the next extent record.
5469		 */
5470		for (i = 0; i < kHFSPlusExtentDensity; i++) {
5471			if (extents[i].blockCount == 0) {
5472				break;
5473			}
5474			read_recStartBlock += extents[i].blockCount;
5475		}
5476
5477		if (create_record == true) {
5478			/* Initialize new record content with only one extent entry */
5479			bzero(extents, sizeof(HFSPlusExtentRecord));
5480			/* The new record will contain only one extent entry */
5481			extents[0] = shift_extent;
5482			/* There are no more overflow extents to be shifted */
5483			shift_extent.startBlock = shift_extent.blockCount = 0;
5484
5485			if (is_xattr) {
5486				/* BTSearchRecord above returned btNotFound,
5487				 * but since the attribute btree is never empty
5488				 * if we are trying to insert new overflow
5489				 * record for the xattrs, the extents_key will
5490				 * contain correct data.  So we don't need to
5491				 * re-initialize it again like below.
5492				 */
5493
5494				/* Initialize the new xattr record */
5495				xattr_rec->recordType = kHFSPlusAttrExtents;
5496				xattr_rec->overflowExtents.reserved = 0;
5497				reclen = sizeof(HFSPlusAttrExtents);
5498			} else {
5499				/* BTSearchRecord above returned btNotFound,
5500				 * which means that extents_key content might
5501				 * not correspond to the record that we are
5502				 * trying to create, especially when the extents
5503				 * overflow btree is empty.  So we reinitialize
5504				 * the extents_key again always.
5505				 */
5506				extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5507				extents_key->forkType = extent_info->forkType;
5508				extents_key->fileID = extent_info->fileID;
5509
5510				/* Initialize the new extent record */
5511				reclen = sizeof(HFSPlusExtentRecord);
5512			}
5513		} else {
5514			/* The overflow extent entry from previous record will be
5515			 * the first entry in this extent record.  If the last
5516			 * extent entry in this record is valid, it will be shifted
5517			 * into the following extent record as its first entry.  So
5518			 * save the last entry before shifting entries in current
5519			 * record.
5520			 */
5521			last_extent = extents[kHFSPlusExtentDensity-1];
5522
5523			/* Shift all entries by one index towards the end */
5524			for (i = kHFSPlusExtentDensity-2; i >= 0; i--) {
5525				extents[i+1] = extents[i];
5526			}
5527
5528			/* Overflow extent entry saved from previous record
5529			 * is now the first entry in the current record.
5530			 */
5531			extents[0] = shift_extent;
5532
5533			if (hfs_resize_debug) {
5534				printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock);
5535			}
5536
5537			/* The last entry from current record will be the
5538			 * overflow entry which will be the first entry for
5539			 * the following extent record.
5540			 */
5541			shift_extent = last_extent;
5542
5543			/* Since the key->startBlock is being changed for this record,
5544			 * it should be deleted and inserted with the new key.
5545			 */
5546			error = BTDeleteRecord(extent_info->fcb, &iterator);
5547			if (error) {
5548				printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5549				goto out;
5550			}
5551			if (hfs_resize_debug) {
5552				printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock));
5553			}
5554		}
5555
5556		/* Insert the newly created or modified extent record */
5557		bzero(&iterator.hint, sizeof(iterator.hint));
5558		if (is_xattr) {
5559			xattr_key->startBlock = write_recStartBlock;
5560		} else {
5561			extents_key->startBlock = write_recStartBlock;
5562		}
5563		error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
5564		if (error) {
5565			printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
5566			goto out;
5567		}
5568		if (hfs_resize_debug) {
5569			printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
5570		}
5571	}
5572
5573out:
5574	/*
5575	 * Extents overflow btree or attributes btree headers might have
5576	 * been modified during the split/shift operation, so flush the
5577	 * changes to the disk while we are inside journal transaction.
5578	 * We should only be able to generate I/O that modifies the B-Tree
5579	 * header nodes while we're in the middle of a journal transaction.
5580	 * Otherwise it might result in panic during unmount.
5581	 */
5582	BTFlushPath(extent_info->fcb);
5583
5584	if (extents_rec) {
5585		FREE (extents_rec, M_TEMP);
5586	}
5587	if (xattr_rec) {
5588		FREE (xattr_rec, M_TEMP);
5589	}
5590	return error;
5591}
5592
5593
5594/*
5595 * Relocate an extent if it lies beyond the expected end of volume.
5596 *
5597 * This function is called for every extent of the file being relocated.
5598 * It allocates space for relocation, copies the data, deallocates
 * the old extent, and updates the corresponding on-disk extent.  If the
 * function does not find contiguous space to relocate an extent, it splits
 * the extent into smaller pieces so that it can be relocated out of the area of
5602 * disk being reclaimed.  As an optimization, if an extent lies partially
5603 * in the area of the disk being reclaimed, it is split so that we only
5604 * have to relocate the area that was overlapping with the area of disk
5605 * being reclaimed.
5606 *
5607 * Note that every extent is relocated in its own transaction so that
5608 * they do not overwhelm the journal.  This function handles the extent
5609 * record that exists in the catalog record, extent record from overflow
5610 * extents btree, and extents for large EAs.
5611 *
5612 * Inputs:
5613 *	extent_info - This is the structure that contains state about
5614 *	              the current file, extent, and extent record that
5615 *	              is being relocated.  This structure is shared
5616 *	              among code that traverses through all the extents
5617 *	              of the file, code that relocates extents, and
5618 *	              code that splits the extent.
5619 */
5620static int
5621hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
5622{
5623	int error = 0;
5624	int index;
5625	struct cnode *cp;
5626	u_int32_t oldStartBlock;
5627	u_int32_t oldBlockCount;
5628	u_int32_t newStartBlock;
5629	u_int32_t newBlockCount;
5630	u_int32_t roundedBlockCount;
5631	uint16_t node_size;
5632	uint32_t remainder_blocks;
5633	u_int32_t alloc_flags;
5634	int blocks_allocated = false;
5635
5636	index = extent_info->extent_index;
5637	cp = VTOC(extent_info->vp);
5638
5639	oldStartBlock = extent_info->extents[index].startBlock;
5640	oldBlockCount = extent_info->extents[index].blockCount;
5641
5642	if (0 && hfs_resize_debug) {
5643		printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
5644	}
5645
5646	/* If the current extent lies completely within allocLimit,
5647	 * it does not require any relocation.
5648	 */
5649	if ((oldStartBlock + oldBlockCount) <= allocLimit) {
5650		extent_info->cur_blockCount += oldBlockCount;
5651		return error;
5652	}
5653
5654	/* Every extent should be relocated in its own transaction
5655	 * to make sure that we don't overflow the journal buffer.
5656	 */
5657	error = hfs_start_transaction(hfsmp);
5658	if (error) {
5659		return error;
5660	}
5661	extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);
5662
5663	/* Check if the extent lies partially in the area to reclaim,
5664	 * i.e. it starts before allocLimit and ends beyond allocLimit.
5665	 * We have already skipped extents that lie completely within
5666	 * allocLimit in the check above, so we only check for the
5667	 * startBlock.  If it lies partially, split it so that we
5668	 * only relocate part of the extent.
5669	 */
5670	if (oldStartBlock < allocLimit) {
5671		newBlockCount = allocLimit - oldStartBlock;
5672
5673		if (hfs_resize_debug) {
5674			int idx = extent_info->extent_index;
5675			printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5676		}
5677
5678		/* If the extent belongs to a btree, check and trim
5679		 * it to be multiple of the node size.
5680		 */
5681		if (extent_info->is_sysfile) {
5682			node_size = get_btree_nodesize(extent_info->vp);
5683			/* If the btree node size is less than the block size,
5684			 * splitting this extent will not split a node across
5685			 * different extents.  So we only check and trim if
5686			 * node size is more than the allocation block size.
5687			 */
5688			if (node_size > hfsmp->blockSize) {
5689				remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5690				if (remainder_blocks) {
5691					newBlockCount -= remainder_blocks;
5692					if (hfs_resize_debug) {
5693						printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5694					}
5695				}
5696			}
5697			/* The newBlockCount is zero because of rounding-down so that
5698			 * btree nodes are not split across extents.  Therefore this
5699			 * straddling extent across resize-boundary does not require
5700			 * splitting.  Skip over to relocating of complete extent.
5701			 */
5702			if (newBlockCount == 0) {
5703				if (hfs_resize_debug) {
5704					printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n");
5705				}
5706				goto relocate_full_extent;
5707			}
5708		}
5709
5710		/* Split the extents into two parts --- the first extent lies
5711		 * completely within allocLimit and therefore does not require
5712		 * relocation.  The second extent will require relocation which
5713		 * will be handled when the caller calls this function again
5714		 * for the next extent.
5715		 */
5716		error = hfs_split_extent(extent_info, newBlockCount);
5717		if (error == 0) {
5718			/* Split success, no relocation required */
5719			goto out;
5720		}
5721		/* Split failed, so try to relocate entire extent */
5722		if (hfs_resize_debug) {
5723			int idx = extent_info->extent_index;
5724			printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
5725		}
5726	}
5727
5728relocate_full_extent:
5729	/* At this point, the current extent requires relocation.
5730	 * We will try to allocate space equal to the size of the extent
5731	 * being relocated first to try to relocate it without splitting.
5732	 * If the allocation fails, we will try to allocate contiguous
5733	 * blocks out of metadata zone.  If that allocation also fails,
5734	 * then we will take a whatever contiguous block run is returned
5735	 * by the allocation, split the extent into two parts, and then
5736	 * relocate the first splitted extent.
5737	 */
5738	alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
5739	if (extent_info->is_sysfile) {
5740		alloc_flags |= HFS_ALLOC_METAZONE;
5741	}
5742
5743	error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
5744			&newStartBlock, &newBlockCount);
5745	if ((extent_info->is_sysfile == false) &&
5746	    ((error == dskFulErr) || (error == ENOSPC))) {
5747		/* For non-system files, try reallocating space in metadata zone */
5748		alloc_flags |= HFS_ALLOC_METAZONE;
5749		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5750				alloc_flags, &newStartBlock, &newBlockCount);
5751	}
5752	if ((error == dskFulErr) || (error == ENOSPC)) {
5753		/* We did not find desired contiguous space for this extent.
5754		 * So don't worry about getting contiguity anymore.  Also, allow using
5755		 * blocks that were recently deallocated.
5756		 */
5757		alloc_flags &= ~HFS_ALLOC_FORCECONTIG;
5758		alloc_flags |= HFS_ALLOC_FLUSHTXN;
5759
5760		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
5761				alloc_flags, &newStartBlock, &newBlockCount);
5762		if (error) {
5763			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5764			goto out;
5765		}
5766		blocks_allocated = true;
5767
5768		/* The number of blocks allocated is less than the requested
5769		 * number of blocks.  For btree extents, check and trim the
5770		 * extent to be multiple of the node size.
5771		 */
5772		if (extent_info->is_sysfile) {
5773			node_size = get_btree_nodesize(extent_info->vp);
5774			if (node_size > hfsmp->blockSize) {
5775				remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
5776				if (remainder_blocks) {
5777					roundedBlockCount = newBlockCount - remainder_blocks;
5778					/* Free tail-end blocks of the newly allocated extent */
5779					BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount,
5780							       newBlockCount - roundedBlockCount,
5781							       HFS_ALLOC_SKIPFREEBLKS);
5782					newBlockCount = roundedBlockCount;
5783					if (hfs_resize_debug) {
5784						printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
5785					}
5786					if (newBlockCount == 0) {
5787						printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID);
5788						error = ENOSPC;
5789						goto out;
5790					}
5791				}
5792			}
5793		}
5794
5795		/* The number of blocks allocated is less than the number of
5796		 * blocks requested, so split this extent --- the first extent
5797		 * will be relocated as part of this function call and the caller
5798		 * will handle relocating the second extent by calling this
5799		 * function again for the second extent.
5800		 */
5801		error = hfs_split_extent(extent_info, newBlockCount);
5802		if (error) {
5803			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5804			goto out;
5805		}
5806		oldBlockCount = newBlockCount;
5807	}
5808	if (error) {
5809		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5810		goto out;
5811	}
5812	blocks_allocated = true;
5813
5814	/* Copy data from old location to new location */
5815	error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
5816			newStartBlock, newBlockCount, context);
5817	if (error) {
5818		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
5819		goto out;
5820	}
5821
5822	/* Update the extent record with the new start block information */
5823	extent_info->extents[index].startBlock = newStartBlock;
5824
5825	/* Sync the content back to the disk */
5826	if (extent_info->catalog_fp) {
5827		/* Update the extents in catalog record */
5828		if (extent_info->is_dirlink) {
5829			error = cat_update_dirlink(hfsmp, extent_info->forkType,
5830					extent_info->dirlink_desc, extent_info->dirlink_attr,
5831					&(extent_info->dirlink_fork->ff_data));
5832		} else {
5833			cp->c_flag |= C_MODIFIED;
5834			/* If this is a system file, sync volume headers on disk */
5835			if (extent_info->is_sysfile) {
5836				error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5837			}
5838		}
5839	} else {
5840		/* Replace record for extents overflow or extents-based xattrs */
5841		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5842				&(extent_info->btdata), extent_info->recordlen);
5843	}
5844	if (error) {
5845		printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error);
5846		goto out;
5847	}
5848
5849	/* Deallocate the old extent */
5850	error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5851	if (error) {
5852		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
5853		goto out;
5854	}
5855	extent_info->blocks_relocated += newBlockCount;
5856
5857	if (hfs_resize_debug) {
5858		printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
5859	}
5860
5861out:
5862	if (error != 0) {
5863		if (blocks_allocated == true) {
5864			BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
5865		}
5866	} else {
5867		/* On success, increment the total allocation blocks processed */
5868		extent_info->cur_blockCount += newBlockCount;
5869	}
5870
5871	hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
5872
5873	/* For a non-system file, if an extent entry from catalog record
5874	 * was modified, sync the in-memory changes to the catalog record
5875	 * on disk before ending the transaction.
5876	 */
5877	 if ((extent_info->catalog_fp) &&
5878	     (extent_info->is_sysfile == false)) {
5879		(void) hfs_update(extent_info->vp, MNT_WAIT);
5880	}
5881
5882	hfs_end_transaction(hfsmp);
5883
5884	return error;
5885}
5886
5887/* Report intermediate progress during volume resize */
5888static void
5889hfs_truncatefs_progress(struct hfsmount *hfsmp)
5890{
5891	u_int32_t cur_progress = 0;
5892
5893	hfs_resize_progress(hfsmp, &cur_progress);
5894	if (cur_progress > (hfsmp->hfs_resize_progress + 9)) {
5895		printf("hfs_truncatefs: %d%% done...\n", cur_progress);
5896		hfsmp->hfs_resize_progress = cur_progress;
5897	}
5898	return;
5899}
5900
5901/*
5902 * Reclaim space at the end of a volume for given file and forktype.
5903 *
5904 * This routine attempts to move any extent which contains allocation blocks
5905 * at or after "allocLimit."  A separate transaction is used for every extent
5906 * that needs to be moved.  If there is not contiguous space available for
5907 * moving an extent, it can be split into smaller extents.  The contents of
5908 * any moved extents are read and written via the volume's device vnode --
5909 * NOT via "vp."  During the move, moved blocks which are part of a transaction
5910 * have their physical block numbers invalidated so they will eventually be
5911 * written to their new locations.
5912 *
5913 * This function is also called for directory hard links.  Directory hard links
5914 * are regular files with no data fork and resource fork that contains alias
5915 * information for backward compatibility with pre-Leopard systems.  However
5916 * non-Mac OS X implementation can add/modify data fork or resource fork
5917 * information to directory hard links, so we check, and if required, relocate
5918 * both data fork and resource fork.
5919 *
5920 * Inputs:
5921 *    hfsmp       The volume being resized.
5922 *    vp          The vnode for the system file.
5923 *    fileID	  ID of the catalog record that needs to be relocated
 *    forktype	  The type of fork that needs to be relocated,
5925 *    			kHFSResourceForkType for resource fork,
5926 *    			kHFSDataForkType for data fork
5927 *    allocLimit  Allocation limit for the new volume size,
5928 *    		  do not use this block or beyond.  All extents
5929 *    		  that use this block or any blocks beyond this limit
5930 *    		  will be relocated.
5931 *
5932 * Side Effects:
5933 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
5934 * blocks that were relocated.
5935 */
5936static int
5937hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
5938		u_int8_t forktype, u_long allocLimit, vfs_context_t context)
5939{
5940	int error = 0;
5941	struct hfs_reclaim_extent_info *extent_info;
5942	int i;
5943	int lockflags = 0;
5944	struct cnode *cp;
5945	struct filefork *fp;
5946	int took_truncate_lock = false;
5947	int release_desc = false;
5948	HFSPlusExtentKey *key;
5949
5950	/* If there is no vnode for this file, then there's nothing to do. */
5951	if (vp == NULL) {
5952		return 0;
5953	}
5954
5955	cp = VTOC(vp);
5956
5957	if (hfs_resize_debug) {
5958		const char *filename = (const char *) cp->c_desc.cd_nameptr;
5959		int namelen = cp->c_desc.cd_namelen;
5960
5961		if (filename == NULL) {
5962			filename = "";
5963			namelen = 0;
5964		}
5965		printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename);
5966	}
5967
5968	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
5969	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
5970	if (extent_info == NULL) {
5971		return ENOMEM;
5972	}
5973	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
5974	extent_info->vp = vp;
5975	extent_info->fileID = fileID;
5976	extent_info->forkType = forktype;
5977	extent_info->is_sysfile = vnode_issystem(vp);
5978	if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
5979		extent_info->is_dirlink = true;
5980	}
5981	/* We always need allocation bitmap and extent btree lock */
5982	lockflags = SFL_BITMAP | SFL_EXTENTS;
5983	if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
5984		lockflags |= SFL_CATALOG;
5985	} else if (fileID == kHFSAttributesFileID) {
5986		lockflags |= SFL_ATTRIBUTE;
5987	} else if (fileID == kHFSStartupFileID) {
5988		lockflags |= SFL_STARTUP;
5989	}
5990	extent_info->lockflags = lockflags;
5991	extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);
5992
5993	/* Flush data associated with current file on disk.
5994	 *
5995	 * If the current vnode is directory hard link, no flushing of
5996	 * journal or vnode is required.  The current kernel does not
5997	 * modify data/resource fork of directory hard links, so nothing
5998	 * will be in the cache.  If a directory hard link is newly created,
5999	 * the resource fork data is written directly using devvp and
6000	 * the code that actually relocates data (hfs_copy_extent()) also
6001	 * uses devvp for its I/O --- so they will see a consistent copy.
6002	 */
6003	if (extent_info->is_sysfile) {
6004		/* If the current vnode is system vnode, flush journal
6005		 * to make sure that all data is written to the disk.
6006		 */
6007		error = hfs_journal_flush(hfsmp, TRUE);
6008		if (error) {
6009			printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
6010			goto out;
6011		}
6012	} else if (extent_info->is_dirlink == false) {
6013		/* Flush all blocks associated with this regular file vnode.
6014		 * Normally there should not be buffer cache blocks for regular
6015		 * files, but for objects like symlinks, we can have buffer cache
6016		 * blocks associated with the vnode.  Therefore we call
6017		 * buf_flushdirtyblks() also.
6018		 */
6019		buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");
6020
6021		hfs_unlock(cp);
6022		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
6023		took_truncate_lock = true;
6024		(void) cluster_push(vp, 0);
6025		error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
6026		if (error) {
6027			goto out;
6028		}
6029
6030		/* If the file no longer exists, nothing left to do */
6031		if (cp->c_flag & C_NOEXISTS) {
6032			error = 0;
6033			goto out;
6034		}
6035
6036		/* Wait for any in-progress writes to this vnode to complete, so that we'll
6037		 * be copying consistent bits.  (Otherwise, it's possible that an async
6038		 * write will complete to the old extent after we read from it.  That
6039		 * could lead to corruption.)
6040		 */
6041		error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
6042		if (error) {
6043			goto out;
6044		}
6045	}
6046
6047	if (hfs_resize_debug) {
6048		printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
6049	}
6050
6051	if (extent_info->is_dirlink) {
6052		MALLOC(extent_info->dirlink_desc, struct cat_desc *,
6053				sizeof(struct cat_desc), M_TEMP, M_WAITOK);
6054		MALLOC(extent_info->dirlink_attr, struct cat_attr *,
6055				sizeof(struct cat_attr), M_TEMP, M_WAITOK);
6056		MALLOC(extent_info->dirlink_fork, struct filefork *,
6057				sizeof(struct filefork), M_TEMP, M_WAITOK);
6058		if ((extent_info->dirlink_desc == NULL) ||
6059		    (extent_info->dirlink_attr == NULL) ||
6060		    (extent_info->dirlink_fork == NULL)) {
6061			error = ENOMEM;
6062			goto out;
6063		}
6064
6065		/* Lookup catalog record for directory hard link and
6066		 * create a fake filefork for the value looked up from
6067		 * the disk.
6068		 */
6069		fp = extent_info->dirlink_fork;
6070		bzero(extent_info->dirlink_fork, sizeof(struct filefork));
6071		extent_info->dirlink_fork->ff_cp = cp;
6072		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6073		error = cat_lookup_dirlink(hfsmp, fileID, forktype,
6074				extent_info->dirlink_desc, extent_info->dirlink_attr,
6075				&(extent_info->dirlink_fork->ff_data));
6076		hfs_systemfile_unlock(hfsmp, lockflags);
6077		if (error) {
6078			printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
6079			goto out;
6080		}
6081		release_desc = true;
6082	} else {
6083		fp = VTOF(vp);
6084	}
6085
6086	extent_info->catalog_fp = fp;
6087	extent_info->recStartBlock = 0;
6088	extent_info->extents = extent_info->catalog_fp->ff_extents;
6089	/* Relocate extents from the catalog record */
6090	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
6091		if (fp->ff_extents[i].blockCount == 0) {
6092			break;
6093		}
6094		extent_info->extent_index = i;
6095		error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6096		if (error) {
6097			printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
6098			goto out;
6099		}
6100	}
6101
6102	/* If the number of allocation blocks processed for reclaiming
6103	 * are less than total number of blocks for the file, continuing
6104	 * working on overflow extents record.
6105	 */
6106	if (fp->ff_blocks <= extent_info->cur_blockCount) {
6107		if (0 && hfs_resize_debug) {
6108			printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6109		}
6110		goto out;
6111	}
6112
6113	if (hfs_resize_debug) {
6114		printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
6115	}
6116
6117	MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
6118	if (extent_info->iterator == NULL) {
6119		error = ENOMEM;
6120		goto out;
6121	}
6122	bzero(extent_info->iterator, sizeof(struct BTreeIterator));
6123	key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
6124	key->keyLength = kHFSPlusExtentKeyMaximumLength;
6125	key->forkType = forktype;
6126	key->fileID = fileID;
6127	key->startBlock = extent_info->cur_blockCount;
6128
6129	extent_info->btdata.bufferAddress = extent_info->record.overflow;
6130	extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
6131	extent_info->btdata.itemCount = 1;
6132
6133	extent_info->catalog_fp = NULL;
6134
6135	/* Search the first overflow extent with expected startBlock as 'cur_blockCount' */
6136	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6137	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
6138			&(extent_info->btdata), &(extent_info->recordlen),
6139			extent_info->iterator);
6140	hfs_systemfile_unlock(hfsmp, lockflags);
6141	while (error == 0) {
6142		extent_info->overflow_count++;
6143		extent_info->recStartBlock = key->startBlock;
6144		extent_info->extents = extent_info->record.overflow;
6145		for (i = 0; i < kHFSPlusExtentDensity; i++) {
6146			if (extent_info->record.overflow[i].blockCount == 0) {
6147				goto out;
6148			}
6149			extent_info->extent_index = i;
6150			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
6151			if (error) {
6152				printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
6153				goto out;
6154			}
6155		}
6156
6157		/* Look for more overflow records */
6158		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
6159		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
6160				extent_info->iterator, &(extent_info->btdata),
6161				&(extent_info->recordlen));
6162		hfs_systemfile_unlock(hfsmp, lockflags);
6163		if (error) {
6164			break;
6165		}
6166		/* Stop when we encounter a different file or fork. */
6167		if ((key->fileID != fileID) || (key->forkType != forktype)) {
6168			break;
6169		}
6170	}
6171	if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6172		error = 0;
6173	}
6174
6175out:
6176	/* If any blocks were relocated, account them and report progress */
6177	if (extent_info->blocks_relocated) {
6178		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
6179		hfs_truncatefs_progress(hfsmp);
6180		if (fileID < kHFSFirstUserCatalogNodeID) {
6181			printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n",
6182					extent_info->blocks_relocated, fileID, hfsmp->vcbVN);
6183		}
6184	}
6185	if (extent_info->iterator) {
6186		FREE(extent_info->iterator, M_TEMP);
6187	}
6188	if (release_desc == true) {
6189		cat_releasedesc(extent_info->dirlink_desc);
6190	}
6191	if (extent_info->dirlink_desc) {
6192		FREE(extent_info->dirlink_desc, M_TEMP);
6193	}
6194	if (extent_info->dirlink_attr) {
6195		FREE(extent_info->dirlink_attr, M_TEMP);
6196	}
6197	if (extent_info->dirlink_fork) {
6198		FREE(extent_info->dirlink_fork, M_TEMP);
6199	}
6200	if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) {
6201		(void) hfs_update(vp, MNT_WAIT);
6202	}
6203	if (took_truncate_lock) {
6204		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
6205	}
6206	if (extent_info) {
6207		FREE(extent_info, M_TEMP);
6208	}
6209	if (hfs_resize_debug) {
6210		printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error);
6211	}
6212
6213	return error;
6214}
6215
6216
6217/*
6218 * This journal_relocate callback updates the journal info block to point
6219 * at the new journal location.  This write must NOT be done using the
6220 * transaction.  We must write the block immediately.  We must also force
6221 * it to get to the media so that the new journal location will be seen by
6222 * the replay code before we can safely let journaled blocks be written
6223 * to their normal locations.
6224 *
6225 * The tests for journal_uses_fua below are mildly hacky.  Since the journal
6226 * and the file system are both on the same device, I'm leveraging what
6227 * the journal has decided about FUA.
6228 */
struct hfs_journal_relocate_args {
	struct hfsmount *hfsmp;		/* volume whose journal is being relocated */
	vfs_context_t context;		/* caller's VFS context (supplies credentials for the JIB I/O) */
	u_int32_t newStartBlock;	/* first allocation block of the relocated journal */
	u_int32_t newBlockCount;	/* length of the relocated journal, in allocation blocks */
};
6235
6236static errno_t
6237hfs_journal_relocate_callback(void *_args)
6238{
6239	int error;
6240	struct hfs_journal_relocate_args *args = _args;
6241	struct hfsmount *hfsmp = args->hfsmp;
6242	buf_t bp;
6243	JournalInfoBlock *jibp;
6244
6245	error = buf_meta_bread(hfsmp->hfs_devvp,
6246		hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6247		hfsmp->blockSize, vfs_context_ucred(args->context), &bp);
6248	if (error) {
6249		printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error);
6250		if (bp) {
6251        		buf_brelse(bp);
6252		}
6253		return error;
6254	}
6255	jibp = (JournalInfoBlock*) buf_dataptr(bp);
6256	jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize);
6257	jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize);
6258	if (journal_uses_fua(hfsmp->jnl))
6259		buf_markfua(bp);
6260	error = buf_bwrite(bp);
6261	if (error) {
6262		printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error);
6263		return error;
6264	}
6265	if (!journal_uses_fua(hfsmp->jnl)) {
6266		error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context);
6267		if (error) {
6268			printf("hfs_journal_relocate_callback: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6269			error = 0;		/* Don't fail the operation. */
6270		}
6271	}
6272
6273	return error;
6274}
6275
6276
6277/* Type of resize operation in progress */
6278#define HFS_RESIZE_TRUNCATE	1
6279#define HFS_RESIZE_EXTEND	2
6280
6281/*
6282 * Core function to relocate the journal file.  This function takes the
6283 * journal size of the newly relocated journal --- the caller can
6284 * provide a new journal size if they want to change the size of
6285 * the journal.  The function takes care of updating the journal info
6286 * block and all other data structures correctly.
6287 *
6288 * Note: This function starts a transaction and grabs the btree locks.
6289 */
6290static int
6291hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context)
6292{
6293	int error;
6294	int journal_err;
6295	int lockflags;
6296	u_int32_t oldStartBlock;
6297	u_int32_t newStartBlock;
6298	u_int32_t oldBlockCount;
6299	u_int32_t newBlockCount;
6300	u_int32_t jnlBlockCount;
6301	u_int32_t alloc_skipfreeblks;
6302	struct cat_desc journal_desc;
6303	struct cat_attr journal_attr;
6304	struct cat_fork journal_fork;
6305	struct hfs_journal_relocate_args callback_args;
6306
6307	/* Calculate the number of allocation blocks required for the journal */
6308	jnlBlockCount = howmany(jnl_size, hfsmp->blockSize);
6309
6310	/*
6311	 * During truncatefs(), the volume free block count is updated
6312	 * before relocating data and reflects the total number of free
6313	 * blocks that will exist on volume after the resize is successful.
6314	 * This means that the allocation blocks required for relocation
6315	 * have already been reserved and accounted for in the free block
6316	 * count.  Therefore, block allocation and deallocation routines
6317	 * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS
6318	 * flag.
6319	 *
6320	 * This special handling is not required when the file system
6321	 * is being extended as we want all the allocated and deallocated
6322	 * blocks to be accounted for correctly.
6323	 */
6324	if (resize_type == HFS_RESIZE_TRUNCATE) {
6325		alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS;
6326	} else {
6327		alloc_skipfreeblks = 0;
6328	}
6329
6330	error = hfs_start_transaction(hfsmp);
6331	if (error) {
6332		printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error);
6333		return error;
6334	}
6335	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6336
6337	error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount,
6338			HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_FLUSHTXN | alloc_skipfreeblks,
6339			 &newStartBlock, &newBlockCount);
6340	if (error) {
6341		printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error);
6342		goto fail;
6343	}
6344	if (newBlockCount != jnlBlockCount) {
6345		printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount);
6346		goto free_fail;
6347	}
6348
6349	error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork);
6350	if (error) {
6351		printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error);
6352		goto free_fail;
6353	}
6354
6355	oldStartBlock = journal_fork.cf_extents[0].startBlock;
6356	oldBlockCount = journal_fork.cf_extents[0].blockCount;
6357	error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks);
6358	if (error) {
6359		printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6360		goto free_fail;
6361	}
6362
6363	/* Update the catalog record for .journal */
6364	journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
6365	journal_fork.cf_extents[0].startBlock = newStartBlock;
6366	journal_fork.cf_extents[0].blockCount = newBlockCount;
6367	journal_fork.cf_blocks = newBlockCount;
6368	error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL);
6369	cat_releasedesc(&journal_desc);  /* all done with cat descriptor */
6370	if (error) {
6371		printf("hfs_relocate_journal_file: cat_update returned %d\n", error);
6372		goto free_fail;
6373	}
6374
6375	/*
6376	 * If the journal is part of the file system, then tell the journal
6377	 * code about the new location.  If the journal is on an external
6378	 * device, then just keep using it as-is.
6379	 */
6380	if (hfsmp->jvp == hfsmp->hfs_devvp) {
6381		callback_args.hfsmp = hfsmp;
6382		callback_args.context = context;
6383		callback_args.newStartBlock = newStartBlock;
6384		callback_args.newBlockCount = newBlockCount;
6385
6386		error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize,
6387			(off_t)newBlockCount*hfsmp->blockSize, 0,
6388			hfs_journal_relocate_callback, &callback_args);
6389		if (error) {
6390			/* NOTE: journal_relocate will mark the journal invalid. */
6391			printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error);
6392			goto fail;
6393		}
6394		if (hfs_resize_debug) {
6395			printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
6396		}
6397		hfsmp->jnl_start = newStartBlock;
6398		hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize;
6399	}
6400
6401	hfs_systemfile_unlock(hfsmp, lockflags);
6402	error = hfs_end_transaction(hfsmp);
6403	if (error) {
6404		printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error);
6405	}
6406
6407	return error;
6408
6409free_fail:
6410	journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
6411	if (journal_err) {
6412		printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6413		hfs_mark_volume_inconsistent(hfsmp);
6414	}
6415fail:
6416	hfs_systemfile_unlock(hfsmp, lockflags);
6417	(void) hfs_end_transaction(hfsmp);
6418	if (hfs_resize_debug) {
6419		printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error);
6420	}
6421	return error;
6422}
6423
6424
6425/*
6426 * Relocate the journal file when the file system is being truncated.
6427 * We do not down-size the journal when the file system size is
6428 * reduced, so we always provide the current journal size to the
6429 * relocate code.
6430 */
6431static int
6432hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6433{
6434	int error = 0;
6435	u_int32_t startBlock;
6436	u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6437
6438	/*
6439	 * Figure out the location of the .journal file.  When the journal
6440	 * is on an external device, we need to look up the .journal file.
6441	 */
6442	if (hfsmp->jvp == hfsmp->hfs_devvp) {
6443		startBlock = hfsmp->jnl_start;
6444		blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6445	} else {
6446		u_int32_t fileid;
6447		u_int32_t old_jnlfileid;
6448		struct cat_attr attr;
6449		struct cat_fork fork;
6450
6451		/*
6452		 * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid
6453		 * is set, and it is trying to hide the .journal file.  So temporarily
6454		 * unset the field while calling GetFileInfo.
6455		 */
6456		old_jnlfileid = hfsmp->hfs_jnlfileid;
6457		hfsmp->hfs_jnlfileid = 0;
6458		fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork);
6459		hfsmp->hfs_jnlfileid = old_jnlfileid;
6460		if (fileid != old_jnlfileid) {
6461			printf("hfs_reclaim_journal_file: cannot find .journal file!\n");
6462			return EIO;
6463		}
6464
6465		startBlock = fork.cf_extents[0].startBlock;
6466		blockCount = fork.cf_extents[0].blockCount;
6467	}
6468
6469	if (startBlock + blockCount <= allocLimit) {
6470		/* The journal file does not require relocation */
6471		return 0;
6472	}
6473
6474	error = hfs_relocate_journal_file(hfsmp, blockCount * hfsmp->blockSize, HFS_RESIZE_TRUNCATE, context);
6475	if (error == 0) {
6476		hfsmp->hfs_resize_blocksmoved += blockCount;
6477		hfs_truncatefs_progress(hfsmp);
6478		printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n",
6479				blockCount, hfsmp->vcbVN);
6480	}
6481
6482	return error;
6483}
6484
6485
6486/*
6487 * Move the journal info block to a new location.  We have to make sure the
6488 * new copy of the journal info block gets to the media first, then change
6489 * the field in the volume header and the catalog record.
6490 */
6491static int
6492hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6493{
6494	int error;
6495	int journal_err;
6496	int lockflags;
6497	u_int32_t oldBlock;
6498	u_int32_t newBlock;
6499	u_int32_t blockCount;
6500	struct cat_desc jib_desc;
6501	struct cat_attr jib_attr;
6502	struct cat_fork jib_fork;
6503	buf_t old_bp, new_bp;
6504
6505	if (hfsmp->vcbJinfoBlock <= allocLimit) {
6506		/* The journal info block does not require relocation */
6507		return 0;
6508	}
6509
6510	error = hfs_start_transaction(hfsmp);
6511	if (error) {
6512		printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error);
6513		return error;
6514	}
6515	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6516
6517	error = BlockAllocate(hfsmp, 1, 1, 1,
6518			HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS | HFS_ALLOC_FLUSHTXN,
6519			&newBlock, &blockCount);
6520	if (error) {
6521		printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error);
6522		goto fail;
6523	}
6524	if (blockCount != 1) {
6525		printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount);
6526		goto free_fail;
6527	}
6528
6529	/* Copy the old journal info block content to the new location */
6530	error = buf_meta_bread(hfsmp->hfs_devvp,
6531		hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6532		hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
6533	if (error) {
6534		printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
6535		if (old_bp) {
6536        		buf_brelse(old_bp);
6537		}
6538		goto free_fail;
6539	}
6540	new_bp = buf_getblk(hfsmp->hfs_devvp,
6541		newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6542		hfsmp->blockSize, 0, 0, BLK_META);
6543	bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize);
6544	buf_brelse(old_bp);
6545	if (journal_uses_fua(hfsmp->jnl))
6546		buf_markfua(new_bp);
6547	error = buf_bwrite(new_bp);
6548	if (error) {
6549		printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error);
6550		goto free_fail;
6551	}
6552	if (!journal_uses_fua(hfsmp->jnl)) {
6553		error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
6554		if (error) {
6555			printf("hfs_reclaim_journal_info_block: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6556			/* Don't fail the operation. */
6557		}
6558	}
6559
6560	/* Deallocate the old block once the new one has the new valid content */
6561	error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS);
6562	if (error) {
6563		printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6564		goto free_fail;
6565	}
6566
6567
6568	/* Update the catalog record for .journal_info_block */
6569	error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork);
6570	if (error) {
6571		printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error);
6572		goto fail;
6573	}
6574	oldBlock = jib_fork.cf_extents[0].startBlock;
6575	jib_fork.cf_size = hfsmp->blockSize;
6576	jib_fork.cf_extents[0].startBlock = newBlock;
6577	jib_fork.cf_extents[0].blockCount = 1;
6578	jib_fork.cf_blocks = 1;
6579	error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL);
6580	cat_releasedesc(&jib_desc);  /* all done with cat descriptor */
6581	if (error) {
6582		printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error);
6583		goto fail;
6584	}
6585
6586	/* Update the pointer to the journal info block in the volume header. */
6587	hfsmp->vcbJinfoBlock = newBlock;
6588	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
6589	if (error) {
6590		printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
6591		goto fail;
6592	}
6593	hfs_systemfile_unlock(hfsmp, lockflags);
6594	error = hfs_end_transaction(hfsmp);
6595	if (error) {
6596		printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
6597	}
6598	error = hfs_journal_flush(hfsmp, FALSE);
6599	if (error) {
6600		printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
6601	}
6602
6603	/* Account for the block relocated and print progress */
6604	hfsmp->hfs_resize_blocksmoved += 1;
6605	hfs_truncatefs_progress(hfsmp);
6606	if (!error) {
6607		printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
6608				hfsmp->vcbVN);
6609		if (hfs_resize_debug) {
6610			printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
6611		}
6612	}
6613	return error;
6614
6615free_fail:
6616	journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
6617	if (journal_err) {
6618		printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6619		hfs_mark_volume_inconsistent(hfsmp);
6620	}
6621
6622fail:
6623	hfs_systemfile_unlock(hfsmp, lockflags);
6624	(void) hfs_end_transaction(hfsmp);
6625	if (hfs_resize_debug) {
6626		printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
6627	}
6628	return error;
6629}
6630
6631
6632static u_int64_t
6633calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count)
6634{
6635	u_int64_t journal_size;
6636	u_int32_t journal_scale;
6637
6638#define DEFAULT_JOURNAL_SIZE (8*1024*1024)
6639#define MAX_JOURNAL_SIZE     (512*1024*1024)
6640
6641	/* Calculate the journal size for this volume.   We want
6642	 * at least 8 MB of journal for each 100 GB of disk space.
6643	 * We cap the size at 512 MB, unless the allocation block
6644	 * size is larger, in which case, we use one allocation
6645	 * block.
6646	 */
6647	journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024);
6648	journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1);
6649	if (journal_size > MAX_JOURNAL_SIZE) {
6650		journal_size = MAX_JOURNAL_SIZE;
6651	}
6652	if (journal_size < hfsmp->blockSize) {
6653		journal_size = hfsmp->blockSize;
6654	}
6655	return journal_size;
6656}
6657
6658
6659/*
6660 * Calculate the expected journal size based on current partition size.
6661 * If the size of the current journal is less than the calculated size,
6662 * force journal relocation with the new journal size.
6663 */
6664static int
6665hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context)
6666{
6667	int error = 0;
6668	u_int64_t calc_journal_size;
6669
6670	if (hfsmp->jvp != hfsmp->hfs_devvp) {
6671		if (hfs_resize_debug) {
6672			printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n");
6673		}
6674		return 0;
6675	}
6676
6677	calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count);
6678	if (calc_journal_size <= hfsmp->jnl_size) {
6679		/* The journal size requires no modification */
6680		goto out;
6681	}
6682
6683	if (hfs_resize_debug) {
6684		printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size);
6685	}
6686
6687	/* Extend the journal to the new calculated size */
6688	error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context);
6689	if (error == 0) {
6690		printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n",
6691				hfsmp->jnl_size, hfsmp->vcbVN);
6692	}
6693out:
6694	return error;
6695}
6696
6697
6698/*
6699 * This function traverses through all extended attribute records for a given
6700 * fileID, and calls function that reclaims data blocks that exist in the
6701 * area of the disk being reclaimed which in turn is responsible for allocating
6702 * new space, copying extent data, deallocating new space, and if required,
6703 * splitting the extent.
6704 *
6705 * Note: The caller has already acquired the cnode lock on the file.  Therefore
6706 * we are assured that no other thread would be creating/deleting/modifying
6707 * extended attributes for this file.
6708 *
6709 * Side Effects:
6710 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
6711 * blocks that were relocated.
6712 *
6713 * Returns:
6714 * 	0 on success, non-zero on failure.
6715 */
static int
hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	struct hfs_reclaim_extent_info *extent_info;
	int i;
	HFSPlusAttrKey *key;
	/* Aliases extent_info->lockflags so callees see the current lock state */
	int *lockflags;

	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
	}

	/* Allocate and initialize the per-file relocation state */
	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
	if (extent_info == NULL) {
		return ENOMEM;
	}
	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
	extent_info->vp = vp;
	extent_info->fileID = fileID;
	extent_info->is_xattr = true;
	extent_info->is_sysfile = vnode_issystem(vp);
	/* Extent-based xattrs live in the attribute B-tree */
	extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
	lockflags = &(extent_info->lockflags);
	*lockflags = SFL_ATTRIBUTE | SFL_BITMAP;

	/* Initialize iterator from the extent_info structure */
	MALLOC(extent_info->iterator, struct BTreeIterator *,
	       sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (extent_info->iterator == NULL) {
		error = ENOMEM;
		goto out;
	}
	bzero(extent_info->iterator, sizeof(struct BTreeIterator));

	/* Build attribute key for the first xattr record of this fileID */
	key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
	error = hfs_buildattrkey(fileID, NULL, key);
	if (error) {
		goto out;
	}

	/* Initialize btdata from extent_info structure.  Note that the
	 * buffer pointer actually points to the xattr record from the
	 * extent_info structure itself.
	 */
	extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
	extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
	extent_info->btdata.itemCount = 1;

	/*
	 * Sync all extent-based attribute data to the disk.
	 *
	 * All extent-based attribute data I/O is performed via cluster
	 * I/O using a virtual file that spans across entire file system
	 * space.
	 */
	hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
	(void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
	error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
	hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_LOCK_DEFAULT);
	if (error) {
		goto out;
	}

	/* Search for extended attribute for current file.  This
	 * will place the iterator before the first matching record.
	 */
	*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
			&(extent_info->btdata), &(extent_info->recordlen),
			extent_info->iterator);
	hfs_systemfile_unlock(hfsmp, *lockflags);
	if (error) {
		if (error != btNotFound) {
			goto out;
		}
		/* btNotFound is expected here, so just mask it */
		error = 0;
	}

	while (1) {
		/* Iterate to the next record.  The B-tree lock is dropped
		 * between iterations; the cnode lock held by the caller
		 * keeps the file's xattrs stable (see function contract).
		 */
		*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
				extent_info->iterator, &(extent_info->btdata),
				&(extent_info->recordlen));
		hfs_systemfile_unlock(hfsmp, *lockflags);

		/* Stop the iteration if we encounter end of btree or xattr with different fileID */
		if (error || key->fileID != fileID) {
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}

		/* We only care about extent-based EAs */
		if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
		    (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
			continue;
		}

		/* Point extents at the record's extent array; overflow_count
		 * tracks how many overflow records follow the fork-data record.
		 */
		if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
			extent_info->overflow_count = 0;
			extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
		} else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
			extent_info->overflow_count++;
			extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
		}

		extent_info->recStartBlock = key->startBlock;
		/* Relocate each in-use extent in this record; a zero blockCount
		 * marks the end of the in-use extents.
		 */
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			if (extent_info->extents[i].blockCount == 0) {
				break;
			}
			extent_info->extent_index = i;
			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
			if (error) {
				printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
				goto out;
			}
		}
	}

out:
	/* If any blocks were relocated, account them and report progress */
	if (extent_info->blocks_relocated) {
		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
		hfs_truncatefs_progress(hfsmp);
	}
	if (extent_info->iterator) {
		FREE(extent_info->iterator, M_TEMP);
	}
	if (extent_info) {
		FREE(extent_info, M_TEMP);
	}
	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
	}
	return error;
}
6859
6860/*
6861 * Reclaim any extent-based extended attributes allocation blocks from
6862 * the area of the disk that is being truncated.
6863 *
6864 * The function traverses the attribute btree to find out the fileIDs
6865 * of the extended attributes that need to be relocated.  For every
6866 * file whose large EA requires relocation, it looks up the cnode and
6867 * calls hfs_reclaim_xattr() to do all the work for allocating
6868 * new space, copying data, deallocating old space, and if required,
6869 * splitting the extents.
6870 *
6871 * Inputs:
6872 * 	allocLimit    - starting block of the area being reclaimed
6873 *
6874 * Returns:
6875 *   	returns 0 on success, non-zero on failure.
6876 */
static int
hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	FCB *fcb;
	struct BTreeIterator *iterator = NULL;
	struct FSBufferDescriptor btdata;
	HFSPlusAttrKey *key;
	HFSPlusAttrRecord rec;
	int lockflags = 0;
	cnid_t prev_fileid = 0;		/* last fileID already relocated; used to skip duplicates */
	struct vnode *vp;
	int need_relocate;
	int btree_operation;
	u_int32_t files_moved = 0;
	u_int32_t prev_blocksmoved;
	int i;

	fcb = VTOF(hfsmp->hfs_attribute_vp);
	/* Store the value to print total blocks moved by this function in end */
	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
		return ENOMEM;
	}
	bzero(iterator, sizeof(*iterator));
	key = (HFSPlusAttrKey *)&iterator->key;
	btdata.bufferAddress = &rec;
	btdata.itemSize = sizeof(rec);
	btdata.itemCount = 1;

	need_relocate = false;
	btree_operation = kBTreeFirstRecord;
	/* Traverse the attribute btree to find extent-based EAs to reclaim */
	while (1) {
		/* Shared lock only while reading one record; dropped before
		 * the (potentially long) relocation work below.
		 */
		lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				/* Normal end of iteration, not a failure */
				error = 0;
			}
			break;
		}
		btree_operation = kBTreeNextRecord;

		/* If the extents of current fileID were already relocated, skip it */
		if (prev_fileid == key->fileID) {
			continue;
		}

		/* Check if any of the extents in the current record need to be relocated.
		 * An extent needs relocation when it extends past allocLimit; a zero
		 * blockCount marks the end of the in-use extents.
		 */
		need_relocate = false;
		switch(rec.recordType) {
			case kHFSPlusAttrForkData:
				for (i = 0; i < kHFSPlusExtentDensity; i++) {
					if (rec.forkData.theFork.extents[i].blockCount == 0) {
						break;
					}
					if ((rec.forkData.theFork.extents[i].startBlock +
					     rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
						need_relocate = true;
						break;
					}
				}
				break;

			case kHFSPlusAttrExtents:
				for (i = 0; i < kHFSPlusExtentDensity; i++) {
					if (rec.overflowExtents.extents[i].blockCount == 0) {
						break;
					}
					if ((rec.overflowExtents.extents[i].startBlock +
					     rec.overflowExtents.extents[i].blockCount) > allocLimit) {
						need_relocate = true;
						break;
					}
				}
				break;
		};

		/* Continue iterating to next attribute record */
		if (need_relocate == false) {
			continue;
		}

		/* Look up the vnode for corresponding file.  The cnode
		 * will be locked which will ensure that no one modifies
		 * the xattrs when we are relocating them.
		 *
		 * We want to allow open-unlinked files to be moved,
		 * so provide allow_deleted == 1 for hfs_vget().
		 */
		if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
			/* Lookup failure is not fatal; move on to the next record */
			continue;
		}

		error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error) {
			printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
			break;
		}
		prev_fileid = key->fileID;
		files_moved++;
	}

	if (files_moved) {
		printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
				(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
				files_moved, hfsmp->vcbVN);
	}

	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
	return error;
}
6994
6995/*
6996 * Reclaim blocks from regular files.
6997 *
6998 * This function iterates over all the record in catalog btree looking
6999 * for files with extents that overlap into the space we're trying to
7000 * free up.  If a file extent requires relocation, it looks up the vnode
7001 * and calls function to relocate the data.
7002 *
7003 * Returns:
7004 * 	Zero on success, non-zero on failure.
7005 */
7006static int
7007hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
7008{
7009	int error;
7010	FCB *fcb;
7011	struct BTreeIterator *iterator = NULL;
7012	struct FSBufferDescriptor btdata;
7013	int btree_operation;
7014	int lockflags;
7015	struct HFSPlusCatalogFile filerec;
7016	struct vnode *vp;
7017	struct vnode *rvp;
7018	struct filefork *datafork;
7019	u_int32_t files_moved = 0;
7020	u_int32_t prev_blocksmoved;
7021
7022#if CONFIG_PROTECT
7023	int keys_generated = 0;
7024#endif
7025
7026	fcb = VTOF(hfsmp->hfs_catalog_vp);
7027	/* Store the value to print total blocks moved by this function at the end */
7028	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
7029
7030	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
7031		error = ENOMEM;
7032		goto reclaim_filespace_done;
7033	}
7034
7035#if CONFIG_PROTECT
7036	/*
7037	 * For content-protected filesystems, we may need to relocate files that
7038	 * are encrypted.  If they use the new-style offset-based IVs, then
7039	 * we can move them regardless of the lock state.  We create a temporary
7040	 * key here that we use to read/write the data, then we discard it at the
7041	 * end of the function.
7042	 */
7043	if (cp_fs_protected (hfsmp->hfs_mp)) {
7044		int needs = 0;
7045		error = cp_needs_tempkeys(hfsmp, &needs);
7046
7047		if ((error == 0) && (needs)) {
7048			error = cp_entry_gentempkeys(&hfsmp->hfs_resize_cpentry, hfsmp);
7049			if (error == 0) {
7050				keys_generated = 1;
7051			}
7052		}
7053
7054		if (error) {
7055			printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error);
7056			goto reclaim_filespace_done;
7057		}
7058	}
7059
7060#endif
7061
7062	bzero(iterator, sizeof(*iterator));
7063
7064	btdata.bufferAddress = &filerec;
7065	btdata.itemSize = sizeof(filerec);
7066	btdata.itemCount = 1;
7067
7068	btree_operation = kBTreeFirstRecord;
7069	while (1) {
7070		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
7071		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
7072		hfs_systemfile_unlock(hfsmp, lockflags);
7073		if (error) {
7074			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
7075				error = 0;
7076			}
7077			break;
7078		}
7079		btree_operation = kBTreeNextRecord;
7080
7081		if (filerec.recordType != kHFSPlusFileRecord) {
7082			continue;
7083		}
7084
7085		/* Check if any of the extents require relocation */
7086		if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
7087			continue;
7088		}
7089
7090		/* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
7091		if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
7092			if (hfs_resize_debug) {
7093				printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID);
7094			}
7095			continue;
7096		}
7097
7098		/* If data fork exists or item is a directory hard link, relocate blocks */
7099		datafork = VTOF(vp);
7100		if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
7101			error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
7102					kHFSDataForkType, allocLimit, context);
7103			if (error)  {
7104				printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7105				hfs_unlock(VTOC(vp));
7106				vnode_put(vp);
7107				break;
7108			}
7109		}
7110
7111		/* If resource fork exists or item is a directory hard link, relocate blocks */
7112		if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
7113			if (vnode_isdir(vp)) {
7114				/* Resource fork vnode lookup is invalid for directory hard link.
7115				 * So we fake data fork vnode as resource fork vnode.
7116				 */
7117				rvp = vp;
7118			} else {
7119				error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
7120				if (error) {
7121					printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
7122					hfs_unlock(VTOC(vp));
7123					vnode_put(vp);
7124					break;
7125				}
7126				VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
7127			}
7128
7129			error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
7130					kHFSResourceForkType, allocLimit, context);
7131			if (error) {
7132				printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7133				hfs_unlock(VTOC(vp));
7134				vnode_put(vp);
7135				break;
7136			}
7137		}
7138
7139		/* The file forks were relocated successfully, now drop the
7140		 * cnode lock and vnode reference, and continue iterating to
7141		 * next catalog record.
7142		 */
7143		hfs_unlock(VTOC(vp));
7144		vnode_put(vp);
7145		files_moved++;
7146	}
7147
7148	if (files_moved) {
7149		printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
7150				(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
7151				files_moved, hfsmp->vcbVN);
7152	}
7153
7154reclaim_filespace_done:
7155	if (iterator) {
7156		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7157	}
7158
7159#if CONFIG_PROTECT
7160	if (keys_generated) {
7161		cp_entry_destroy(hfsmp->hfs_resize_cpentry);
7162		hfsmp->hfs_resize_cpentry = NULL;
7163	}
7164#endif
7165	return error;
7166}
7167
7168/*
7169 * Reclaim space at the end of a file system.
7170 *
7171 * Inputs -
7172 * 	allocLimit 	- start block of the space being reclaimed
7173 * 	reclaimblks 	- number of allocation blocks to reclaim
7174 */
7175static int
7176hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
7177{
7178	int error = 0;
7179
7180	/*
7181	 * Preflight the bitmap to find out total number of blocks that need
7182	 * relocation.
7183	 *
7184	 * Note: Since allocLimit is set to the location of new alternate volume
7185	 * header, the check below does not account for blocks allocated for old
7186	 * alternate volume header.
7187	 */
7188	error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks));
7189	if (error) {
7190		printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error);
7191		return error;
7192	}
7193	if (hfs_resize_debug) {
7194		printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks);
7195	}
7196
7197	/* Just to be safe, sync the content of the journal to the disk before we proceed */
7198	hfs_journal_flush(hfsmp, TRUE);
7199
7200	/* First, relocate journal file blocks if they're in the way.
7201	 * Doing this first will make sure that journal relocate code
7202	 * gets access to contiguous blocks on disk first.  The journal
7203	 * file has to be contiguous on the disk, otherwise resize will
7204	 * fail.
7205	 */
7206	error = hfs_reclaim_journal_file(hfsmp, allocLimit, context);
7207	if (error) {
7208		printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error);
7209		return error;
7210	}
7211
7212	/* Relocate journal info block blocks if they're in the way. */
7213	error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context);
7214	if (error) {
7215		printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error);
7216		return error;
7217	}
7218
7219	/* Relocate extents of the Extents B-tree if they're in the way.
7220	 * Relocating extents btree before other btrees is important as
7221	 * this will provide access to largest contiguous block range on
7222	 * the disk for relocating extents btree.  Note that extents btree
7223	 * can only have maximum of 8 extents.
7224	 */
7225	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID,
7226			kHFSDataForkType, allocLimit, context);
7227	if (error) {
7228		printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
7229		return error;
7230	}
7231
7232	/* Relocate extents of the Allocation file if they're in the way. */
7233	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID,
7234			kHFSDataForkType, allocLimit, context);
7235	if (error) {
7236		printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
7237		return error;
7238	}
7239
7240	/* Relocate extents of the Catalog B-tree if they're in the way. */
7241	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID,
7242			kHFSDataForkType, allocLimit, context);
7243	if (error) {
7244		printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
7245		return error;
7246	}
7247
7248	/* Relocate extents of the Attributes B-tree if they're in the way. */
7249	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID,
7250			kHFSDataForkType, allocLimit, context);
7251	if (error) {
7252		printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
7253		return error;
7254	}
7255
7256	/* Relocate extents of the Startup File if there is one and they're in the way. */
7257	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID,
7258			kHFSDataForkType, allocLimit, context);
7259	if (error) {
7260		printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
7261		return error;
7262	}
7263
7264	/*
7265	 * We need to make sure the alternate volume header gets flushed if we moved
7266	 * any extents in the volume header.  But we need to do that before
7267	 * shrinking the size of the volume, or else the journal code will panic
7268	 * with an invalid (too large) block number.
7269	 *
7270	 * Note that blks_moved will be set if ANY extent was moved, even
7271	 * if it was just an overflow extent.  In this case, the journal_flush isn't
7272	 * strictly required, but shouldn't hurt.
7273	 */
7274	if (hfsmp->hfs_resize_blocksmoved) {
7275		hfs_journal_flush(hfsmp, TRUE);
7276	}
7277
7278	/* Reclaim extents from catalog file records */
7279	error = hfs_reclaim_filespace(hfsmp, allocLimit, context);
7280	if (error) {
7281		printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error);
7282		return error;
7283	}
7284
7285	/* Reclaim extents from extent-based extended attributes, if any */
7286	error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context);
7287	if (error) {
7288		printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error);
7289		return error;
7290	}
7291
7292	return error;
7293}
7294
7295
7296/*
7297 * Check if there are any extents (including overflow extents) that overlap
7298 * into the disk space that is being reclaimed.
7299 *
7300 * Output -
7301 * 	true  - One of the extents need to be relocated
7302 * 	false - No overflow extents need to be relocated, or there was an error
7303 */
7304static int
7305hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
7306{
7307	struct BTreeIterator * iterator = NULL;
7308	struct FSBufferDescriptor btdata;
7309	HFSPlusExtentRecord extrec;
7310	HFSPlusExtentKey *extkeyptr;
7311	FCB *fcb;
7312	int overlapped = false;
7313	int i, j;
7314	int error;
7315	int lockflags = 0;
7316	u_int32_t endblock;
7317
7318	/* Check if data fork overlaps the target space */
7319	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7320		if (filerec->dataFork.extents[i].blockCount == 0) {
7321			break;
7322		}
7323		endblock = filerec->dataFork.extents[i].startBlock +
7324			filerec->dataFork.extents[i].blockCount;
7325		if (endblock > allocLimit) {
7326			overlapped = true;
7327			goto out;
7328		}
7329	}
7330
7331	/* Check if resource fork overlaps the target space */
7332	for (j = 0; j < kHFSPlusExtentDensity; ++j) {
7333		if (filerec->resourceFork.extents[j].blockCount == 0) {
7334			break;
7335		}
7336		endblock = filerec->resourceFork.extents[j].startBlock +
7337			filerec->resourceFork.extents[j].blockCount;
7338		if (endblock > allocLimit) {
7339			overlapped = true;
7340			goto out;
7341		}
7342	}
7343
7344	/* Return back if there are no overflow extents for this file */
7345	if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
7346		goto out;
7347	}
7348
7349	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
7350		return 0;
7351	}
7352	bzero(iterator, sizeof(*iterator));
7353	extkeyptr = (HFSPlusExtentKey *)&iterator->key;
7354	extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
7355	extkeyptr->forkType = 0;
7356	extkeyptr->fileID = filerec->fileID;
7357	extkeyptr->startBlock = 0;
7358
7359	btdata.bufferAddress = &extrec;
7360	btdata.itemSize = sizeof(extrec);
7361	btdata.itemCount = 1;
7362
7363	fcb = VTOF(hfsmp->hfs_extents_vp);
7364
7365	lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
7366
7367	/* This will position the iterator just before the first overflow
7368	 * extent record for given fileID.  It will always return btNotFound,
7369	 * so we special case the error code.
7370	 */
7371	error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
7372	if (error && (error != btNotFound)) {
7373		goto out;
7374	}
7375
7376	/* BTIterateRecord() might return error if the btree is empty, and
7377	 * therefore we return that the extent does not overflow to the caller
7378	 */
7379	error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7380	while (error == 0) {
7381		/* Stop when we encounter a different file. */
7382		if (extkeyptr->fileID != filerec->fileID) {
7383			break;
7384		}
7385		/* Check if any of the forks exist in the target space. */
7386		for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7387			if (extrec[i].blockCount == 0) {
7388				break;
7389			}
7390			endblock = extrec[i].startBlock + extrec[i].blockCount;
7391			if (endblock > allocLimit) {
7392				overlapped = true;
7393				goto out;
7394			}
7395		}
7396		/* Look for more records. */
7397		error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7398	}
7399
7400out:
7401	if (lockflags) {
7402		hfs_systemfile_unlock(hfsmp, lockflags);
7403	}
7404	if (iterator) {
7405		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7406	}
7407	return overlapped;
7408}
7409
7410
7411/*
7412 * Calculate the progress of a file system resize operation.
7413 */
7414__private_extern__
7415int
7416hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress)
7417{
7418	if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) {
7419		return (ENXIO);
7420	}
7421
7422	if (hfsmp->hfs_resize_totalblocks > 0) {
7423		*progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks);
7424	} else {
7425		*progress = 0;
7426	}
7427
7428	return (0);
7429}
7430
7431
7432/*
7433 * Creates a UUID from a unique "name" in the HFS UUID Name space.
7434 * See version 3 UUID.
7435 */
7436static void
7437hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result)
7438{
7439	MD5_CTX  md5c;
7440	uint8_t  rawUUID[8];
7441
7442	((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6];
7443	((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7];
7444
7445	MD5Init( &md5c );
7446	MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) );
7447	MD5Update( &md5c, rawUUID, sizeof (rawUUID) );
7448	MD5Final( result, &md5c );
7449
7450	result[6] = 0x30 | ( result[6] & 0x0F );
7451	result[8] = 0x80 | ( result[8] & 0x3F );
7452}
7453
7454/*
7455 * Get file system attributes.
7456 */
static int
hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
{
/* Attribute masks advertised by HFS:
 *  - files: everything except file type, fork count and fork list
 *  - volume common attributes: everything except access time
 */
#define HFS_ATTR_CMN_VALIDMASK ATTR_CMN_VALIDMASK
#define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST))
#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_ACCTIME))

	ExtendedVCB *vcb = VFSTOVCB(mp);
	struct hfsmount *hfsmp = VFSTOHFS(mp);
	u_int32_t freeCNIDs;

	/* searchfs/exchangedata availability depends on build configuration
	 * and (for exchangedata) on whether the volume is content-protected.
	 */
	int searchfs_on = 0;
	int exchangedata_on = 1;

#if CONFIG_SEARCHFS
	searchfs_on = 1;
#endif

#if CONFIG_PROTECT
	/* exchangedata is disabled on content-protected filesystems */
	if (cp_fs_protected(mp)) {
		exchangedata_on = 0;
	}
#endif

	/* CNIDs remaining before the 32-bit catalog-node-ID space runs out */
	freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID;

	/* Object/file/block counts straight out of the in-memory VCB */
	VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt);
	VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt);
	VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt);
	VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF);
	VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0));
	VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks);
	VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0));
	VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1));
	VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize);
	/* XXX needs clarification */
	VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1));
	/* Maximum files is constrained by total blocks. */
	VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2));
	VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1)));

	fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev;
	fsap->f_fsid.val[1] = vfs_typenum(mp);
	VFSATTR_SET_SUPPORTED(fsap, f_fsid);

	VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord);
	VFSATTR_RETURN(fsap, f_carbon_fsid, 0);

	if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
		vol_capabilities_attr_t *cap;

		cap = &fsap->f_capabilities;

		if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
			/* HFS+ & variants */
			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
				VOL_CAP_FMT_SYMBOLICLINKS |
				VOL_CAP_FMT_HARDLINKS |
				VOL_CAP_FMT_JOURNAL |
				VOL_CAP_FMT_ZERO_RUNS |
				(hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) |
				(hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) |
				VOL_CAP_FMT_CASE_PRESERVING |
				VOL_CAP_FMT_FAST_STATFS |
				VOL_CAP_FMT_2TB_FILESIZE |
				VOL_CAP_FMT_HIDDEN_FILES |
#if HFS_COMPRESSION
				VOL_CAP_FMT_PATH_FROM_ID |
				VOL_CAP_FMT_DECMPFS_COMPRESSION;
#else
				VOL_CAP_FMT_PATH_FROM_ID;
#endif
		}
#if CONFIG_HFS_STD
		else {
			/* HFS standard */
			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
				VOL_CAP_FMT_CASE_PRESERVING |
				VOL_CAP_FMT_FAST_STATFS |
				VOL_CAP_FMT_HIDDEN_FILES |
				VOL_CAP_FMT_PATH_FROM_ID;
		}
#endif

		/*
		 * The capabilities word in 'cap' tell you whether or not
		 * this particular filesystem instance has feature X enabled.
		 */

		cap->capabilities[VOL_CAPABILITIES_INTERFACES] =
			VOL_CAP_INT_ATTRLIST |
			VOL_CAP_INT_NFSEXPORT |
			VOL_CAP_INT_READDIRATTR |
			VOL_CAP_INT_ALLOCATE |
			VOL_CAP_INT_VOL_RENAME |
			VOL_CAP_INT_ADVLOCK |
			VOL_CAP_INT_FLOCK |
#if NAMEDSTREAMS
			VOL_CAP_INT_EXTENDED_ATTR |
			VOL_CAP_INT_NAMEDSTREAMS;
#else
			VOL_CAP_INT_EXTENDED_ATTR;
#endif

		/* HFS may conditionally support searchfs and exchangedata depending on the runtime */

		if (searchfs_on) {
			cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_SEARCHFS;
		}
		if (exchangedata_on) {
			cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXCHANGEDATA;
		}

		cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0;
		cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0;

		/* The 'valid' masks below state which bits of 'capabilities'
		 * are meaningful for this format, not whether the feature is
		 * currently on.
		 */
		cap->valid[VOL_CAPABILITIES_FORMAT] =
			VOL_CAP_FMT_PERSISTENTOBJECTIDS |
			VOL_CAP_FMT_SYMBOLICLINKS |
			VOL_CAP_FMT_HARDLINKS |
			VOL_CAP_FMT_JOURNAL |
			VOL_CAP_FMT_JOURNAL_ACTIVE |
			VOL_CAP_FMT_NO_ROOT_TIMES |
			VOL_CAP_FMT_SPARSE_FILES |
			VOL_CAP_FMT_ZERO_RUNS |
			VOL_CAP_FMT_CASE_SENSITIVE |
			VOL_CAP_FMT_CASE_PRESERVING |
			VOL_CAP_FMT_FAST_STATFS |
			VOL_CAP_FMT_2TB_FILESIZE |
			VOL_CAP_FMT_OPENDENYMODES |
			VOL_CAP_FMT_HIDDEN_FILES |
#if HFS_COMPRESSION
			VOL_CAP_FMT_PATH_FROM_ID |
			VOL_CAP_FMT_DECMPFS_COMPRESSION;
#else
			VOL_CAP_FMT_PATH_FROM_ID;
#endif

		/*
		 * Bits in the "valid" field tell you whether or not the on-disk
		 * format supports feature X.
		 */

		cap->valid[VOL_CAPABILITIES_INTERFACES] =
			VOL_CAP_INT_ATTRLIST |
			VOL_CAP_INT_NFSEXPORT |
			VOL_CAP_INT_READDIRATTR |
			VOL_CAP_INT_COPYFILE |
			VOL_CAP_INT_ALLOCATE |
			VOL_CAP_INT_VOL_RENAME |
			VOL_CAP_INT_ADVLOCK |
			VOL_CAP_INT_FLOCK |
			VOL_CAP_INT_MANLOCK |
#if NAMEDSTREAMS
			VOL_CAP_INT_EXTENDED_ATTR |
			VOL_CAP_INT_NAMEDSTREAMS;
#else
			VOL_CAP_INT_EXTENDED_ATTR;
#endif

		/* HFS always supports exchangedata and searchfs in the on-disk format natively */
		cap->valid[VOL_CAPABILITIES_INTERFACES] |= (VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_EXCHANGEDATA);


		cap->valid[VOL_CAPABILITIES_RESERVED1] = 0;
		cap->valid[VOL_CAPABILITIES_RESERVED2] = 0;
		VFSATTR_SET_SUPPORTED(fsap, f_capabilities);
	}
	if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) {
		vol_attributes_attr_t *attrp = &fsap->f_attributes;

		/* validattr == nativeattr here: everything HFS reports, it
		 * reports natively (ATTR_VOL_INFO is a query flag, not data).
		 */
        	attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
        	attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
        	attrp->validattr.dirattr = ATTR_DIR_VALIDMASK;
        	attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
        	attrp->validattr.forkattr = 0;

        	attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
        	attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
        	attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK;
        	attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
        	attrp->nativeattr.forkattr = 0;
		VFSATTR_SET_SUPPORTED(fsap, f_attributes);
	}
	/* Volume timestamps (whole seconds only on HFS) */
	fsap->f_create_time.tv_sec = hfsmp->hfs_itime;
	fsap->f_create_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_create_time);
	fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod;
	fsap->f_modify_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_modify_time);

	fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp;
	fsap->f_backup_time.tv_nsec = 0;
	VFSATTR_SET_SUPPORTED(fsap, f_backup_time);
	if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) {
		u_int16_t subtype = 0;

		/*
		 * Subtypes (flavors) for HFS
		 *   0:   Mac OS Extended
		 *   1:   Mac OS Extended (Journaled)
		 *   2:   Mac OS Extended (Case Sensitive)
		 *   3:   Mac OS Extended (Case Sensitive, Journaled)
		 *   4 - 127:   Reserved
		 * 128:   Mac OS Standard
		 *
		 */
		if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) {
			if (hfsmp->jnl) {
				subtype |= HFS_SUBTYPE_JOURNALED;
			}
			if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) {
				subtype |= HFS_SUBTYPE_CASESENSITIVE;
			}
		}
#if CONFIG_HFS_STD
		else {
			subtype = HFS_SUBTYPE_STANDARDHFS;
		}
#endif
		fsap->f_fssubtype = subtype;
		VFSATTR_SET_SUPPORTED(fsap, f_fssubtype);
	}

	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
		strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN);
		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
	}
	if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) {
		hfs_getvoluuid(hfsmp, fsap->f_uuid);
		VFSATTR_SET_SUPPORTED(fsap, f_uuid);
	}
	return (0);
}
7693
7694/*
7695 * Perform a volume rename.  Requires the FS' root vp.
7696 */
static int
hfs_rename_volume(struct vnode *vp, const char *name, proc_t p)
{
	ExtendedVCB *vcb = VTOVCB(vp);
	struct cnode *cp = VTOC(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	struct cat_desc to_desc;
	struct cat_desc todir_desc;
	struct cat_desc new_desc;
	cat_cookie_t cookie;
	int lockflags;
	int error = 0;
	char converted_volname[256];
	size_t volname_length = 0;
	size_t conv_volname_length = 0;


	/*
	 * Ignore attempts to rename a volume to a zero-length name.
	 */
	if (name[0] == 0)
		return(0);

	bzero(&to_desc, sizeof(to_desc));
	bzero(&todir_desc, sizeof(todir_desc));
	bzero(&new_desc, sizeof(new_desc));
	bzero(&cookie, sizeof(cookie));

	/* Destination "directory" is the root parent; the rename happens
	 * in place on the root folder's catalog record.
	 */
	todir_desc.cd_parentcnid = kHFSRootParentID;
	todir_desc.cd_cnid = kHFSRootFolderID;
	todir_desc.cd_flags = CD_ISDIR;

	/* Target descriptor carries the new name for the root folder. */
	to_desc.cd_nameptr = (const u_int8_t *)name;
	to_desc.cd_namelen = strlen(name);
	to_desc.cd_parentcnid = kHFSRootParentID;
	to_desc.cd_cnid = cp->c_cnid;
	to_desc.cd_flags = CD_ISDIR;

	/* Ordering: cnode lock -> transaction -> catalog preflight ->
	 * catalog lock.  Each level is unwound before its parent below.
	 */
	if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) == 0) {
		if ((error = hfs_start_transaction(hfsmp)) == 0) {
			if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) {
				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

				error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc);

				/*
				 * If successful, update the name in the VCB, ensure it's terminated.
				 */
				if (error == 0) {
					strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));

					volname_length = strlen ((const char*)vcb->vcbVN);
#define DKIOCCSSETLVNAME _IOW('d', 198, char[256])
					/* Send the volume name down to CoreStorage if necessary */
					error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
					if (error == 0) {
						(void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
					}
					/* Normalization/ioctl failures are best-effort;
					 * they do not fail the rename itself.
					 */
					error = 0;
				}

				hfs_systemfile_unlock(hfsmp, lockflags);
				cat_postflight(hfsmp, &cookie, p);

				/* NOTE(review): VCB is marked dirty only on *error*
				 * before the header flush below — presumably to force
				 * a rewrite after a partial failure; confirm this is
				 * intentional and not an inverted condition.
				 */
				if (error)
					MarkVCBDirty(vcb);
				(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			}
			hfs_end_transaction(hfsmp);
		}
		if (!error) {
			/* Release old allocated name buffer */
			if (cp->c_desc.cd_flags & CD_HASBUF) {
				const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;

				cp->c_desc.cd_nameptr = 0;
				cp->c_desc.cd_namelen = 0;
				cp->c_desc.cd_flags &= ~CD_HASBUF;
				vfs_removename(tmp_name);
			}
			/* Update cnode's catalog descriptor; new_desc ownership
			 * transfers to the cnode here.
			 */
			replace_desc(cp, &new_desc);
			vcb->volumeNameEncodingHint = new_desc.cd_encoding;
			cp->c_touch_chgtime = TRUE;
		}

		hfs_unlock(cp);
	}

	return(error);
}
7788
7789/*
7790 * Get file system attributes.
7791 */
7792static int
7793hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7794{
7795	kauth_cred_t cred = vfs_context_ucred(context);
7796	int error = 0;
7797
7798	/*
7799	 * Must be superuser or owner of filesystem to change volume attributes
7800	 */
7801	if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
7802		return(EACCES);
7803
7804	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7805		vnode_t root_vp;
7806
7807		error = hfs_vfs_root(mp, &root_vp, context);
7808		if (error)
7809			goto out;
7810
7811		error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
7812		(void) vnode_put(root_vp);
7813		if (error)
7814			goto out;
7815
7816		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7817	}
7818
7819out:
7820	return error;
7821}
7822
7823/* If a runtime corruption is detected, set the volume inconsistent
7824 * bit in the volume attributes.  The volume inconsistent bit is a persistent
7825 * bit which represents that the volume is corrupt and needs repair.
7826 * The volume inconsistent bit can be set from the kernel when it detects
7827 * runtime corruption or from file system repair utilities like fsck_hfs when
7828 * a repair operation fails.  The bit should be cleared only from file system
7829 * verify/repair utility like fsck_hfs when a verify/repair succeeds.
7830 */
7831void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp)
7832{
7833	hfs_lock_mount (hfsmp);
7834	if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) {
7835		hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask;
7836		MarkVCBDirty(hfsmp);
7837	}
7838	if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) {
7839		/* Log information to ASL log */
7840		fslog_fs_corrupt(hfsmp->hfs_mp);
7841		printf("hfs: Runtime corruption detected on %s, fsck will be forced on next mount.\n", hfsmp->vcbVN);
7842	}
7843	hfs_unlock_mount (hfsmp);
7844}
7845
7846/* Replay the journal on the device node provided.  Returns zero if
7847 * journal replay succeeded or no journal was supposed to be replayed.
7848 */
7849static int hfs_journal_replay(vnode_t devvp, vfs_context_t context)
7850{
7851	int retval = 0;
7852	int error = 0;
7853	struct mount *mp = NULL;
7854	struct hfs_mount_args *args = NULL;
7855
7856	/* Replay allowed only on raw devices */
7857	if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) {
7858		retval = EINVAL;
7859		goto out;
7860	}
7861
7862	/* Create dummy mount structures */
7863	MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK);
7864	if (mp == NULL) {
7865		retval = ENOMEM;
7866		goto out;
7867	}
7868	bzero(mp, sizeof(struct mount));
7869	mount_lock_init(mp);
7870
7871	MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK);
7872	if (args == NULL) {
7873		retval = ENOMEM;
7874		goto out;
7875	}
7876	bzero(args, sizeof(struct hfs_mount_args));
7877
7878	retval = hfs_mountfs(devvp, mp, args, 1, context);
7879	buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay");
7880
7881	/* FSYNC the devnode to be sure all data has been flushed */
7882	error = VNOP_FSYNC(devvp, MNT_WAIT, context);
7883	if (error) {
7884		retval = error;
7885	}
7886
7887out:
7888	if (mp) {
7889		mount_lock_destroy(mp);
7890		FREE(mp, M_TEMP);
7891	}
7892	if (args) {
7893		FREE(args, M_TEMP);
7894	}
7895	return retval;
7896}
7897
7898/*
7899 * hfs vfs operations.
7900 */
/* NOTE: positional initializer — the entry order must match the member
 * order of struct vfsops in <sys/mount.h>; do not reorder.
 */
struct vfsops hfs_vfsops = {
	hfs_mount,
	hfs_start,
	hfs_unmount,
	hfs_vfs_root,
	hfs_quotactl,
	hfs_vfs_getattr, 	/* was hfs_statfs */
	hfs_sync,
	hfs_vfs_vget,
	hfs_fhtovp,
	hfs_vptofh,
	hfs_init,
	hfs_sysctl,
	hfs_vfs_setattr,
	{NULL}			/* zero-fill the remaining entries */
};
7917