1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h"
38#include "xfs_inode.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_alloc.h"
42#include "xfs_rtalloc.h"
43#include "xfs_bmap.h"
44#include "xfs_error.h"
45#include "xfs_rw.h"
46#include "xfs_quota.h"
47#include "xfs_fsops.h"
48
49STATIC void	xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
50STATIC int	xfs_uuid_mount(xfs_mount_t *);
51STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
52STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
53
54
55#ifdef HAVE_PERCPU_SB
56STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
57STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
58STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
59STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
60						int, int);
61STATIC int	xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
62						int, int);
63STATIC int	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
64
65#else
66
67#define xfs_icsb_destroy_counters(mp)			do { } while (0)
68#define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
69#define xfs_icsb_sync_counters(mp)			do { } while (0)
70#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
71#define xfs_icsb_modify_counters_locked(mp, a, b, c)	do { } while (0)
72
73#endif
74
75static const struct {
76	short offset;
77	short type;	/* 0 = integer
78			 * 1 = binary / string (no translation)
79			 */
80} xfs_sb_info[] = {
81    { offsetof(xfs_sb_t, sb_magicnum),   0 },
82    { offsetof(xfs_sb_t, sb_blocksize),  0 },
83    { offsetof(xfs_sb_t, sb_dblocks),    0 },
84    { offsetof(xfs_sb_t, sb_rblocks),    0 },
85    { offsetof(xfs_sb_t, sb_rextents),   0 },
86    { offsetof(xfs_sb_t, sb_uuid),       1 },
87    { offsetof(xfs_sb_t, sb_logstart),   0 },
88    { offsetof(xfs_sb_t, sb_rootino),    0 },
89    { offsetof(xfs_sb_t, sb_rbmino),     0 },
90    { offsetof(xfs_sb_t, sb_rsumino),    0 },
91    { offsetof(xfs_sb_t, sb_rextsize),   0 },
92    { offsetof(xfs_sb_t, sb_agblocks),   0 },
93    { offsetof(xfs_sb_t, sb_agcount),    0 },
94    { offsetof(xfs_sb_t, sb_rbmblocks),  0 },
95    { offsetof(xfs_sb_t, sb_logblocks),  0 },
96    { offsetof(xfs_sb_t, sb_versionnum), 0 },
97    { offsetof(xfs_sb_t, sb_sectsize),   0 },
98    { offsetof(xfs_sb_t, sb_inodesize),  0 },
99    { offsetof(xfs_sb_t, sb_inopblock),  0 },
100    { offsetof(xfs_sb_t, sb_fname[0]),   1 },
101    { offsetof(xfs_sb_t, sb_blocklog),   0 },
102    { offsetof(xfs_sb_t, sb_sectlog),    0 },
103    { offsetof(xfs_sb_t, sb_inodelog),   0 },
104    { offsetof(xfs_sb_t, sb_inopblog),   0 },
105    { offsetof(xfs_sb_t, sb_agblklog),   0 },
106    { offsetof(xfs_sb_t, sb_rextslog),   0 },
107    { offsetof(xfs_sb_t, sb_inprogress), 0 },
108    { offsetof(xfs_sb_t, sb_imax_pct),   0 },
109    { offsetof(xfs_sb_t, sb_icount),     0 },
110    { offsetof(xfs_sb_t, sb_ifree),      0 },
111    { offsetof(xfs_sb_t, sb_fdblocks),   0 },
112    { offsetof(xfs_sb_t, sb_frextents),  0 },
113    { offsetof(xfs_sb_t, sb_uquotino),   0 },
114    { offsetof(xfs_sb_t, sb_gquotino),   0 },
115    { offsetof(xfs_sb_t, sb_qflags),     0 },
116    { offsetof(xfs_sb_t, sb_flags),      0 },
117    { offsetof(xfs_sb_t, sb_shared_vn),  0 },
118    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
119    { offsetof(xfs_sb_t, sb_unit),	 0 },
120    { offsetof(xfs_sb_t, sb_width),	 0 },
121    { offsetof(xfs_sb_t, sb_dirblklog),	 0 },
122    { offsetof(xfs_sb_t, sb_logsectlog), 0 },
123    { offsetof(xfs_sb_t, sb_logsectsize),0 },
124    { offsetof(xfs_sb_t, sb_logsunit),	 0 },
125    { offsetof(xfs_sb_t, sb_features2),	 0 },
126    { sizeof(xfs_sb_t),			 0 }
127};
128
129/*
130 * Return a pointer to an initialized xfs_mount structure.
131 */
132xfs_mount_t *
133xfs_mount_init(void)
134{
135	xfs_mount_t *mp;
136
137	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
138
139	if (xfs_icsb_init_counters(mp)) {
140		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
141	}
142
143	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
144	spinlock_init(&mp->m_sb_lock, "xfs_sb");
145	/* FreeBSD specfic */
146	sx_init(&mp->m_ilock, "xfs_mnt");
147	initnsema(&mp->m_growlock, 1, "xfs_grow");
148	/*
149	 * Initialize the AIL.
150	 */
151	xfs_trans_ail_init(mp);
152
153	atomic_set(&mp->m_active_trans, 0);
154
155	return mp;
156}
157
158/*
159 * Free up the resources associated with a mount structure.  Assume that
160 * the structure was initially zeroed, so we can tell which fields got
161 * initialized.
162 */
163void
164xfs_mount_free(
165	xfs_mount_t	*mp,
166	int		remove_bhv)
167{
168	if (mp->m_ihash)
169		xfs_ihash_free(mp);
170	if (mp->m_chash)
171		xfs_chash_free(mp);
172
173	if (mp->m_perag) {
174		int	agno;
175
176		for (agno = 0; agno < mp->m_maxagi; agno++)
177			if (mp->m_perag[agno].pagb_list)
178				kmem_free(mp->m_perag[agno].pagb_list,
179						sizeof(xfs_perag_busy_t) *
180							XFS_PAGB_NUM_SLOTS);
181		kmem_free(mp->m_perag,
182			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
183	}
184
185	AIL_LOCK_DESTROY(&mp->m_ail_lock);
186	spinlock_destroy(&mp->m_sb_lock);
187	/* FreeBSD specfic */
188	sx_destroy(&mp->m_ilock);
189	freesema(&mp->m_growlock);
190	if (mp->m_quotainfo)
191		XFS_QM_DONE(mp);
192
193	if (mp->m_fsname != NULL)
194		kmem_free(mp->m_fsname, mp->m_fsname_len);
195	if (mp->m_rtname != NULL)
196		kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
197	if (mp->m_logname != NULL)
198		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
199
200	if (remove_bhv) {
201		xfs_vfs_t	*vfsp = XFS_MTOVFS(mp);
202
203		bhv_remove_all_vfsops(vfsp, 0);
204		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
205	}
206
207	xfs_icsb_destroy_counters(mp);
208	kmem_free(mp, sizeof(xfs_mount_t));
209}
210
211
212/*
213 * Check the validity of the SB found.
214 */
215STATIC int
216xfs_mount_validate_sb(
217	xfs_mount_t	*mp,
218	xfs_sb_t	*sbp,
219	int		flags)
220{
221	/*
222	 * If the log device and data device have the
223	 * same device number, the log is internal.
224	 * Consequently, the sb_logstart should be non-zero.  If
225	 * we have a zero sb_logstart in this case, we may be trying to mount
226	 * a volume filesystem in a non-volume manner.
227	 */
228	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
229		xfs_fs_mount_cmn_err(flags, "bad magic number");
230		return XFS_ERROR(EWRONGFS);
231	}
232
233	if (!XFS_SB_GOOD_VERSION(sbp)) {
234		xfs_fs_mount_cmn_err(flags, "bad version");
235		return XFS_ERROR(EWRONGFS);
236	}
237
238	if (unlikely(
239	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
240		xfs_fs_mount_cmn_err(flags,
241			"filesystem is marked as having an external log; "
242			"specify logdev on the\nmount command line.");
243		return XFS_ERROR(EINVAL);
244	}
245
246	if (unlikely(
247	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
248		xfs_fs_mount_cmn_err(flags,
249			"filesystem is marked as having an internal log; "
250			"do not specify logdev on\nthe mount command line.");
251		return XFS_ERROR(EINVAL);
252	}
253
254	/*
255	 * More sanity checking. These were stolen directly from
256	 * xfs_repair.
257	 */
258	if (unlikely(
259	    sbp->sb_agcount <= 0					||
260	    sbp->sb_sectsize < XFS_MIN_SECTORSIZE			||
261	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
262	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
263	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
264	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
265	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
266	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
267	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
268	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
269	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
270	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
271	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
272	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
273	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
274	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
275	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
276		xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
277		return XFS_ERROR(EFSCORRUPTED);
278	}
279
280	/*
281	 * Sanity check AG count, size fields against data size field
282	 */
283	if (unlikely(
284	    sbp->sb_dblocks == 0 ||
285	    sbp->sb_dblocks >
286	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
287	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
288			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
289		xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed");
290		return XFS_ERROR(EFSCORRUPTED);
291	}
292
293	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
294	ASSERT(sbp->sb_blocklog >= BBSHIFT);
295
296#if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
297	if (unlikely(
298	    (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
299	    (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
300#else                  /* Limited by UINT_MAX of sectors */
301	if (unlikely(
302	    (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
303	    (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
304#endif
305		xfs_fs_mount_cmn_err(flags,
306			"file system too large to be mounted on this system.");
307		return XFS_ERROR(E2BIG);
308	}
309
310	if (unlikely(sbp->sb_inprogress)) {
311		xfs_fs_mount_cmn_err(flags, "file system busy");
312		return XFS_ERROR(EFSCORRUPTED);
313	}
314
315	/*
316	 * Version 1 directory format has never worked on Linux.
317	 */
318	if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) {
319		xfs_fs_mount_cmn_err(flags,
320			"file system using version 1 directory format");
321		return XFS_ERROR(ENOSYS);
322	}
323
324	/*
325	 * Until this is fixed only page-sized or smaller data blocks work.
326	 */
327	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
328		xfs_fs_mount_cmn_err(flags,
329			"file system with blocksize %d bytes",
330			sbp->sb_blocksize);
331		xfs_fs_mount_cmn_err(flags,
332			"only pagesize (%ld) or less will currently work.",
333			PAGE_SIZE);
334		return XFS_ERROR(ENOSYS);
335	}
336
337	return 0;
338}
339
340xfs_agnumber_t
341xfs_initialize_perag(
342	struct xfs_vfs	*vfs,
343	xfs_mount_t	*mp,
344	xfs_agnumber_t	agcount)
345{
346	xfs_agnumber_t	index, max_metadata;
347	xfs_perag_t	*pag;
348	xfs_agino_t	agino;
349	xfs_ino_t	ino;
350	xfs_sb_t	*sbp = &mp->m_sb;
351	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
352
353	/* Check to see if the filesystem can overflow 32 bit inodes */
354	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
355	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
356
357	/* Clear the mount flag if no inode can overflow 32 bits
358	 * on this filesystem, or if specifically requested..
359	 */
360	if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) {
361		mp->m_flags |= XFS_MOUNT_32BITINODES;
362	} else {
363		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
364	}
365
366	/* If we can overflow then setup the ag headers accordingly */
367	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
368		/* Calculate how much should be reserved for inodes to
369		 * meet the max inode percentage.
370		 */
371		if (mp->m_maxicount) {
372			__uint64_t	icount;
373
374			icount = sbp->sb_dblocks * sbp->sb_imax_pct;
375			do_div(icount, 100);
376			icount += sbp->sb_agblocks - 1;
377			do_div(icount, sbp->sb_agblocks);
378			max_metadata = icount;
379		} else {
380			max_metadata = agcount;
381		}
382		for (index = 0; index < agcount; index++) {
383			ino = XFS_AGINO_TO_INO(mp, index, agino);
384			if (ino > max_inum) {
385				index++;
386				break;
387			}
388
389			/* This ag is preferred for inodes */
390			pag = &mp->m_perag[index];
391			pag->pagi_inodeok = 1;
392			if (index < max_metadata)
393				pag->pagf_metadata = 1;
394		}
395	} else {
396		/* Setup default behavior for smaller filesystems */
397		for (index = 0; index < agcount; index++) {
398			pag = &mp->m_perag[index];
399			pag->pagi_inodeok = 1;
400		}
401	}
402	return index;
403}
404
405/*
406 * xfs_xlatesb
407 *
408 *     data       - on disk version of sb
409 *     sb         - a superblock
410 *     dir        - conversion direction: <0 - convert sb to buf
411 *                                        >0 - convert buf to sb
412 *     fields     - which fields to copy (bitmask)
413 */
414void
415xfs_xlatesb(
416	void		*data,
417	xfs_sb_t	*sb,
418	int		dir,
419	__int64_t	fields)
420{
421	xfs_caddr_t	buf_ptr;
422	xfs_caddr_t	mem_ptr;
423	xfs_sb_field_t	f;
424	int		first;
425	int		size;
426
427	ASSERT(dir);
428	ASSERT(fields);
429
430	if (!fields)
431		return;
432
433	buf_ptr = (xfs_caddr_t)data;
434	mem_ptr = (xfs_caddr_t)sb;
435
436	while (fields) {
437		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
438		first = xfs_sb_info[f].offset;
439		size = xfs_sb_info[f + 1].offset - first;
440
441		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
442
443		if (size == 1 || xfs_sb_info[f].type == 1) {
444			if (dir > 0) {
445				memcpy(mem_ptr + first, buf_ptr + first, size);
446			} else {
447				memcpy(buf_ptr + first, mem_ptr + first, size);
448			}
449		} else {
450			switch (size) {
451			case 2:
452				INT_XLATE(*(__uint16_t*)(buf_ptr+first),
453					  *(__uint16_t*)(mem_ptr+first),
454					  dir, ARCH_CONVERT);
455				break;
456			case 4:
457				INT_XLATE(*(__uint32_t*)(buf_ptr+first),
458					  *(__uint32_t*)(mem_ptr+first),
459					  dir, ARCH_CONVERT);
460				break;
461			case 8:
462				INT_XLATE(*(__uint64_t*)(buf_ptr+first),
463					  *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT);
464				break;
465			default:
466				ASSERT(0);
467			}
468		}
469
470		fields &= ~(1LL << f);
471	}
472}
473
474/*
475 * xfs_readsb
476 *
477 * Does the initial read of the superblock.
478 */
479int
480xfs_readsb(xfs_mount_t *mp, int flags)
481{
482	unsigned int	sector_size;
483	unsigned int	extra_flags;
484	xfs_buf_t	*bp;
485	xfs_sb_t	*sbp;
486	int		error;
487
488	ASSERT(mp->m_sb_bp == NULL);
489	ASSERT(mp->m_ddev_targp != NULL);
490
491	/*
492	 * Allocate a (locked) buffer to hold the superblock.
493	 * This will be kept around at all times to optimize
494	 * access to the superblock.
495	 */
496	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
497        extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
498
499	bp = xfs_getsb(mp,0);
500
501	if (!bp || XFS_BUF_ISERROR(bp)) {
502		xfs_fs_mount_cmn_err(flags, "SB read failed");
503		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
504		goto fail;
505	}
506	ASSERT(XFS_BUF_ISBUSY(bp));
507	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
508
509	/*
510	 * Initialize the mount structure from the superblock.
511	 * But first do some basic consistency checking.
512	 */
513	sbp = XFS_BUF_TO_SBP(bp);
514	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);
515
516	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
517	if (error) {
518		xfs_fs_mount_cmn_err(flags, "SB validate failed");
519		goto fail;
520	}
521
522	/*
523	 * We must be able to do sector-sized and sector-aligned IO.
524	 */
525	if (sector_size > mp->m_sb.sb_sectsize) {
526		xfs_fs_mount_cmn_err(flags,
527			"device supports only %u byte sectors (not %u)",
528			sector_size, mp->m_sb.sb_sectsize);
529		error = ENOSYS;
530		goto fail;
531	}
532
533	/*
534	 * If device sector size is smaller than the superblock size,
535	 * re-read the superblock so the buffer is correctly sized.
536	 */
537	if (sector_size < mp->m_sb.sb_sectsize) {
538		XFS_BUF_UNMANAGE(bp);
539		xfs_buf_relse(bp);
540		sector_size = mp->m_sb.sb_sectsize;
541		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
542					BTOBB(sector_size), extra_flags);
543		if (!bp || XFS_BUF_ISERROR(bp)) {
544			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
545			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
546			goto fail;
547		}
548		ASSERT(XFS_BUF_ISBUSY(bp));
549		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
550	}
551
552	xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
553	xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
554	xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
555
556	mp->m_sb_bp = bp;
557	xfs_buf_relse(bp);
558	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
559	return 0;
560
561 fail:
562	if (bp) {
563		XFS_BUF_UNMANAGE(bp);
564		xfs_buf_relse(bp);
565	}
566	return error;
567}
568
569
570/*
571 * xfs_mount_common
572 *
573 * Mount initialization code establishing various mount
574 * fields from the superblock associated with the given
575 * mount structure
576 */
577STATIC void
578xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
579{
580	int	i;
581
582	mp->m_agfrotor = mp->m_agirotor = 0;
583	spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
584	mp->m_maxagi = mp->m_sb.sb_agcount;
585	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
586	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
587	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
588	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
589	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
590	mp->m_litino = sbp->sb_inodesize -
591		((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
592	mp->m_blockmask = sbp->sb_blocksize - 1;
593	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
594	mp->m_blockwmask = mp->m_blockwsize - 1;
595#ifdef RMC
596	INIT_LIST_HEAD(&mp->m_del_inodes);
597#endif
598	TAILQ_INIT(&mp->m_del_inodes);
599
600	/*
601	 * Setup for attributes, in case they get created.
602	 * This value is for inodes getting attributes for the first time,
603	 * the per-inode value is for old attribute values.
604	 */
605	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
606	switch (sbp->sb_inodesize) {
607	case 256:
608		mp->m_attroffset = XFS_LITINO(mp) -
609				   XFS_BMDR_SPACE_CALC(MINABTPTRS);
610		break;
611	case 512:
612	case 1024:
613	case 2048:
614		mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
615		break;
616	default:
617		ASSERT(0);
618	}
619	ASSERT(mp->m_attroffset < XFS_LITINO(mp));
620
621	for (i = 0; i < 2; i++) {
622		mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
623			xfs_alloc, i == 0);
624		mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
625			xfs_alloc, i == 0);
626	}
627	for (i = 0; i < 2; i++) {
628		mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
629			xfs_bmbt, i == 0);
630		mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
631			xfs_bmbt, i == 0);
632	}
633	for (i = 0; i < 2; i++) {
634		mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
635			xfs_inobt, i == 0);
636		mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
637			xfs_inobt, i == 0);
638	}
639
640	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
641	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
642					sbp->sb_inopblock);
643	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
644}
645/*
646 * xfs_mountfs
647 *
648 * This function does the following on an initial mount of a file system:
649 *	- reads the superblock from disk and init the mount struct
650 *	- if we're a 32-bit kernel, do a size check on the superblock
651 *		so we don't mount terabyte filesystems
652 *	- init mount struct realtime fields
653 *	- allocate inode hash table for fs
654 *	- init directory manager
655 *	- perform recovery and init the log manager
656 */
657int
658xfs_mountfs(
659	xfs_vfs_t	*vfsp,
660	xfs_mount_t	*mp,
661	int		mfsi_flags)
662{
663	xfs_buf_t	*bp;
664	xfs_sb_t	*sbp = &(mp->m_sb);
665	xfs_inode_t	*rip;
666	xfs_vnode_t	*rvp = NULL;
667	int		readio_log, writeio_log;
668	xfs_daddr_t	d;
669	__uint64_t	ret64;
670	__int64_t	update_flags;
671	uint		quotamount, quotaflags;
672	int		agno;
673	int		uuid_mounted = 0;
674	int		error = 0;
675
676	if (mp->m_sb_bp == NULL) {
677		if ((error = xfs_readsb(mp, mfsi_flags))) {
678			return error;
679		}
680	}
681	xfs_mount_common(mp, sbp);
682
683	/*
684	 * Check if sb_agblocks is aligned at stripe boundary
685	 * If sb_agblocks is NOT aligned turn off m_dalign since
686	 * allocator alignment is within an ag, therefore ag has
687	 * to be aligned at stripe boundary.
688	 */
689	update_flags = 0LL;
690	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
691		/*
692		 * If stripe unit and stripe width are not multiples
693		 * of the fs blocksize turn off alignment.
694		 */
695		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
696		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
697			if (mp->m_flags & XFS_MOUNT_RETERR) {
698				cmn_err(CE_WARN,
699					"XFS: alignment check 1 failed");
700				error = XFS_ERROR(EINVAL);
701				goto error1;
702			}
703			mp->m_dalign = mp->m_swidth = 0;
704		} else {
705			/*
706			 * Convert the stripe unit and width to FSBs.
707			 */
708			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
709			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
710				if (mp->m_flags & XFS_MOUNT_RETERR) {
711					error = XFS_ERROR(EINVAL);
712					goto error1;
713				}
714				xfs_fs_cmn_err(CE_WARN, mp,
715"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
716					mp->m_dalign, mp->m_swidth,
717					sbp->sb_agblocks);
718
719				mp->m_dalign = 0;
720				mp->m_swidth = 0;
721			} else if (mp->m_dalign) {
722				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
723			} else {
724				if (mp->m_flags & XFS_MOUNT_RETERR) {
725					xfs_fs_cmn_err(CE_WARN, mp,
726"stripe alignment turned off: sunit(%d) less than bsize(%d)",
727                                        	mp->m_dalign,
728						mp->m_blockmask +1);
729					error = XFS_ERROR(EINVAL);
730					goto error1;
731				}
732				mp->m_swidth = 0;
733			}
734		}
735
736		/*
737		 * Update superblock with new values
738		 * and log changes
739		 */
740		if (XFS_SB_VERSION_HASDALIGN(sbp)) {
741			if (sbp->sb_unit != mp->m_dalign) {
742				sbp->sb_unit = mp->m_dalign;
743				update_flags |= XFS_SB_UNIT;
744			}
745			if (sbp->sb_width != mp->m_swidth) {
746				sbp->sb_width = mp->m_swidth;
747				update_flags |= XFS_SB_WIDTH;
748			}
749		}
750	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
751		    XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
752			mp->m_dalign = sbp->sb_unit;
753			mp->m_swidth = sbp->sb_width;
754	}
755
756	xfs_alloc_compute_maxlevels(mp);
757	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
758	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
759	xfs_ialloc_compute_maxlevels(mp);
760
761	if (sbp->sb_imax_pct) {
762		__uint64_t	icount;
763
764		/* Make sure the maximum inode count is a multiple of the
765		 * units we allocate inodes in.
766		 */
767
768		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
769		do_div(icount, 100);
770		do_div(icount, mp->m_ialloc_blks);
771		mp->m_maxicount = (icount * mp->m_ialloc_blks)  <<
772				   sbp->sb_inopblog;
773	} else
774		mp->m_maxicount = 0;
775
776	mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
777
778	/*
779	 * XFS uses the uuid from the superblock as the unique
780	 * identifier for fsid.  We can not use the uuid from the volume
781	 * since a single partition filesystem is identical to a single
782	 * partition volume/filesystem.
783	 */
784	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
785	    (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
786		if (xfs_uuid_mount(mp)) {
787			error = XFS_ERROR(EINVAL);
788			goto error1;
789		}
790		uuid_mounted=1;
791		ret64 = uuid_hash64(&sbp->sb_uuid);
792		memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64));
793	}
794
795	/*
796	 * Set the default minimum read and write sizes unless
797	 * already specified in a mount option.
798	 * We use smaller I/O sizes when the file system
799	 * is being used for NFS service (wsync mount option).
800	 */
801	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
802		if (mp->m_flags & XFS_MOUNT_WSYNC) {
803			readio_log = XFS_WSYNC_READIO_LOG;
804			writeio_log = XFS_WSYNC_WRITEIO_LOG;
805		} else {
806			readio_log = XFS_READIO_LOG_LARGE;
807			writeio_log = XFS_WRITEIO_LOG_LARGE;
808		}
809	} else {
810		readio_log = mp->m_readio_log;
811		writeio_log = mp->m_writeio_log;
812	}
813
814	/*
815	 * Set the number of readahead buffers to use based on
816	 * physical memory size.
817	 */
818	if (xfs_physmem <= 4096)		/* <= 16MB */
819		mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
820	else if (xfs_physmem <= 8192)	/* <= 32MB */
821		mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
822	else
823		mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
824	if (sbp->sb_blocklog > readio_log) {
825		mp->m_readio_log = sbp->sb_blocklog;
826	} else {
827		mp->m_readio_log = readio_log;
828	}
829	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
830	if (sbp->sb_blocklog > writeio_log) {
831		mp->m_writeio_log = sbp->sb_blocklog;
832	} else {
833		mp->m_writeio_log = writeio_log;
834	}
835	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
836
837	/*
838	 * Set the inode cluster size based on the physical memory
839	 * size.  This may still be overridden by the file system
840	 * block size if it is larger than the chosen cluster size.
841	 */
842	if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
843		mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
844	} else {
845		mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
846	}
847	/*
848	 * Set whether we're using inode alignment.
849	 */
850	if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
851	    mp->m_sb.sb_inoalignmt >=
852	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
853		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
854	else
855		mp->m_inoalign_mask = 0;
856	/*
857	 * If we are using stripe alignment, check whether
858	 * the stripe unit is a multiple of the inode alignment
859	 */
860	if (mp->m_dalign && mp->m_inoalign_mask &&
861	    !(mp->m_dalign & mp->m_inoalign_mask))
862		mp->m_sinoalign = mp->m_dalign;
863	else
864		mp->m_sinoalign = 0;
865	/*
866	 * Check that the data (and log if separate) are an ok size.
867	 */
868	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
869	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
870		cmn_err(CE_WARN, "XFS: size check 1 failed");
871		error = XFS_ERROR(E2BIG);
872		goto error1;
873	}
874	error = xfs_read_buf(mp, mp->m_ddev_targp,
875			     d - XFS_FSS_TO_BB(mp, 1),
876			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
877	if (!error) {
878		xfs_buf_relse(bp);
879	} else {
880		cmn_err(CE_WARN, "XFS: size check 2 failed");
881		if (error == ENOSPC) {
882			error = XFS_ERROR(E2BIG);
883		}
884		goto error1;
885	}
886
887	if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
888	    mp->m_logdev_targp != mp->m_ddev_targp) {
889		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
890		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
891			cmn_err(CE_WARN, "XFS: size check 3 failed");
892			error = XFS_ERROR(E2BIG);
893			goto error1;
894		}
895		error = xfs_read_buf(mp, mp->m_logdev_targp,
896				     d - XFS_FSB_TO_BB(mp, 1),
897				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
898		if (!error) {
899			xfs_buf_relse(bp);
900		} else {
901			cmn_err(CE_WARN, "XFS: size check 3 failed");
902			if (error == ENOSPC) {
903				error = XFS_ERROR(E2BIG);
904			}
905			goto error1;
906		}
907	}
908
909	/*
910	 * Initialize realtime fields in the mount structure
911	 */
912	if ((error = xfs_rtmount_init(mp))) {
913		cmn_err(CE_WARN, "XFS: RT mount failed");
914		goto error1;
915	}
916
917	/*
918	 * For client case we are done now
919	 */
920	if (mfsi_flags & XFS_MFSI_CLIENT) {
921		return 0;
922	}
923
924	/*
925	 *  Copies the low order bits of the timestamp and the randomly
926	 *  set "sequence" number out of a UUID.
927	 */
928	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
929
930	/*
931	 *  The vfs structure needs to have a file system independent
932	 *  way of checking for the invariant file system ID.  Since it
933	 *  can't look at mount structures it has a pointer to the data
934	 *  in the mount structure.
935	 *
936	 *  File systems that don't support user level file handles (i.e.
937	 *  all of them except for XFS) will leave vfs_altfsid as NULL.
938	 */
939	vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
940	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
941
942	/*
943	 * Select the right directory manager.
944	 */
945	mp->m_dirops =
946		XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
947			xfsv2_dirops :
948			xfsv1_dirops;
949
950	/*
951	 * Initialize directory manager's entries.
952	 */
953	XFS_DIR_MOUNT(mp);
954
955	/*
956	 * Initialize the attribute manager's entries.
957	 */
958	mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
959
960	/*
961	 * Initialize the precomputed transaction reservations values.
962	 */
963	xfs_trans_init(mp);
964
965	/*
966	 * Allocate and initialize the inode hash table for this
967	 * file system.
968	 */
969	xfs_ihash_init(mp);
970	xfs_chash_init(mp);
971
972	/*
973	 * Allocate and initialize the per-ag data.
974	 */
975	init_rwsem(&mp->m_peraglock);
976	mp->m_perag =
977		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
978
979	mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount);
980
981	/*
982	 * log's mount-time initialization. Perform 1st part recovery if needed
983	 */
984	if (likely(sbp->sb_logblocks > 0)) {	/* check for volume case */
985		error = xfs_log_mount(mp, mp->m_logdev_targp,
986				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
987				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
988		if (error) {
989			cmn_err(CE_WARN, "XFS: log mount failed");
990			goto error2;
991		}
992	} else {	/* No log has been defined */
993		cmn_err(CE_WARN, "XFS: no log defined");
994		XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
995		error = XFS_ERROR(EFSCORRUPTED);
996		goto error2;
997	}
998
999	/*
1000	 * Get and sanity-check the root inode.
1001	 * Save the pointer to it in the mount structure.
1002	 */
1003	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
1004	if (error) {
1005		cmn_err(CE_WARN, "XFS: failed to read root inode");
1006		goto error3;
1007	}
1008
1009	ASSERT(rip != NULL);
1010	rvp = XFS_ITOV(rip);
1011
1012	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1013		cmn_err(CE_WARN, "XFS: corrupted root inode");
1014		printf("Root inode %p is not a directory: %llu",
1015		       mp->m_ddev_targp, (unsigned long long)rip->i_ino);
1016		xfs_iunlock(rip, XFS_ILOCK_EXCL);
1017		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1018				 mp);
1019		error = XFS_ERROR(EFSCORRUPTED);
1020		goto error4;
1021	}
1022	mp->m_rootip = rip;	/* save it */
1023
1024	xfs_iunlock(rip, XFS_ILOCK_EXCL);
1025
1026	/*
1027	 * Initialize realtime inode pointers in the mount structure
1028	 */
1029	if ((error = xfs_rtmount_inodes(mp))) {
1030		/*
1031		 * Free up the root inode.
1032		 */
1033		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1034		goto error4;
1035	}
1036
1037	/*
1038	 * If fs is not mounted readonly, then update the superblock
1039	 * unit and width changes.
1040	 */
1041	if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
1042		xfs_mount_log_sbunit(mp, update_flags);
1043
1044	/*
1045	 * Initialise the XFS quota management subsystem for this mount
1046	 */
1047	if ((error = XFS_QM_INIT(mp, &quotamount, &quotaflags)))
1048		goto error4;
1049
1050	/*
1051	 * Finish recovering the file system.  This part needed to be
1052	 * delayed until after the root and real-time bitmap inodes
1053	 * were consistently read in.
1054	 */
1055	error = xfs_log_mount_finish(mp, mfsi_flags);
1056	if (error) {
1057		cmn_err(CE_WARN, "XFS: log mount finish failed");
1058		goto error4;
1059	}
1060
1061	/*
1062	 * Complete the quota initialisation, post-log-replay component.
1063	 */
1064	if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
1065		goto error4;
1066
1067	return 0;
1068
1069 error4:
1070	/*
1071	 * Free up the root inode.
1072	 */
1073	VN_RELE(rvp);
1074 error3:
1075	xfs_log_unmount_dealloc(mp);
1076 error2:
1077	xfs_ihash_free(mp);
1078	xfs_chash_free(mp);
1079	for (agno = 0; agno < sbp->sb_agcount; agno++)
1080		if (mp->m_perag[agno].pagb_list)
1081			kmem_free(mp->m_perag[agno].pagb_list,
1082			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
1083	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
1084	mp->m_perag = NULL;
1085	/* FALLTHROUGH */
1086 error1:
1087	if (uuid_mounted)
1088		xfs_uuid_unmount(mp);
1089	xfs_freesb(mp);
1090	return error;
1091}
1092
/*
 * xfs_unmountfs
 *
 * This flushes out the inodes, dquots and the superblock, unmounts the
 * log and makes sure that incore structures are freed.
 *
 * 'cr' is only handed on to xfs_unmountfs_close().  The return value of
 * xfs_unmountfs_writesb() is not checked here, and this routine always
 * returns 0.
 */
int
xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
{
	struct xfs_vfs	*vfsp = XFS_MTOVFS(mp);
#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
	int64_t		fsid;
#endif

	xfs_iflush_all(mp);

	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);

	/*
	 * Flush out the log synchronously so that we know for sure
	 * that nothing is pinned.  This is important because bflush()
	 * will skip pinned buffers.
	 */
	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);

	/* Invalidate the data device buffers, and the realtime ones if any. */
	xfs_binval(mp->m_ddev_targp);
	if (mp->m_rtdev_targp) {
		xfs_binval(mp->m_rtdev_targp);
	}

	xfs_unmountfs_writesb(mp);

	xfs_unmountfs_wait(mp); 		/* wait for async bufs */

	xfs_log_unmount(mp);			/* Done! No more fs ops. */

	xfs_freesb(mp);

	/*
	 * All inodes from this mount point should be freed.
	 * The ASSERT(mp->m_inodes == NULL) that belongs here is disabled;
	 * leftover inodes are reported on the console instead.
	 */
	if (mp->m_inodes != NULL ) {
		printf("WRONG: mp->m_ireclaims: %d\n", mp->m_ireclaims);
		printf("WRONG: mp->m_inodes: %p\n", mp->m_inodes);
	}

	xfs_unmountfs_close(mp, cr);
	/* Only filesystems mounted without XFS_MOUNT_NOUUID are in the table. */
	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
		xfs_uuid_unmount(mp);

#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
	/*
	 * clear all error tags on this filesystem
	 */
	memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t));
	xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
#endif
	XFS_IODONE(vfsp);
	xfs_mount_free(mp, 1);
	return 0;
}
1155
1156void
1157xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
1158{
1159	if (mp->m_logdev_targp != mp->m_ddev_targp)
1160		xfs_free_buftarg(mp->m_logdev_targp, 1);
1161	if (mp->m_rtdev_targp)
1162		xfs_free_buftarg(mp->m_rtdev_targp, 1);
1163	xfs_free_buftarg(mp->m_ddev_targp, 0);
1164}
1165
1166STATIC void
1167xfs_unmountfs_wait(xfs_mount_t *mp)
1168{
1169	if (mp->m_logdev_targp != mp->m_ddev_targp)
1170		xfs_wait_buftarg(mp->m_logdev_targp);
1171	if (mp->m_rtdev_targp)
1172		xfs_wait_buftarg(mp->m_rtdev_targp);
1173	xfs_wait_buftarg(mp->m_ddev_targp);
1174}
1175
1176int
1177xfs_unmountfs_writesb(xfs_mount_t *mp)
1178{
1179	xfs_buf_t	*sbp;
1180	xfs_sb_t	*sb;
1181	int		error = 0;
1182
1183	/*
1184	 * skip superblock write if fs is read-only, or
1185	 * if we are doing a forced umount.
1186	 */
1187	sbp = xfs_getsb(mp, 0);
1188	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
1189		XFS_FORCED_SHUTDOWN(mp))) {
1190
1191		xfs_icsb_sync_counters(mp);
1192
1193		/*
1194		 * mark shared-readonly if desired
1195		 */
1196		sb = XFS_BUF_TO_SBP(sbp);
1197		if (mp->m_mk_sharedro) {
1198			if (!(sb->sb_flags & XFS_SBF_READONLY))
1199				sb->sb_flags |= XFS_SBF_READONLY;
1200			if (!XFS_SB_VERSION_HASSHARED(sb))
1201				XFS_SB_VERSION_ADDSHARED(sb);
1202			xfs_fs_cmn_err(CE_NOTE, mp,
1203				"Unmounting, marking shared read-only");
1204		}
1205		XFS_BUF_UNDONE(sbp);
1206		XFS_BUF_UNREAD(sbp);
1207		XFS_BUF_UNDELAYWRITE(sbp);
1208		XFS_BUF_WRITE(sbp);
1209		XFS_BUF_UNASYNC(sbp);
1210		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1211		xfsbdstrat(mp, sbp);
1212		/* Nevermind errors we might get here. */
1213		error = xfs_iowait(sbp);
1214		if (error)
1215			xfs_ioerror_alert("xfs_unmountfs_writesb",
1216					  mp, sbp, XFS_BUF_ADDR(sbp));
1217		if (error && mp->m_mk_sharedro)
1218			xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting.  Filesystem may not be marked shared readonly");
1219	}
1220	xfs_buf_relse(sbp);
1221	return error;
1222}
1223
1224/*
1225 * xfs_mod_sb() can be used to copy arbitrary changes to the
1226 * in-core superblock into the superblock buffer to be logged.
1227 * It does not provide the higher level of locking that is
1228 * needed to protect the in-core superblock from concurrent
1229 * access.
1230 */
1231void
1232xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1233{
1234	xfs_buf_t	*bp;
1235	int		first;
1236	int		last;
1237	xfs_mount_t	*mp;
1238	xfs_sb_t	*sbp;
1239	xfs_sb_field_t	f;
1240
1241	ASSERT(fields);
1242	if (!fields)
1243		return;
1244	mp = tp->t_mountp;
1245	bp = xfs_trans_getsb(tp, mp, 0);
1246	sbp = XFS_BUF_TO_SBP(bp);
1247	first = sizeof(xfs_sb_t);
1248	last = 0;
1249
1250	/* translate/copy */
1251
1252	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);
1253
1254	/* find modified range */
1255
1256	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1257	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1258	first = xfs_sb_info[f].offset;
1259
1260	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1261	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1262	last = xfs_sb_info[f + 1].offset - 1;
1263
1264	xfs_trans_log_buf(tp, bp, first, last);
1265}
/*
 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
 * a delta to a specified field in the in-core superblock.  Simply
 * switch on the field indicated and apply the delta to that field.
 * Fields are not allowed to dip below zero, so if the delta would
 * do this do not apply it and return an error: ENOSPC for the free
 * data block and free realtime extent counters, EINVAL for every
 * other field (and for a field this routine does not handle).
 *
 * For XFS_SBS_FDBLOCKS the reserved block pool (m_resblks /
 * m_resblks_avail) is taken into account: blocks being returned refill
 * the pool before any remainder is credited to sb_fdblocks, and when
 * 'rsvd' is non-zero a request that would take sb_fdblocks below zero
 * is satisfied from the pool instead.  'rsvd' is not consulted for any
 * other field.
 *
 * Returns 0 when the delta was applied.
 *
 * The SB_LOCK must be held when this routine is called.
 */
int
xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
			int delta, int rsvd)
{
	int		scounter;	/* short counter for 32 bit fields */
	long long	lcounter;	/* long counter for 64 bit fields */
	long long	res_used, rem;

	/*
	 * With the in-core superblock spin lock held, switch
	 * on the indicated field.  Apply the delta to the
	 * proper field.  If the field's value would dip below
	 * 0, then do not apply the delta and return an error.
	 */
	switch (field) {
	case XFS_SBS_ICOUNT:
		lcounter = (long long)mp->m_sb.sb_icount;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_icount = lcounter;
		return 0;
	case XFS_SBS_IFREE:
		lcounter = (long long)mp->m_sb.sb_ifree;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_ifree = lcounter;
		return 0;
	case XFS_SBS_FDBLOCKS:

		lcounter = (long long)mp->m_sb.sb_fdblocks;
		/* number of reserved blocks currently handed out */
		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);

		if (delta > 0) {		/* Putting blocks back */
			if (res_used > delta) {
				/* everything goes back into the reserve pool */
				mp->m_resblks_avail += delta;
			} else {
				/* top up the pool, credit the rest as free */
				rem = delta - res_used;
				mp->m_resblks_avail = mp->m_resblks;
				lcounter += rem;
			}
		} else {				/* Taking blocks away */

			lcounter += delta;

		/*
		 * If we're out of blocks, use any available reserved blocks
		 * if we're allowed to.
		 */

			if (lcounter < 0) {
				if (rsvd) {
					/* whole request comes from the pool */
					lcounter = (long long)mp->m_resblks_avail + delta;
					if (lcounter < 0) {
						return XFS_ERROR(ENOSPC);
					}
					mp->m_resblks_avail = lcounter;
					return 0;
				} else {	/* not reserved */
					return XFS_ERROR(ENOSPC);
				}
			}
		}

		mp->m_sb.sb_fdblocks = lcounter;
		return 0;
	case XFS_SBS_FREXTENTS:
		lcounter = (long long)mp->m_sb.sb_frextents;
		lcounter += delta;
		if (lcounter < 0) {
			return XFS_ERROR(ENOSPC);
		}
		mp->m_sb.sb_frextents = lcounter;
		return 0;
	case XFS_SBS_DBLOCKS:
		lcounter = (long long)mp->m_sb.sb_dblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_dblocks = lcounter;
		return 0;
	case XFS_SBS_AGCOUNT:
		scounter = mp->m_sb.sb_agcount;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_agcount = scounter;
		return 0;
	case XFS_SBS_IMAX_PCT:
		scounter = mp->m_sb.sb_imax_pct;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_imax_pct = scounter;
		return 0;
	case XFS_SBS_REXTSIZE:
		scounter = mp->m_sb.sb_rextsize;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextsize = scounter;
		return 0;
	case XFS_SBS_RBMBLOCKS:
		scounter = mp->m_sb.sb_rbmblocks;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rbmblocks = scounter;
		return 0;
	case XFS_SBS_RBLOCKS:
		lcounter = (long long)mp->m_sb.sb_rblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rblocks = lcounter;
		return 0;
	case XFS_SBS_REXTENTS:
		lcounter = (long long)mp->m_sb.sb_rextents;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextents = lcounter;
		return 0;
	case XFS_SBS_REXTSLOG:
		scounter = mp->m_sb.sb_rextslog;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextslog = scounter;
		return 0;
	default:
		/* field is not one this routine knows how to modify */
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}
}
1431
1432/*
1433 * xfs_mod_incore_sb() is used to change a field in the in-core
1434 * superblock structure by the specified delta.  This modification
1435 * is protected by the SB_LOCK.  Just use the xfs_mod_incore_sb_unlocked()
1436 * routine to do the work.
1437 */
1438int
1439xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
1440{
1441	unsigned long	s;
1442	int	status;
1443
1444	/* check for per-cpu counters */
1445	switch (field) {
1446#ifdef HAVE_PERCPU_SB
1447	case XFS_SBS_ICOUNT:
1448	case XFS_SBS_IFREE:
1449	case XFS_SBS_FDBLOCKS:
1450		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1451			status = xfs_icsb_modify_counters(mp, field,
1452							delta, rsvd);
1453			break;
1454		}
1455		/* FALLTHROUGH */
1456#endif
1457	default:
1458		s = XFS_SB_LOCK(mp);
1459		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1460		XFS_SB_UNLOCK(mp, s);
1461		break;
1462	}
1463
1464	return status;
1465}
1466
1467/*
1468 * xfs_mod_incore_sb_batch() is used to change more than one field
1469 * in the in-core superblock structure at a time.  This modification
1470 * is protected by a lock internal to this module.  The fields and
1471 * changes to those fields are specified in the array of xfs_mod_sb
1472 * structures passed in.
1473 *
1474 * Either all of the specified deltas will be applied or none of
1475 * them will.  If any modified field dips below 0, then all modifications
1476 * will be backed out and EINVAL will be returned.
1477 */
1478int
1479xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
1480{
1481	unsigned long	s;
1482	int		status=0;
1483	xfs_mod_sb_t	*msbp;
1484
1485	/*
1486	 * Loop through the array of mod structures and apply each
1487	 * individually.  If any fail, then back out all those
1488	 * which have already been applied.  Do all of this within
1489	 * the scope of the SB_LOCK so that all of the changes will
1490	 * be atomic.
1491	 */
1492	s = XFS_SB_LOCK(mp);
1493	msbp = &msb[0];
1494	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
1495		/*
1496		 * Apply the delta at index n.  If it fails, break
1497		 * from the loop so we'll fall into the undo loop
1498		 * below.
1499		 */
1500		switch (msbp->msb_field) {
1501#ifdef HAVE_PERCPU_SB
1502		case XFS_SBS_ICOUNT:
1503		case XFS_SBS_IFREE:
1504		case XFS_SBS_FDBLOCKS:
1505			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1506				status = xfs_icsb_modify_counters_locked(mp,
1507							msbp->msb_field,
1508							msbp->msb_delta, rsvd);
1509				break;
1510			}
1511			/* FALLTHROUGH */
1512#endif
1513		default:
1514			status = xfs_mod_incore_sb_unlocked(mp,
1515						msbp->msb_field,
1516						msbp->msb_delta, rsvd);
1517			break;
1518		}
1519
1520		if (status != 0) {
1521			break;
1522		}
1523	}
1524
1525	/*
1526	 * If we didn't complete the loop above, then back out
1527	 * any changes made to the superblock.  If you add code
1528	 * between the loop above and here, make sure that you
1529	 * preserve the value of status. Loop back until
1530	 * we step below the beginning of the array.  Make sure
1531	 * we don't touch anything back there.
1532	 */
1533	if (status != 0) {
1534		msbp--;
1535		while (msbp >= msb) {
1536			switch (msbp->msb_field) {
1537#ifdef HAVE_PERCPU_SB
1538			case XFS_SBS_ICOUNT:
1539			case XFS_SBS_IFREE:
1540			case XFS_SBS_FDBLOCKS:
1541				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1542					status =
1543					    xfs_icsb_modify_counters_locked(mp,
1544							msbp->msb_field,
1545							-(msbp->msb_delta),
1546							rsvd);
1547					break;
1548				}
1549				/* FALLTHROUGH */
1550#endif
1551			default:
1552				status = xfs_mod_incore_sb_unlocked(mp,
1553							msbp->msb_field,
1554							-(msbp->msb_delta),
1555							rsvd);
1556				break;
1557			}
1558			ASSERT(status == 0);
1559			msbp--;
1560		}
1561	}
1562	XFS_SB_UNLOCK(mp, s);
1563	return status;
1564}
1565
/*
 * xfs_getsb() is called to obtain the buffer for the superblock.
 *
 * In this implementation the superblock is read from the data device
 * with xfs_buf_read_flags() on every call (one sector-sized buffer at
 * XFS_SB_DADDR), the result is stored in mp->m_sb_bp, an extra hold is
 * taken with XFS_BUF_HOLD() and the buffer is returned.
 *
 * NOTE(review): the 'flags' argument is never read by this body, so a
 * trylock request (returning NULL when the buffer cannot be locked
 * without sleeping) is not honoured here - confirm callers do not rely
 * on it.  The result of xfs_buf_read_flags() is used without a NULL
 * check.
 */
xfs_buf_t *
xfs_getsb(
	xfs_mount_t	*mp,
	int		flags)
{
	xfs_buf_t	*bp;
	int		extra_flags = 0;
	unsigned int	sector_size;


	/* this value is overwritten by the read below */
	bp = mp->m_sb_bp;
	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
#ifdef NOT
	/* MANAGED bufs appear broken in FreeBSD
	 * but it's unclear if we need a persistent superblock?
	 * since we now translate the ondisk superblock to
	 * a separate translated structure and then translate that
	 * structure back when we want to write the superblock
	 */
	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
	extra_flags = XFS_BUF_MANAGE;
#endif

	mp->m_sb_bp = bp
	  = xfs_buf_read_flags(mp->m_ddev_targp,
			       XFS_SB_DADDR,
			       BTOBB(sector_size),
			       extra_flags);

	XFS_BUF_HOLD(bp);
	ASSERT(XFS_BUF_ISDONE(bp));
	/* with ASSERT compiled out, still report a buffer that is not done */
	if (!XFS_BUF_ISDONE(bp)){
		printf("xfs_getsb: %p bp flags 0x%x\n",bp,bp->b_flags);
	}
	return bp;
}
1611
1612/*
1613 * Used to free the superblock along various error paths.
1614 */
1615void
1616xfs_freesb(
1617	xfs_mount_t	*mp)
1618{
1619	xfs_buf_t	*bp;
1620
1621	/*
1622	 * Use xfs_getsb() so that the buffer will be locked
1623	 * when we call xfs_buf_relse().
1624	 */
1625	bp = xfs_getsb(mp, 0);
1626	XFS_BUF_UNMANAGE(bp);
1627	xfs_buf_relse(bp);
1628	mp->m_sb_bp = NULL;
1629}
1630
1631/*
1632 * See if the UUID is unique among mounted XFS filesystems.
1633 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1634 */
1635STATIC int
1636xfs_uuid_mount(
1637	xfs_mount_t	*mp)
1638{
1639	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1640		cmn_err(CE_WARN,
1641			"XFS: Filesystem %s has nil UUID - can't mount",
1642			mp->m_fsname);
1643		return -1;
1644	}
1645	if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1646		cmn_err(CE_WARN,
1647			"XFS: Filesystem %s has duplicate UUID - can't mount",
1648			mp->m_fsname);
1649		return -1;
1650	}
1651	return 0;
1652}
1653
/*
 * Remove filesystem from the UUID table.
 *
 * Counterpart of xfs_uuid_mount(): hands the superblock UUID of 'mp'
 * to uuid_table_remove().
 */
STATIC void
xfs_uuid_unmount(
	xfs_mount_t	*mp)
{
	uuid_table_remove(&mp->m_sb.sb_uuid);
}
1663
1664/*
1665 * Used to log changes to the superblock unit and width fields which could
1666 * be altered by the mount options. Only the first superblock is updated.
1667 */
1668STATIC void
1669xfs_mount_log_sbunit(
1670	xfs_mount_t	*mp,
1671	__int64_t	fields)
1672{
1673	xfs_trans_t	*tp;
1674
1675	ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
1676
1677	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1678	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1679				XFS_DEFAULT_LOG_COUNT)) {
1680		xfs_trans_cancel(tp, 0);
1681		return;
1682	}
1683	xfs_mod_sb(tp, fields);
1684	xfs_trans_commit(tp, 0, NULL);
1685}
1686
1687
1688#ifdef HAVE_PERCPU_SB
1689/*
1690 * Per-cpu incore superblock counters
1691 *
1692 * Simple concept, difficult implementation
1693 *
1694 * Basically, replace the incore superblock counters with a distributed per cpu
1695 * counter for contended fields (e.g.  free block count).
1696 *
1697 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
1698 * hence needs to be accurately read when we are running low on space. Hence
1699 * there is a method to enable and disable the per-cpu counters based on how
1700 * much "stuff" is available in them.
1701 *
1702 * Basically, a counter is enabled if there is enough free resource to justify
1703 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
1704 * ENOSPC), then we disable the counters to synchronise all callers and
1705 * re-distribute the available resources.
1706 *
1707 * If, once we redistributed the available resources, we still get a failure,
1708 * we disable the per-cpu counter and go through the slow path.
1709 *
1710 * The slow path is the current xfs_mod_incore_sb() function.  This means that
 * when we disable a per-cpu counter, we need to drain its resources back to
1712 * the global superblock. We do this after disabling the counter to prevent
1713 * more threads from queueing up on the counter.
1714 *
1715 * Essentially, this means that we still need a lock in the fast path to enable
1716 * synchronisation between the global counters and the per-cpu counters. This
1717 * is not a problem because the lock will be local to a CPU almost all the time
1718 * and have little contention except when we get to ENOSPC conditions.
1719 *
1720 * Basically, this lock becomes a barrier that enables us to lock out the fast
1721 * path while we do things like enabling and disabling counters and
1722 * synchronising the counters.
1723 *
1724 * Locking rules:
1725 *
1726 * 	1. XFS_SB_LOCK() before picking up per-cpu locks
1727 * 	2. per-cpu locks always picked up via for_each_online_cpu() order
1728 * 	3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
1729 * 	4. modifying per-cpu counters requires holding per-cpu lock
1730 * 	5. modifying global counters requires holding XFS_SB_LOCK
1731 *	6. enabling or disabling a counter requires holding the XFS_SB_LOCK
1732 *	   and _none_ of the per-cpu locks.
1733 *
1734 * Disabled counters are only ever re-enabled by a balance operation
1735 * that results in more free resources per CPU than a given threshold.
1736 * To ensure counters don't remain disabled, they are rebalanced when
1737 * the global resource goes above a higher threshold (i.e. some hysteresis
1738 * is present to prevent thrashing).
1739 */
1740
/*
 * hot-plug CPU notifier support.
 *
 * We cannot use the hotcpu_register() function because it does
 * not allow notifier instances. We need a notifier per filesystem
 * as we need to be able to identify the filesystem to balance
 * the counters out. This is achieved by having a notifier block
 * embedded in the xfs_mount_t and doing pointer magic to get the
 * mount pointer from the notifier block address.
 *
 * 'hcpu' carries the number of the CPU the event is about.  Actions
 * other than CPU_UP_PREPARE, CPU_ONLINE and CPU_DEAD are ignored.
 * Always returns NOTIFY_OK.
 */
STATIC int
xfs_icsb_cpu_notify(
	struct notifier_block *nfb,
	unsigned long action,
	void *hcpu)
{
	xfs_icsb_cnts_t *cntp;
	xfs_mount_t	*mp;
	int		s;

	/* recover the mount from the notifier block embedded in it */
	mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
	/* counter block belonging to the CPU this event refers to */
	cntp = (xfs_icsb_cnts_t *)
			per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
	switch (action) {
	case CPU_UP_PREPARE:
		/* Easy Case - initialize the area and locks, and
		 * then rebalance when online does everything else for us. */
		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
		break;
	case CPU_ONLINE:
		/* flags == 0: each balance call takes the SB_LOCK itself */
		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
		xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
		break;
	case CPU_DEAD:
		/* Disable all the counters, then fold the dead cpu's
		 * count into the total on the global superblock and
		 * re-enable the counters. */
		s = XFS_SB_LOCK(mp);
		xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
		xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
		xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);

		mp->m_sb.sb_icount += cntp->icsb_icount;
		mp->m_sb.sb_ifree += cntp->icsb_ifree;
		mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;

		/* the dead cpu's share has been folded in; clear it */
		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));

		/* SB_LOCK is already held, so tell the balancer not to take it */
		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED);
		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED);
		xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED);
		XFS_SB_UNLOCK(mp, s);
		break;
	}

	return NOTIFY_OK;
}
1799
1800int
1801xfs_icsb_init_counters(
1802	xfs_mount_t	*mp)
1803{
1804	xfs_icsb_cnts_t *cntp;
1805	int		i;
1806
1807	mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
1808	if (mp->m_sb_cnts == NULL)
1809		return -ENOMEM;
1810
1811	mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1812	mp->m_icsb_notifier.priority = 0;
1813	register_cpu_notifier(&mp->m_icsb_notifier);
1814
1815	for_each_online_cpu(i) {
1816		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1817		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1818	}
1819	/*
1820	 * start with all counters disabled so that the
1821	 * initial balance kicks us off correctly
1822	 */
1823	mp->m_icsb_counters = -1;
1824	return 0;
1825}
1826
1827STATIC void
1828xfs_icsb_destroy_counters(
1829	xfs_mount_t	*mp)
1830{
1831	if (mp->m_sb_cnts) {
1832		unregister_cpu_notifier(&mp->m_icsb_notifier);
1833		free_percpu(mp->m_sb_cnts);
1834	}
1835}
1836
1837STATIC inline void
1838xfs_icsb_lock_cntr(
1839	xfs_icsb_cnts_t	*icsbp)
1840{
1841	while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
1842		ndelay(1000);
1843	}
1844}
1845
/*
 * Release the lock bit taken by xfs_icsb_lock_cntr() on one per-cpu
 * counter block.
 */
STATIC inline void
xfs_icsb_unlock_cntr(
	xfs_icsb_cnts_t	*icsbp)
{
	clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
}
1852
1853
1854STATIC inline void
1855xfs_icsb_lock_all_counters(
1856	xfs_mount_t	*mp)
1857{
1858	xfs_icsb_cnts_t *cntp;
1859	int		i;
1860
1861	for_each_online_cpu(i) {
1862		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1863		xfs_icsb_lock_cntr(cntp);
1864	}
1865}
1866
1867STATIC inline void
1868xfs_icsb_unlock_all_counters(
1869	xfs_mount_t	*mp)
1870{
1871	xfs_icsb_cnts_t *cntp;
1872	int		i;
1873
1874	for_each_online_cpu(i) {
1875		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1876		xfs_icsb_unlock_cntr(cntp);
1877	}
1878}
1879
1880STATIC void
1881xfs_icsb_count(
1882	xfs_mount_t	*mp,
1883	xfs_icsb_cnts_t	*cnt,
1884	int		flags)
1885{
1886	xfs_icsb_cnts_t *cntp;
1887	int		i;
1888
1889	memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
1890
1891	if (!(flags & XFS_ICSB_LAZY_COUNT))
1892		xfs_icsb_lock_all_counters(mp);
1893
1894	for_each_online_cpu(i) {
1895		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1896		cnt->icsb_icount += cntp->icsb_icount;
1897		cnt->icsb_ifree += cntp->icsb_ifree;
1898		cnt->icsb_fdblocks += cntp->icsb_fdblocks;
1899	}
1900
1901	if (!(flags & XFS_ICSB_LAZY_COUNT))
1902		xfs_icsb_unlock_all_counters(mp);
1903}
1904
/*
 * Return non-zero when the per-cpu counter for 'field' is currently
 * disabled, i.e. its bit is set in mp->m_icsb_counters.  'field' must
 * lie between XFS_SBS_ICOUNT and XFS_SBS_FDBLOCKS inclusive.
 */
STATIC int
xfs_icsb_counter_disabled(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field)
{
	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
	return test_bit(field, &mp->m_icsb_counters);
}
1913
1914STATIC int
1915xfs_icsb_disable_counter(
1916	xfs_mount_t	*mp,
1917	xfs_sb_field_t	field)
1918{
1919	xfs_icsb_cnts_t	cnt;
1920
1921	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1922
1923	xfs_icsb_lock_all_counters(mp);
1924	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
1925		/* drain back to superblock */
1926
1927		xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
1928		switch(field) {
1929		case XFS_SBS_ICOUNT:
1930			mp->m_sb.sb_icount = cnt.icsb_icount;
1931			break;
1932		case XFS_SBS_IFREE:
1933			mp->m_sb.sb_ifree = cnt.icsb_ifree;
1934			break;
1935		case XFS_SBS_FDBLOCKS:
1936			mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1937			break;
1938		default:
1939			BUG();
1940		}
1941	}
1942
1943	xfs_icsb_unlock_all_counters(mp);
1944
1945	return 0;
1946}
1947
1948STATIC void
1949xfs_icsb_enable_counter(
1950	xfs_mount_t	*mp,
1951	xfs_sb_field_t	field,
1952	uint64_t	count,
1953	uint64_t	resid)
1954{
1955	xfs_icsb_cnts_t	*cntp;
1956	int		i;
1957
1958	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1959
1960	xfs_icsb_lock_all_counters(mp);
1961	for_each_online_cpu(i) {
1962		cntp = per_cpu_ptr(mp->m_sb_cnts, i);
1963		switch (field) {
1964		case XFS_SBS_ICOUNT:
1965			cntp->icsb_icount = count + resid;
1966			break;
1967		case XFS_SBS_IFREE:
1968			cntp->icsb_ifree = count + resid;
1969			break;
1970		case XFS_SBS_FDBLOCKS:
1971			cntp->icsb_fdblocks = count + resid;
1972			break;
1973		default:
1974			BUG();
1975			break;
1976		}
1977		resid = 0;
1978	}
1979	clear_bit(field, &mp->m_icsb_counters);
1980	xfs_icsb_unlock_all_counters(mp);
1981}
1982
1983STATIC void
1984xfs_icsb_sync_counters_int(
1985	xfs_mount_t	*mp,
1986	int		flags)
1987{
1988	xfs_icsb_cnts_t	cnt;
1989	int		s;
1990
1991	/* Pass 1: lock all counters */
1992	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
1993		s = XFS_SB_LOCK(mp);
1994
1995	xfs_icsb_count(mp, &cnt, flags);
1996
1997	/* Step 3: update mp->m_sb fields */
1998	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
1999		mp->m_sb.sb_icount = cnt.icsb_icount;
2000	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
2001		mp->m_sb.sb_ifree = cnt.icsb_ifree;
2002	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
2003		mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
2004
2005	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
2006		XFS_SB_UNLOCK(mp, s);
2007}
2008
/*
 * Accurate update of per-cpu counters to incore superblock.
 *
 * Calls xfs_icsb_sync_counters_int() with no flags, so the SB_LOCK and
 * the per-cpu counter locks are taken while the counters are summed.
 */
STATIC void
xfs_icsb_sync_counters(
	xfs_mount_t	*mp)
{
	xfs_icsb_sync_counters_int(mp, 0);
}
2018
/*
 * Lazy addition used for things like df, background sb syncs, etc.
 *
 * Calls xfs_icsb_sync_counters_int() with XFS_ICSB_LAZY_COUNT, so the
 * per-cpu counters are summed without taking the per-cpu locks; the
 * SB_LOCK is still taken around the update.
 */
void
xfs_icsb_sync_counters_lazy(
	xfs_mount_t	*mp)
{
	xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT);
}
2028
2029/*
2030 * Balance and enable/disable counters as necessary.
2031 *
2032 * Thresholds for re-enabling counters are somewhat magic.
2033 * inode counts are chosen to be the same number as single
2034 * on disk allocation chunk per CPU, and free blocks is
2035 * something far enough zero that we aren't going thrash
2036 * when we get near ENOSPC.
2037 */
2038#define XFS_ICSB_INO_CNTR_REENABLE	64
2039#define XFS_ICSB_FDBLK_CNTR_REENABLE	512
2040STATIC void
2041xfs_icsb_balance_counter(
2042	xfs_mount_t	*mp,
2043	xfs_sb_field_t  field,
2044	int		flags)
2045{
2046	uint64_t	count, resid = 0;
2047	int		weight = num_online_cpus();
2048	int		s;
2049
2050	if (!(flags & XFS_ICSB_SB_LOCKED))
2051		s = XFS_SB_LOCK(mp);
2052
2053	/* disable counter and sync counter */
2054	xfs_icsb_disable_counter(mp, field);
2055
2056	/* update counters  - first CPU gets residual*/
2057	switch (field) {
2058	case XFS_SBS_ICOUNT:
2059		count = mp->m_sb.sb_icount;
2060		resid = do_div(count, weight);
2061		if (count < XFS_ICSB_INO_CNTR_REENABLE)
2062			goto out;
2063		break;
2064	case XFS_SBS_IFREE:
2065		count = mp->m_sb.sb_ifree;
2066		resid = do_div(count, weight);
2067		if (count < XFS_ICSB_INO_CNTR_REENABLE)
2068			goto out;
2069		break;
2070	case XFS_SBS_FDBLOCKS:
2071		count = mp->m_sb.sb_fdblocks;
2072		resid = do_div(count, weight);
2073		if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
2074			goto out;
2075		break;
2076	default:
2077		BUG();
2078		break;
2079	}
2080
2081	xfs_icsb_enable_counter(mp, field, count, resid);
2082out:
2083	if (!(flags & XFS_ICSB_SB_LOCKED))
2084		XFS_SB_UNLOCK(mp, s);
2085}
2086
2087STATIC int
2088xfs_icsb_modify_counters_int(
2089	xfs_mount_t	*mp,
2090	xfs_sb_field_t	field,
2091	int		delta,
2092	int		rsvd,
2093	int		flags)
2094{
2095	xfs_icsb_cnts_t	*icsbp;
2096	long long	lcounter;	/* long counter for 64 bit fields */
2097	int		cpu, s, locked = 0;
2098	int		ret = 0, balance_done = 0;
2099
2100again:
2101	cpu = get_cpu();
2102	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu),
2103	xfs_icsb_lock_cntr(icsbp);
2104	if (unlikely(xfs_icsb_counter_disabled(mp, field)))
2105		goto slow_path;
2106
2107	switch (field) {
2108	case XFS_SBS_ICOUNT:
2109		lcounter = icsbp->icsb_icount;
2110		lcounter += delta;
2111		if (unlikely(lcounter < 0))
2112			goto slow_path;
2113		icsbp->icsb_icount = lcounter;
2114		break;
2115
2116	case XFS_SBS_IFREE:
2117		lcounter = icsbp->icsb_ifree;
2118		lcounter += delta;
2119		if (unlikely(lcounter < 0))
2120			goto slow_path;
2121		icsbp->icsb_ifree = lcounter;
2122		break;
2123
2124	case XFS_SBS_FDBLOCKS:
2125		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
2126
2127		lcounter = icsbp->icsb_fdblocks;
2128		lcounter += delta;
2129		if (unlikely(lcounter < 0))
2130			goto slow_path;
2131		icsbp->icsb_fdblocks = lcounter;
2132		break;
2133	default:
2134		BUG();
2135		break;
2136	}
2137	xfs_icsb_unlock_cntr(icsbp);
2138	put_cpu();
2139	if (locked)
2140		XFS_SB_UNLOCK(mp, s);
2141	return 0;
2142
2143	/*
2144	 * The slow path needs to be run with the SBLOCK
2145	 * held so that we prevent other threads from
2146	 * attempting to run this path at the same time.
2147	 * this provides exclusion for the balancing code,
2148	 * and exclusive fallback if the balance does not
2149	 * provide enough resources to continue in an unlocked
2150	 * manner.
2151	 */
2152slow_path:
2153	xfs_icsb_unlock_cntr(icsbp);
2154	put_cpu();
2155
2156	/* need to hold superblock incase we need
2157	 * to disable a counter */
2158	if (!(flags & XFS_ICSB_SB_LOCKED)) {
2159		s = XFS_SB_LOCK(mp);
2160		locked = 1;
2161		flags |= XFS_ICSB_SB_LOCKED;
2162	}
2163	if (!balance_done) {
2164		xfs_icsb_balance_counter(mp, field, flags);
2165		balance_done = 1;
2166		goto again;
2167	} else {
2168		/*
2169		 * we might not have enough on this local
2170		 * cpu to allocate for a bulk request.
2171		 * We need to drain this field from all CPUs
2172		 * and disable the counter fastpath
2173		 */
2174		xfs_icsb_disable_counter(mp, field);
2175	}
2176
2177	ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
2178
2179	if (locked)
2180		XFS_SB_UNLOCK(mp, s);
2181	return ret;
2182}
2183
/*
 * Apply 'delta' to a per-cpu counter when the caller does not hold the
 * superblock lock: calls xfs_icsb_modify_counters_int() with no flags
 * and returns its result.
 */
STATIC int
xfs_icsb_modify_counters(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		delta,
	int		rsvd)
{
	return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0);
}
2193
/*
 * Called when superblock is already locked.
 *
 * Same as xfs_icsb_modify_counters(), but passes XFS_ICSB_SB_LOCKED so
 * that xfs_icsb_modify_counters_int() does not try to take the
 * superblock lock again on its slow path.
 */
STATIC int
xfs_icsb_modify_counters_locked(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		delta,
	int		rsvd)
{
	return xfs_icsb_modify_counters_int(mp, field, delta,
						rsvd, XFS_ICSB_SB_LOCKED);
}
2207#endif
2208