/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"

STATIC void	xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
STATIC int	xfs_uuid_mount(xfs_mount_t *);
STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
STATIC void	xfs_unmountfs_wait(xfs_mount_t *);


#ifdef HAVE_PERCPU_SB
STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
						int, int);
STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
						int64_t, int);
STATIC int	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);

#else

#define xfs_icsb_destroy_counters(mp)			do { } while (0)
#define xfs_icsb_balance_counter(mp, a, b, c)		do { } while (0)
#define xfs_icsb_sync_counters(mp)			do { } while (0)
#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)

#endif

static const struct {
	short offset;
	short type;	/* 0 = integer
			 * 1 = binary / string (no translation)
			 */
} xfs_sb_info[] = {
    { offsetof(xfs_sb_t, sb_magicnum),   0 },
    { offsetof(xfs_sb_t, sb_blocksize),  0 },
    { offsetof(xfs_sb_t, sb_dblocks),    0 },
    { offsetof(xfs_sb_t, sb_rblocks),    0 },
    { offsetof(xfs_sb_t, sb_rextents),   0 },
    { offsetof(xfs_sb_t, sb_uuid),       1 },
    { offsetof(xfs_sb_t, sb_logstart),   0 },
    { offsetof(xfs_sb_t, sb_rootino),    0 },
    { offsetof(xfs_sb_t, sb_rbmino),     0 },
    { offsetof(xfs_sb_t, sb_rsumino),    0 },
    { offsetof(xfs_sb_t, sb_rextsize),   0 },
    { offsetof(xfs_sb_t, sb_agblocks),   0 },
    { offsetof(xfs_sb_t, sb_agcount),    0 },
    { offsetof(xfs_sb_t, sb_rbmblocks),  0 },
    { offsetof(xfs_sb_t, sb_logblocks),  0 },
    { offsetof(xfs_sb_t, sb_versionnum), 0 },
    { offsetof(xfs_sb_t, sb_sectsize),   0 },
    { offsetof(xfs_sb_t, sb_inodesize),  0 },
    { offsetof(xfs_sb_t, sb_inopblock),  0 },
    { offsetof(xfs_sb_t, sb_fname[0]),   1 },
    { offsetof(xfs_sb_t, sb_blocklog),   0 },
    { offsetof(xfs_sb_t, sb_sectlog),    0 },
    { offsetof(xfs_sb_t, sb_inodelog),   0 },
    { offsetof(xfs_sb_t, sb_inopblog),   0 },
    { offsetof(xfs_sb_t, sb_agblklog),   0 },
    { offsetof(xfs_sb_t, sb_rextslog),   0 },
    { offsetof(xfs_sb_t, sb_inprogress), 0 },
    { offsetof(xfs_sb_t, sb_imax_pct),   0 },
    { offsetof(xfs_sb_t, sb_icount),     0 },
    { offsetof(xfs_sb_t, sb_ifree),      0 },
    { offsetof(xfs_sb_t, sb_fdblocks),   0 },
    { offsetof(xfs_sb_t, sb_frextents),  0 },
    { offsetof(xfs_sb_t, sb_uquotino),   0 },
    { offsetof(xfs_sb_t, sb_gquotino),   0 },
    { offsetof(xfs_sb_t, sb_qflags),     0 },
    { offsetof(xfs_sb_t, sb_flags),      0 },
    { offsetof(xfs_sb_t, sb_shared_vn),  0 },
    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
    { offsetof(xfs_sb_t, sb_unit),	 0 },
    { offsetof(xfs_sb_t, sb_width),	 0 },
    { offsetof(xfs_sb_t, sb_dirblklog),	 0 },
    { offsetof(xfs_sb_t, sb_logsectlog), 0 },
    { offsetof(xfs_sb_t, sb_logsectsize),0 },
    { offsetof(xfs_sb_t, sb_logsunit),	 0 },
    { offsetof(xfs_sb_t, sb_features2),	 0 },
    { sizeof(xfs_sb_t),			 0 }
};
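
/*
 * The trailing { sizeof(xfs_sb_t), 0 } entry above is a sentinel: it
 * lets xfs_xlatesb() compute each field's on-disk size as the
 * difference between consecutive offsets in this table.
 */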

/*
 * Return a pointer to an initialized xfs_mount structure.
 */
xfs_mount_t *
xfs_mount_init(void)
{
	xfs_mount_t *mp;

	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);

	if (xfs_icsb_init_counters(mp)) {
		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
	}

	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
	spinlock_init(&mp->m_sb_lock, "xfs_sb");
	mutex_init(&mp->m_ilock);
	initnsema(&mp->m_growlock, 1, "xfs_grow");
	/*
	 * Initialize the AIL.
	 */
	xfs_trans_ail_init(mp);

	atomic_set(&mp->m_active_trans, 0);

	return mp;
}

/*
 * Free up the resources associated with a mount structure.  Assume that
 * the structure was initially zeroed, so we can tell which fields got
 * initialized.
 */
void
xfs_mount_free(
	xfs_mount_t	*mp,
	int		remove_bhv)
{
	if (mp->m_ihash)
		xfs_ihash_free(mp);
	if (mp->m_chash)
		xfs_chash_free(mp);

	if (mp->m_perag) {
		int	agno;

		for (agno = 0; agno < mp->m_maxagi; agno++)
			if (mp->m_perag[agno].pagb_list)
				kmem_free(mp->m_perag[agno].pagb_list,
						sizeof(xfs_perag_busy_t) *
							XFS_PAGB_NUM_SLOTS);
		kmem_free(mp->m_perag,
			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
	}

	AIL_LOCK_DESTROY(&mp->m_ail_lock);
	spinlock_destroy(&mp->m_sb_lock);
	mutex_destroy(&mp->m_ilock);
	freesema(&mp->m_growlock);
	if (mp->m_quotainfo)
		XFS_QM_DONE(mp);

	if (mp->m_fsname != NULL)
		kmem_free(mp->m_fsname, mp->m_fsname_len);
	if (mp->m_rtname != NULL)
		kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
	if (mp->m_logname != NULL)
		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);

	if (remove_bhv) {
		struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);

		bhv_remove_all_vfsops(vfsp, 0);
		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
	}

	xfs_icsb_destroy_counters(mp);
	kmem_free(mp, sizeof(xfs_mount_t));
}


/*
 * Check the validity of the SB found.
 */
STATIC int
xfs_mount_validate_sb(
	xfs_mount_t	*mp,
	xfs_sb_t	*sbp,
	int		flags)
{
	/*
	 * If the log device and data device have the
	 * same device number, the log is internal.
	 * Consequently, the sb_logstart should be non-zero.  If
	 * we have a zero sb_logstart in this case, we may be trying to mount
	 * a volume filesystem in a non-volume manner.
	 */
	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		xfs_fs_mount_cmn_err(flags, "bad magic number");
		return XFS_ERROR(EWRONGFS);
	}

	if (!XFS_SB_GOOD_VERSION(sbp)) {
		xfs_fs_mount_cmn_err(flags, "bad version");
		return XFS_ERROR(EWRONGFS);
	}

	if (unlikely(
	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
		xfs_fs_mount_cmn_err(flags,
			"filesystem is marked as having an external log; "
			"specify logdev on the\nmount command line.");
		return XFS_ERROR(EINVAL);
	}

	if (unlikely(
	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
		xfs_fs_mount_cmn_err(flags,
			"filesystem is marked as having an internal log; "
			"do not specify logdev on\nthe mount command line.");
		return XFS_ERROR(EINVAL);
	}

	/*
	 * More sanity checking. These were stolen directly from
	 * xfs_repair.
	 */
	if (unlikely(
	    sbp->sb_agcount <= 0					||
	    sbp->sb_sectsize < XFS_MIN_SECTORSIZE			||
	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
		xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Sanity check AG count, size fields against data size field
	 */
	if (unlikely(
	    sbp->sb_dblocks == 0 ||
	    sbp->sb_dblocks >
	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
		xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed");
		return XFS_ERROR(EFSCORRUPTED);
	}

	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
	ASSERT(sbp->sb_blocklog >= BBSHIFT);

#if XFS_BIG_BLKNOS         /* Limited by ULONG_MAX of page cache index */
	if (unlikely(
	    (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
	    (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
#else                  /* Limited by UINT_MAX of sectors */
	if (unlikely(
	    (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
	    (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
#endif
		xfs_fs_mount_cmn_err(flags,
			"file system too large to be mounted on this system.");
		return XFS_ERROR(E2BIG);
	}

	if (unlikely(sbp->sb_inprogress)) {
		xfs_fs_mount_cmn_err(flags, "file system busy");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Version 1 directory format has never worked on Linux.
	 */
	if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) {
		xfs_fs_mount_cmn_err(flags,
			"file system using version 1 directory format");
		return XFS_ERROR(ENOSYS);
	}

	/*
	 * Until this is fixed only page-sized or smaller data blocks work.
	 */
	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
		xfs_fs_mount_cmn_err(flags,
			"file system with blocksize %d bytes",
			sbp->sb_blocksize);
		xfs_fs_mount_cmn_err(flags,
			"only pagesize (%ld) or less will currently work.",
			PAGE_SIZE);
		return XFS_ERROR(ENOSYS);
	}

	return 0;
}

xfs_agnumber_t
xfs_initialize_perag(
	bhv_vfs_t	*vfs,
	xfs_mount_t	*mp,
	xfs_agnumber_t	agcount)
{
	xfs_agnumber_t	index, max_metadata;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;
	xfs_ino_t	ino;
	xfs_sb_t	*sbp = &mp->m_sb;
	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;

	/* Check to see if the filesystem can overflow 32 bit inodes */
	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);

	/* Set the mount flag if inode numbers on this filesystem can
	 * overflow 32 bits and 32-bit inodes were specifically requested;
	 * otherwise clear it.
	 */
	if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) {
		mp->m_flags |= XFS_MOUNT_32BITINODES;
	} else {
		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
	}

	/* If we can overflow then setup the ag headers accordingly */
	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
		/* Calculate how much should be reserved for inodes to
		 * meet the max inode percentage.
		 */
		if (mp->m_maxicount) {
			__uint64_t	icount;

			icount = sbp->sb_dblocks * sbp->sb_imax_pct;
			do_div(icount, 100);
			icount += sbp->sb_agblocks - 1;
			do_div(icount, sbp->sb_agblocks);
			max_metadata = icount;
		} else {
			max_metadata = agcount;
		}
		for (index = 0; index < agcount; index++) {
			ino = XFS_AGINO_TO_INO(mp, index, agino);
			if (ino > max_inum) {
				index++;
				break;
			}

			/* This ag is preferred for inodes */
			pag = &mp->m_perag[index];
			pag->pagi_inodeok = 1;
			if (index < max_metadata)
				pag->pagf_metadata = 1;
		}
	} else {
		/* Setup default behavior for smaller filesystems */
		for (index = 0; index < agcount; index++) {
			pag = &mp->m_perag[index];
			pag->pagi_inodeok = 1;
		}
	}
	return index;
}

/*
 * xfs_xlatesb
 *
 *     data       - on disk version of sb
 *     sb         - a superblock
 *     dir        - conversion direction: <0 - convert sb to buf
 *                                        >0 - convert buf to sb
 *     fields     - which fields to copy (bitmask)
 */
void
xfs_xlatesb(
	void		*data,
	xfs_sb_t	*sb,
	int		dir,
	__int64_t	fields)
{
	xfs_caddr_t	buf_ptr;
	xfs_caddr_t	mem_ptr;
	xfs_sb_field_t	f;
	int		first;
	int		size;

	ASSERT(dir);
	ASSERT(fields);

	if (!fields)
		return;

	buf_ptr = (xfs_caddr_t)data;
	mem_ptr = (xfs_caddr_t)sb;

	while (fields) {
		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
		first = xfs_sb_info[f].offset;
		size = xfs_sb_info[f + 1].offset - first;

		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);

		if (size == 1 || xfs_sb_info[f].type == 1) {
			if (dir > 0) {
				memcpy(mem_ptr + first, buf_ptr + first, size);
			} else {
				memcpy(buf_ptr + first, mem_ptr + first, size);
			}
		} else {
			switch (size) {
			case 2:
				INT_XLATE(*(__uint16_t*)(buf_ptr+first),
					  *(__uint16_t*)(mem_ptr+first),
					  dir, ARCH_CONVERT);
				break;
			case 4:
				INT_XLATE(*(__uint32_t*)(buf_ptr+first),
					  *(__uint32_t*)(mem_ptr+first),
					  dir, ARCH_CONVERT);
				break;
			case 8:
				INT_XLATE(*(__uint64_t*)(buf_ptr+first),
					  *(__uint64_t*)(mem_ptr+first),
					  dir, ARCH_CONVERT);
				break;
			default:
				ASSERT(0);
			}
		}

		fields &= ~(1LL << f);
	}
}
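
/*
 * Illustrative sketch (not a call made in this form anywhere in this
 * file): copying only the stripe unit field from the in-core superblock
 * out to the raw buffer would be
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), &mp->m_sb, -1, XFS_SB_UNIT);
 *
 * dir < 0 converts the in-core sb to the buffer, dir > 0 the reverse;
 * see xfs_readsb() and xfs_mod_sb() for the two real callers here.
 */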

/*
 * xfs_readsb
 *
 * Does the initial read of the superblock.
 */
int
xfs_readsb(xfs_mount_t *mp, int flags)
{
	unsigned int	sector_size;
	unsigned int	extra_flags;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;
	int		error;

	ASSERT(mp->m_sb_bp == NULL);
	ASSERT(mp->m_ddev_targp != NULL);

	/*
	 * Allocate a (locked) buffer to hold the superblock.
	 * This will be kept around at all times to optimize
	 * access to the superblock.
	 */
	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;

	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
				BTOBB(sector_size), extra_flags);
	if (!bp || XFS_BUF_ISERROR(bp)) {
		xfs_fs_mount_cmn_err(flags, "SB read failed");
		error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
		goto fail;
	}
	ASSERT(XFS_BUF_ISBUSY(bp));
	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);

	/*
	 * Initialize the mount structure from the superblock.
	 * But first do some basic consistency checking.
	 */
	sbp = XFS_BUF_TO_SBP(bp);
	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);

	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
	if (error) {
		xfs_fs_mount_cmn_err(flags, "SB validate failed");
		goto fail;
	}

	/*
	 * We must be able to do sector-sized and sector-aligned IO.
	 */
	if (sector_size > mp->m_sb.sb_sectsize) {
		xfs_fs_mount_cmn_err(flags,
			"device supports only %u byte sectors (not %u)",
			sector_size, mp->m_sb.sb_sectsize);
		error = ENOSYS;
		goto fail;
	}

	/*
	 * If device sector size is smaller than the superblock size,
	 * re-read the superblock so the buffer is correctly sized.
	 */
	if (sector_size < mp->m_sb.sb_sectsize) {
		XFS_BUF_UNMANAGE(bp);
		xfs_buf_relse(bp);
		sector_size = mp->m_sb.sb_sectsize;
		bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
					BTOBB(sector_size), extra_flags);
		if (!bp || XFS_BUF_ISERROR(bp)) {
			xfs_fs_mount_cmn_err(flags, "SB re-read failed");
			error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
			goto fail;
		}
		ASSERT(XFS_BUF_ISBUSY(bp));
		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
	}

	/* Initialize per-cpu counters */
	xfs_icsb_reinit_counters(mp);

	mp->m_sb_bp = bp;
	xfs_buf_relse(bp);
	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
	return 0;

 fail:
	if (bp) {
		XFS_BUF_UNMANAGE(bp);
		xfs_buf_relse(bp);
	}
	return error;
}


/*
 * xfs_mount_common
 *
 * Mount initialization code establishing various mount
 * fields from the superblock associated with the given
 * mount structure
 */
STATIC void
xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
{
	int	i;

	mp->m_agfrotor = mp->m_agirotor = 0;
	spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
	mp->m_maxagi = mp->m_sb.sb_agcount;
	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	mp->m_litino = sbp->sb_inodesize -
		((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
	mp->m_blockmask = sbp->sb_blocksize - 1;
	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
	mp->m_blockwmask = mp->m_blockwsize - 1;
	INIT_LIST_HEAD(&mp->m_del_inodes);

	/*
	 * Setup for attributes, in case they get created.
	 * This value is for inodes getting attributes for the first time,
	 * the per-inode value is for old attribute values.
	 */
	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
	switch (sbp->sb_inodesize) {
	case 256:
		mp->m_attroffset = XFS_LITINO(mp) -
				   XFS_BMDR_SPACE_CALC(MINABTPTRS);
		break;
	case 512:
	case 1024:
	case 2048:
		mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
		break;
	default:
		ASSERT(0);
	}
	ASSERT(mp->m_attroffset < XFS_LITINO(mp));

	for (i = 0; i < 2; i++) {
		mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_alloc, i == 0);
		mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_alloc, i == 0);
	}
	for (i = 0; i < 2; i++) {
		mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_bmbt, i == 0);
		mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_bmbt, i == 0);
	}
	for (i = 0; i < 2; i++) {
		mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
			xfs_inobt, i == 0);
		mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
			xfs_inobt, i == 0);
	}

	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
					sbp->sb_inopblock);
	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
}

/*
 * xfs_mountfs
 *
 * This function does the following on an initial mount of a file system:
 *	- reads the superblock from disk and init the mount struct
 *	- if we're a 32-bit kernel, do a size check on the superblock
 *		so we don't mount terabyte filesystems
 *	- init mount struct realtime fields
 *	- allocate inode hash table for fs
 *	- init directory manager
 *	- perform recovery and init the log manager
 */
int
xfs_mountfs(
	bhv_vfs_t	*vfsp,
	xfs_mount_t	*mp,
	int		mfsi_flags)
{
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp = &(mp->m_sb);
	xfs_inode_t	*rip;
	bhv_vnode_t	*rvp = NULL;
	int		readio_log, writeio_log;
	xfs_daddr_t	d;
	__uint64_t	ret64;
	__int64_t	update_flags;
	uint		quotamount, quotaflags;
	int		agno;
	int		uuid_mounted = 0;
	int		error = 0;

	if (mp->m_sb_bp == NULL) {
		if ((error = xfs_readsb(mp, mfsi_flags))) {
			return error;
		}
	}
	xfs_mount_common(mp, sbp);

	/*
	 * Check if sb_agblocks is aligned at stripe boundary
	 * If sb_agblocks is NOT aligned turn off m_dalign since
	 * allocator alignment is within an ag, therefore ag has
	 * to be aligned at stripe boundary.
	 */
	update_flags = 0LL;
	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
		/*
		 * If stripe unit and stripe width are not multiples
		 * of the fs blocksize turn off alignment.
		 */
		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
			if (mp->m_flags & XFS_MOUNT_RETERR) {
				cmn_err(CE_WARN,
					"XFS: alignment check 1 failed");
				error = XFS_ERROR(EINVAL);
				goto error1;
			}
			mp->m_dalign = mp->m_swidth = 0;
		} else {
			/*
			 * Convert the stripe unit and width to FSBs.
			 */
			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
				if (mp->m_flags & XFS_MOUNT_RETERR) {
					error = XFS_ERROR(EINVAL);
					goto error1;
				}
				xfs_fs_cmn_err(CE_WARN, mp,
"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
					mp->m_dalign, mp->m_swidth,
					sbp->sb_agblocks);

				mp->m_dalign = 0;
				mp->m_swidth = 0;
			} else if (mp->m_dalign) {
				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
			} else {
				if (mp->m_flags & XFS_MOUNT_RETERR) {
					xfs_fs_cmn_err(CE_WARN, mp,
"stripe alignment turned off: sunit(%d) less than bsize(%d)",
						mp->m_dalign,
						mp->m_blockmask + 1);
					error = XFS_ERROR(EINVAL);
					goto error1;
				}
				mp->m_swidth = 0;
			}
		}

		/*
		 * Update superblock with new values
		 * and log changes
		 */
		if (XFS_SB_VERSION_HASDALIGN(sbp)) {
			if (sbp->sb_unit != mp->m_dalign) {
				sbp->sb_unit = mp->m_dalign;
				update_flags |= XFS_SB_UNIT;
			}
			if (sbp->sb_width != mp->m_swidth) {
				sbp->sb_width = mp->m_swidth;
				update_flags |= XFS_SB_WIDTH;
			}
		}
	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
		    XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
			mp->m_dalign = sbp->sb_unit;
			mp->m_swidth = sbp->sb_width;
	}

	xfs_alloc_compute_maxlevels(mp);
	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
	xfs_ialloc_compute_maxlevels(mp);

	if (sbp->sb_imax_pct) {
		__uint64_t	icount;

		/* Make sure the maximum inode count is a multiple of the
		 * units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, mp->m_ialloc_blks);
		mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
				   sbp->sb_inopblog;
	} else
		mp->m_maxicount = 0;

	mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);

	/*
	 * XFS uses the uuid from the superblock as the unique
	 * identifier for fsid.  We can not use the uuid from the volume
	 * since a single partition filesystem is identical to a single
	 * partition volume/filesystem.
	 */
	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
	    (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
		if (xfs_uuid_mount(mp)) {
			error = XFS_ERROR(EINVAL);
			goto error1;
		}
		uuid_mounted = 1;
		ret64 = uuid_hash64(&sbp->sb_uuid);
		memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64));
	}

	/*
	 * Set the default minimum read and write sizes unless
	 * already specified in a mount option.
	 * We use smaller I/O sizes when the file system
	 * is being used for NFS service (wsync mount option).
	 */
	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
		if (mp->m_flags & XFS_MOUNT_WSYNC) {
			readio_log = XFS_WSYNC_READIO_LOG;
			writeio_log = XFS_WSYNC_WRITEIO_LOG;
		} else {
			readio_log = XFS_READIO_LOG_LARGE;
			writeio_log = XFS_WRITEIO_LOG_LARGE;
		}
	} else {
		readio_log = mp->m_readio_log;
		writeio_log = mp->m_writeio_log;
	}

	/*
	 * Set the number of readahead buffers to use based on
	 * physical memory size.
	 */
	if (xfs_physmem <= 4096)		/* <= 16MB */
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
	else if (xfs_physmem <= 8192)		/* <= 32MB */
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
	else
		mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
	if (sbp->sb_blocklog > readio_log) {
		mp->m_readio_log = sbp->sb_blocklog;
	} else {
		mp->m_readio_log = readio_log;
	}
	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
	if (sbp->sb_blocklog > writeio_log) {
		mp->m_writeio_log = sbp->sb_blocklog;
	} else {
		mp->m_writeio_log = writeio_log;
	}
	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);

	/*
	 * Set the inode cluster size based on the physical memory
	 * size.  This may still be overridden by the file system
	 * block size if it is larger than the chosen cluster size.
	 */
	if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
		mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
	} else {
		mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
	}
	/*
	 * Set whether we're using inode alignment.
	 */
	if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
	    mp->m_sb.sb_inoalignmt >=
	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
	else
		mp->m_inoalign_mask = 0;
	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && mp->m_inoalign_mask &&
	    !(mp->m_dalign & mp->m_inoalign_mask))
		mp->m_sinoalign = mp->m_dalign;
	else
		mp->m_sinoalign = 0;
	/*
	 * Check that the data (and log if separate) are an ok size.
	 */
	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
		cmn_err(CE_WARN, "XFS: size check 1 failed");
		error = XFS_ERROR(E2BIG);
		goto error1;
	}
	error = xfs_read_buf(mp, mp->m_ddev_targp,
			     d - XFS_FSS_TO_BB(mp, 1),
			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
	if (!error) {
		xfs_buf_relse(bp);
	} else {
		cmn_err(CE_WARN, "XFS: size check 2 failed");
		if (error == ENOSPC) {
			error = XFS_ERROR(E2BIG);
		}
		goto error1;
	}

	if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
	    mp->m_logdev_targp != mp->m_ddev_targp) {
		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
			cmn_err(CE_WARN, "XFS: size check 3 failed");
			error = XFS_ERROR(E2BIG);
			goto error1;
		}
		error = xfs_read_buf(mp, mp->m_logdev_targp,
				     d - XFS_FSB_TO_BB(mp, 1),
				     XFS_FSB_TO_BB(mp, 1), 0, &bp);
		if (!error) {
			xfs_buf_relse(bp);
		} else {
			cmn_err(CE_WARN, "XFS: size check 3 failed");
			if (error == ENOSPC) {
				error = XFS_ERROR(E2BIG);
			}
			goto error1;
		}
	}

	/*
	 * Initialize realtime fields in the mount structure
	 */
	if ((error = xfs_rtmount_init(mp))) {
		cmn_err(CE_WARN, "XFS: RT mount failed");
		goto error1;
	}

	/*
	 * For client case we are done now
	 */
	if (mfsi_flags & XFS_MFSI_CLIENT) {
		return 0;
	}

	/*
	 *  Copies the low order bits of the timestamp and the randomly
	 *  set "sequence" number out of a UUID.
	 */
	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);

	/*
	 *  The vfs structure needs to have a file system independent
	 *  way of checking for the invariant file system ID.  Since it
	 *  can't look at mount structures it has a pointer to the data
	 *  in the mount structure.
	 *
	 *  File systems that don't support user level file handles (i.e.
	 *  all of them except for XFS) will leave vfs_altfsid as NULL.
	 */
	vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
	mp->m_dmevmask = 0;	/* not persistent; set after each mount */

	xfs_dir_mount(mp);

	/*
	 * Initialize the attribute manager's entries.
	 */
	mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;

	/*
	 * Initialize the precomputed transaction reservations values.
	 */
	xfs_trans_init(mp);

	/*
	 * Allocate and initialize the inode hash table for this
	 * file system.
	 */
	xfs_ihash_init(mp);
	xfs_chash_init(mp);

	/*
	 * Allocate and initialize the per-ag data.
	 */
	init_rwsem(&mp->m_peraglock);
	mp->m_perag =
		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);

	mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount);

	/*
	 * log's mount-time initialization. Perform 1st part recovery if needed
	 */
	if (likely(sbp->sb_logblocks > 0)) {	/* check for volume case */
		error = xfs_log_mount(mp, mp->m_logdev_targp,
				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
		if (error) {
			cmn_err(CE_WARN, "XFS: log mount failed");
			goto error2;
		}
	} else {	/* No log has been defined */
		cmn_err(CE_WARN, "XFS: no log defined");
		XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
		error = XFS_ERROR(EFSCORRUPTED);
		goto error2;
	}

	/*
	 * Get and sanity-check the root inode.
	 * Save the pointer to it in the mount structure.
	 */
	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
	if (error) {
		cmn_err(CE_WARN, "XFS: failed to read root inode");
		goto error3;
	}

	ASSERT(rip != NULL);
	rvp = XFS_ITOV(rip);

	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
		cmn_err(CE_WARN, "XFS: corrupted root inode");
		cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
			XFS_BUFTARG_NAME(mp->m_ddev_targp),
			(unsigned long long)rip->i_ino);
		xfs_iunlock(rip, XFS_ILOCK_EXCL);
		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
				 mp);
		error = XFS_ERROR(EFSCORRUPTED);
		goto error4;
	}
	mp->m_rootip = rip;	/* save it */

	xfs_iunlock(rip, XFS_ILOCK_EXCL);

	/*
	 * Initialize realtime inode pointers in the mount structure
	 */
	if ((error = xfs_rtmount_inodes(mp))) {
		/*
		 * Free up the root inode.
		 */
		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
		goto error4;
	}

	/*
	 * If fs is not mounted readonly, then update the superblock
	 * unit and width changes.
	 */
	if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
		xfs_mount_log_sbunit(mp, update_flags);

	/*
	 * Initialise the XFS quota management subsystem for this mount
	 */
	if ((error = XFS_QM_INIT(mp, &quotamount, &quotaflags)))
		goto error4;

	/*
	 * Finish recovering the file system.  This part needed to be
	 * delayed until after the root and real-time bitmap inodes
	 * were consistently read in.
	 */
	error = xfs_log_mount_finish(mp, mfsi_flags);
	if (error) {
		cmn_err(CE_WARN, "XFS: log mount finish failed");
		goto error4;
	}

	/*
	 * Complete the quota initialisation, post-log-replay component.
	 */
	if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
		goto error4;

	return 0;

 error4:
	/*
	 * Free up the root inode.
	 */
	VN_RELE(rvp);
 error3:
	xfs_log_unmount_dealloc(mp);
 error2:
	xfs_ihash_free(mp);
	xfs_chash_free(mp);
	for (agno = 0; agno < sbp->sb_agcount; agno++)
		if (mp->m_perag[agno].pagb_list)
			kmem_free(mp->m_perag[agno].pagb_list,
			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
	mp->m_perag = NULL;
	/* FALLTHROUGH */
 error1:
	if (uuid_mounted)
		xfs_uuid_unmount(mp);
	xfs_freesb(mp);
	return error;
}

/*
 * xfs_unmountfs
 *
 * This flushes out the inodes, dquots and the superblock, unmounts the
 * log and makes sure that incore structures are freed.
 */
int
xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
{
	struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);
#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
	int64_t		fsid;
#endif

	xfs_iflush_all(mp);

	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);

	/*
	 * Flush out the log synchronously so that we know for sure
	 * that nothing is pinned.  This is important because bflush()
	 * will skip pinned buffers.
	 */
	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);

	xfs_binval(mp->m_ddev_targp);
	if (mp->m_rtdev_targp) {
		xfs_binval(mp->m_rtdev_targp);
	}

	xfs_unmountfs_writesb(mp);

	xfs_unmountfs_wait(mp);			/* wait for async bufs */

	xfs_log_unmount(mp);			/* Done! No more fs ops. */

	xfs_freesb(mp);

	/*
	 * All inodes from this mount point should be freed.
	 */
	ASSERT(mp->m_inodes == NULL);

	xfs_unmountfs_close(mp, cr);
	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
		xfs_uuid_unmount(mp);

#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
	/*
	 * clear all error tags on this filesystem
	 */
	memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t));
	xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
#endif
	XFS_IODONE(vfsp);
	xfs_mount_free(mp, 1);
	return 0;
}

void
xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
{
	if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_free_buftarg(mp->m_logdev_targp, 1);
	if (mp->m_rtdev_targp)
		xfs_free_buftarg(mp->m_rtdev_targp, 1);
	xfs_free_buftarg(mp->m_ddev_targp, 0);
}

STATIC void
xfs_unmountfs_wait(xfs_mount_t *mp)
{
	if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_wait_buftarg(mp->m_logdev_targp);
	if (mp->m_rtdev_targp)
		xfs_wait_buftarg(mp->m_rtdev_targp);
	xfs_wait_buftarg(mp->m_ddev_targp);
}

int
xfs_unmountfs_writesb(xfs_mount_t *mp)
{
	xfs_buf_t	*sbp;
	xfs_sb_t	*sb;
	int		error = 0;

	/*
	 * skip superblock write if fs is read-only, or
	 * if we are doing a forced umount.
	 */
	sbp = xfs_getsb(mp, 0);
	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
		XFS_FORCED_SHUTDOWN(mp))) {

		xfs_icsb_sync_counters(mp);

		/*
		 * mark shared-readonly if desired
		 */
		sb = XFS_BUF_TO_SBP(sbp);
		if (mp->m_mk_sharedro) {
			if (!(sb->sb_flags & XFS_SBF_READONLY))
				sb->sb_flags |= XFS_SBF_READONLY;
			if (!XFS_SB_VERSION_HASSHARED(sb))
				XFS_SB_VERSION_ADDSHARED(sb);
			xfs_fs_cmn_err(CE_NOTE, mp,
				"Unmounting, marking shared read-only");
		}
		XFS_BUF_UNDONE(sbp);
		XFS_BUF_UNREAD(sbp);
		XFS_BUF_UNDELAYWRITE(sbp);
		XFS_BUF_WRITE(sbp);
		XFS_BUF_UNASYNC(sbp);
		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
		xfsbdstrat(mp, sbp);
		/* Any I/O error is picked up by the xfs_iowait() below. */
		error = xfs_iowait(sbp);
		if (error)
			xfs_ioerror_alert("xfs_unmountfs_writesb",
					  mp, sbp, XFS_BUF_ADDR(sbp));
		if (error && mp->m_mk_sharedro)
			xfs_fs_cmn_err(CE_ALERT, mp,
				"Superblock write error detected while "
				"unmounting.  Filesystem may not be "
				"marked shared readonly");
	}
	xfs_buf_relse(sbp);
	return error;
}

/*
 * xfs_mod_sb() can be used to copy arbitrary changes to the
 * in-core superblock into the superblock buffer to be logged.
 * It does not provide the higher level of locking that is
 * needed to protect the in-core superblock from concurrent
 * access.
 */
void
xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
{
	xfs_buf_t	*bp;
	int		first;
	int		last;
	xfs_mount_t	*mp;
	xfs_sb_t	*sbp;
	xfs_sb_field_t	f;

	ASSERT(fields);
	if (!fields)
		return;
	mp = tp->t_mountp;
	bp = xfs_trans_getsb(tp, mp, 0);
	sbp = XFS_BUF_TO_SBP(bp);
	first = sizeof(xfs_sb_t);
	last = 0;

	/* translate/copy */
	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);

	/* find modified range */
	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
	first = xfs_sb_info[f].offset;

	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
	last = xfs_sb_info[f + 1].offset - 1;

	xfs_trans_log_buf(tp, bp, first, last);
}
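
/*
 * Illustrative sketch (hypothetical caller): a transaction that has
 * already updated an in-core field, say mp->m_sb.sb_unit, would log
 * the change with
 *
 *	xfs_mod_sb(tp, XFS_SB_UNIT);
 *
 * which is what xfs_mount_log_sbunit() below does for the unit and
 * width fields.
 */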


/*
 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
 * a delta to a specified field in the in-core superblock.  Simply
 * switch on the field indicated and apply the delta to that field.
 * Fields are not allowed to dip below zero, so if the delta would
 * do this do not apply it and return EINVAL.
 *
 * The SB_LOCK must be held when this routine is called.
 */
int
xfs_mod_incore_sb_unlocked(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int64_t		delta,
	int		rsvd)
{
	int		scounter;	/* short counter for 32 bit fields */
	long long	lcounter;	/* long counter for 64 bit fields */
	long long	res_used, rem;

	/*
	 * With the in-core superblock spin lock held, switch
	 * on the indicated field.  Apply the delta to the
	 * proper field.  If the field's value would dip below
	 * 0, then do not apply the delta and return EINVAL.
	 */
	switch (field) {
	case XFS_SBS_ICOUNT:
		lcounter = (long long)mp->m_sb.sb_icount;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_icount = lcounter;
		return 0;
	case XFS_SBS_IFREE:
		lcounter = (long long)mp->m_sb.sb_ifree;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_ifree = lcounter;
		return 0;
	case XFS_SBS_FDBLOCKS:
		lcounter = (long long)
			mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);

		if (delta > 0) {		/* Putting blocks back */
			if (res_used > delta) {
				mp->m_resblks_avail += delta;
			} else {
				rem = delta - res_used;
				mp->m_resblks_avail = mp->m_resblks;
				lcounter += rem;
			}
		} else {			/* Taking blocks away */

			lcounter += delta;

			/*
			 * If we're out of blocks, use any available
			 * reserved blocks if we're allowed to.
			 */

			if (lcounter < 0) {
				if (rsvd) {
					lcounter = (long long)mp->m_resblks_avail + delta;
					if (lcounter < 0) {
						return XFS_ERROR(ENOSPC);
					}
					mp->m_resblks_avail = lcounter;
					return 0;
				} else {	/* not reserved */
					return XFS_ERROR(ENOSPC);
				}
			}
		}

		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
		return 0;
	case XFS_SBS_FREXTENTS:
		lcounter = (long long)mp->m_sb.sb_frextents;
		lcounter += delta;
		if (lcounter < 0) {
			return XFS_ERROR(ENOSPC);
		}
		mp->m_sb.sb_frextents = lcounter;
		return 0;
	case XFS_SBS_DBLOCKS:
		lcounter = (long long)mp->m_sb.sb_dblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_dblocks = lcounter;
		return 0;
	case XFS_SBS_AGCOUNT:
		scounter = mp->m_sb.sb_agcount;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_agcount = scounter;
		return 0;
	case XFS_SBS_IMAX_PCT:
		scounter = mp->m_sb.sb_imax_pct;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_imax_pct = scounter;
		return 0;
	case XFS_SBS_REXTSIZE:
		scounter = mp->m_sb.sb_rextsize;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextsize = scounter;
		return 0;
	case XFS_SBS_RBMBLOCKS:
		scounter = mp->m_sb.sb_rbmblocks;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rbmblocks = scounter;
		return 0;
	case XFS_SBS_RBLOCKS:
		lcounter = (long long)mp->m_sb.sb_rblocks;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rblocks = lcounter;
		return 0;
	case XFS_SBS_REXTENTS:
		lcounter = (long long)mp->m_sb.sb_rextents;
		lcounter += delta;
		if (lcounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextents = lcounter;
		return 0;
	case XFS_SBS_REXTSLOG:
		scounter = mp->m_sb.sb_rextslog;
		scounter += delta;
		if (scounter < 0) {
			ASSERT(0);
			return XFS_ERROR(EINVAL);
		}
		mp->m_sb.sb_rextslog = scounter;
		return 0;
	default:
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}
}

/*
 * xfs_mod_incore_sb() is used to change a field in the in-core
 * superblock structure by the specified delta.  This modification
 * is protected by the SB_LOCK.  Just use the xfs_mod_incore_sb_unlocked()
 * routine to do the work.
 */
int
xfs_mod_incore_sb(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int64_t		delta,
	int		rsvd)
{
	unsigned long	s;
	int	status;

	/* check for per-cpu counters */
	switch (field) {
#ifdef HAVE_PERCPU_SB
	case XFS_SBS_ICOUNT:
	case XFS_SBS_IFREE:
	case XFS_SBS_FDBLOCKS:
		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
			status = xfs_icsb_modify_counters(mp, field,
							delta, rsvd);
			break;
		}
		/* FALLTHROUGH */
#endif
	default:
		s = XFS_SB_LOCK(mp);
		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
		XFS_SB_UNLOCK(mp, s);
		break;
	}

	return status;
}
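
/*
 * Illustrative sketch (hypothetical caller): returning nblks freed data
 * blocks to the filesystem would look like
 *
 *	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)nblks, 0);
 *
 * A negative delta takes blocks away instead, dipping into the reserved
 * pool only when rsvd is non-zero, as implemented above.
 */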

/*
 * xfs_mod_incore_sb_batch() is used to change more than one field
 * in the in-core superblock structure at a time.  This modification
 * is protected by a lock internal to this module.  The fields and
 * changes to those fields are specified in the array of xfs_mod_sb
 * structures passed in.
 *
 * Either all of the specified deltas will be applied or none of
 * them will.  If any modified field dips below 0, then all modifications
 * will be backed out and EINVAL will be returned.
 */
int
xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
{
	unsigned long	s;
	int		status = 0;
	xfs_mod_sb_t	*msbp;

	/*
	 * Loop through the array of mod structures and apply each
	 * individually.  If any fail, then back out all those
	 * which have already been applied.  Do all of this within
	 * the scope of the SB_LOCK so that all of the changes will
	 * be atomic.
	 */
	s = XFS_SB_LOCK(mp);
	for (msbp = &msb[0]; msbp < (msb + nmsb); msbp++) {
		/*
		 * Apply the delta at index n.  If it fails, break
		 * from the loop so we'll fall into the undo loop
		 * below.
		 */
		switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
		case XFS_SBS_ICOUNT:
		case XFS_SBS_IFREE:
		case XFS_SBS_FDBLOCKS:
			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
				XFS_SB_UNLOCK(mp, s);
				status = xfs_icsb_modify_counters(mp,
							msbp->msb_field,
							msbp->msb_delta, rsvd);
				s = XFS_SB_LOCK(mp);
				break;
			}
			/* FALLTHROUGH */
#endif
		default:
			status = xfs_mod_incore_sb_unlocked(mp,
						msbp->msb_field,
						msbp->msb_delta, rsvd);
			break;
		}

		if (status != 0) {
			break;
		}
	}

	/*
	 * If we didn't complete the loop above, then back out
	 * any changes made to the superblock.  If you add code
	 * between the loop above and here, make sure that you
	 * preserve the value of status. Loop back until
	 * we step below the beginning of the array.  Make sure
	 * we don't touch anything back there.
	 */
	if (status != 0) {
		msbp--;
		while (msbp >= msb) {
			switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
			case XFS_SBS_ICOUNT:
			case XFS_SBS_IFREE:
			case XFS_SBS_FDBLOCKS:
				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
					XFS_SB_UNLOCK(mp, s);
					status = xfs_icsb_modify_counters(mp,
							msbp->msb_field,
							-(msbp->msb_delta),
							rsvd);
					s = XFS_SB_LOCK(mp);
					break;
				}
				/* FALLTHROUGH */
#endif
			default:
				status = xfs_mod_incore_sb_unlocked(mp,
							msbp->msb_field,
							-(msbp->msb_delta),
							rsvd);
				break;
			}
			ASSERT(status == 0);
			msbp--;
		}
	}
	XFS_SB_UNLOCK(mp, s);
	return status;
}
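
/*
 * Illustrative sketch (hypothetical caller): accounting a freshly
 * allocated inode chunk could batch two deltas so they apply atomically:
 *
 *	xfs_mod_sb_t	msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_ICOUNT;
 *	msb[0].msb_delta = XFS_INODES_PER_CHUNK;
 *	msb[1].msb_field = XFS_SBS_IFREE;
 *	msb[1].msb_delta = XFS_INODES_PER_CHUNK;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
 *
 * Either both deltas are applied or, on failure, both are backed out.
 */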

/*
 * xfs_getsb() is called to obtain the buffer for the superblock.
 * The buffer is returned locked and read in from disk.
 * The buffer should be released with a call to xfs_buf_relse().
 *
 * If the flags parameter is XFS_BUF_TRYLOCK, then we'll only return
 * the superblock buffer if it can be locked without sleeping.
 * If it can't then we'll return NULL.
 */
xfs_buf_t *
xfs_getsb(
	xfs_mount_t	*mp,
	int		flags)
{
	xfs_buf_t	*bp;

	ASSERT(mp->m_sb_bp != NULL);
	bp = mp->m_sb_bp;
	if (flags & XFS_BUF_TRYLOCK) {
		if (!XFS_BUF_CPSEMA(bp)) {
			return NULL;
		}
	} else {
		XFS_BUF_PSEMA(bp, PRIBIO);
	}
	XFS_BUF_HOLD(bp);
	ASSERT(XFS_BUF_ISDONE(bp));
	return bp;
}
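
/*
 * Illustrative sketch: a caller that must not sleep passes
 * XFS_BUF_TRYLOCK and handles a NULL return:
 *
 *	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
 *	if (bp == NULL)
 *		return;
 *
 * dropping the hold with xfs_buf_relse(bp) once it is done with the
 * buffer.
 */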

/*
 * Used to free the superblock along various error paths.
 */
void
xfs_freesb(
	xfs_mount_t	*mp)
{
	xfs_buf_t	*bp;

	/*
	 * Use xfs_getsb() so that the buffer will be locked
	 * when we call xfs_buf_relse().
	 */
	bp = xfs_getsb(mp, 0);
	XFS_BUF_UNMANAGE(bp);
	xfs_buf_relse(bp);
	mp->m_sb_bp = NULL;
}

/*
 * See if the UUID is unique among mounted XFS filesystems.
 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
 */
STATIC int
xfs_uuid_mount(
	xfs_mount_t	*mp)
{
	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
		cmn_err(CE_WARN,
			"XFS: Filesystem %s has nil UUID - can't mount",
			mp->m_fsname);
		return -1;
	}
	if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
		cmn_err(CE_WARN,
			"XFS: Filesystem %s has duplicate UUID - can't mount",
			mp->m_fsname);
		return -1;
	}
	return 0;
}

/*
 * Remove filesystem from the UUID table.
 */
STATIC void
xfs_uuid_unmount(
	xfs_mount_t	*mp)
{
	uuid_table_remove(&mp->m_sb.sb_uuid);
}

/*
 * Used to log changes to the superblock unit and width fields which could
 * be altered by the mount options. Only the first superblock is updated.
 */
STATIC void
xfs_mount_log_sbunit(
	xfs_mount_t	*mp,
	__int64_t	fields)
{
	xfs_trans_t	*tp;

	ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
				XFS_DEFAULT_LOG_COUNT)) {
		xfs_trans_cancel(tp, 0);
		return;
	}
	xfs_mod_sb(tp, fields);
	xfs_trans_commit(tp, 0);
}


#ifdef HAVE_PERCPU_SB
/*
 * Per-cpu incore superblock counters
 *
 * Simple concept, difficult implementation
 *
 * Basically, replace the incore superblock counters with a distributed per cpu
 * counter for contended fields (e.g.  free block count).
 *
 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
 * hence needs to be accurately read when we are running low on space. Hence
 * there is a method to enable and disable the per-cpu counters based on how
 * much "stuff" is available in them.
 *
 * Basically, a counter is enabled if there is enough free resource to justify
 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
 * ENOSPC), then we disable the counters to synchronise all callers and
 * re-distribute the available resources.
 *
 * If, once we redistributed the available resources, we still get a failure,
 * we disable the per-cpu counter and go through the slow path.
 *
 * The slow path is the current xfs_mod_incore_sb() function.  This means that
 * when we disable a per-cpu counter, we need to drain its resources back to
 * the global superblock. We do this after disabling the counter to prevent
 * more threads from queueing up on the counter.
 *
 * Essentially, this means that we still need a lock in the fast path to enable
 * synchronisation between the global counters and the per-cpu counters. This
 * is not a problem because the lock will be local to a CPU almost all the time
 * and have little contention except when we get to ENOSPC conditions.
 *
 * Basically, this lock becomes a barrier that enables us to lock out the fast
 * path while we do things like enabling and disabling counters and
 * synchronising the counters.
 *
 * Locking rules:
 *
 * 	1. XFS_SB_LOCK() before picking up per-cpu locks
 * 	2. per-cpu locks always picked up via for_each_online_cpu() order
 * 	3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
 * 	4. modifying per-cpu counters requires holding per-cpu lock
 * 	5. modifying global counters requires holding XFS_SB_LOCK
 *	6. enabling or disabling a counter requires holding the XFS_SB_LOCK
 *	   and _none_ of the per-cpu locks.
 *
 * Disabled counters are only ever re-enabled by a balance operation
 * that results in more free resources per CPU than a given threshold.
 * To ensure counters don't remain disabled, they are rebalanced when
 * the global resource goes above a higher threshold (i.e. some hysteresis
 * is present to prevent thrashing).
 */
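
/*
 * A minimal sketch of the disable/drain/re-enable sequence described
 * above, using the helpers defined below (this is the pattern the CPU
 * hotplug notifier follows):
 *
 *	xfs_icsb_lock(mp);
 *	s = XFS_SB_LOCK(mp);
 *	xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
 *		-- folds the per-cpu counts back into mp->m_sb.sb_fdblocks
 *	xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED, 0);
 *		-- redistributes the resource, re-enabling the counter when
 *		   there is enough of it to justify the per-cpu fast path
 *	XFS_SB_UNLOCK(mp, s);
 *	xfs_icsb_unlock(mp);
 */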
1712
1713#ifdef CONFIG_HOTPLUG_CPU
1714/*
1715 * hot-plug CPU notifier support.
1716 *
1717 * We need a notifier per filesystem as we need to be able to identify
1718 * the filesystem to balance the counters out. This is achieved by
1719 * having a notifier block embedded in the xfs_mount_t and doing pointer
1720 * magic to get the mount pointer from the notifier block address.
1721 */
1722STATIC int
1723xfs_icsb_cpu_notify(
1724	struct notifier_block *nfb,
1725	unsigned long action,
1726	void *hcpu)
1727{
1728	xfs_icsb_cnts_t *cntp;
1729	xfs_mount_t	*mp;
1730	int		s;
1731
1732	mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
1733	cntp = (xfs_icsb_cnts_t *)
1734			per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
1735	switch (action) {
1736	case CPU_UP_PREPARE:
1737	case CPU_UP_PREPARE_FROZEN:
1738		/* Easy Case - initialize the area and locks, and
1739		 * then rebalance when online does everything else for us. */
1740		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1741		break;
1742	case CPU_ONLINE:
1743	case CPU_ONLINE_FROZEN:
1744		xfs_icsb_lock(mp);
1745		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
1746		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
1747		xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
1748		xfs_icsb_unlock(mp);
1749		break;
1750	case CPU_DEAD:
1751	case CPU_DEAD_FROZEN:
1752		/* Disable all the counters, then fold the dead cpu's
1753		 * count into the total on the global superblock and
1754		 * re-enable the counters. */
1755		xfs_icsb_lock(mp);
1756		s = XFS_SB_LOCK(mp);
1757		xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
1758		xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
1759		xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
1760
1761		mp->m_sb.sb_icount += cntp->icsb_icount;
1762		mp->m_sb.sb_ifree += cntp->icsb_ifree;
1763		mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
1764
1765		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1766
1767		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT,
1768					 XFS_ICSB_SB_LOCKED, 0);
1769		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE,
1770					 XFS_ICSB_SB_LOCKED, 0);
1771		xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS,
1772					 XFS_ICSB_SB_LOCKED, 0);
1773		XFS_SB_UNLOCK(mp, s);
1774		xfs_icsb_unlock(mp);
1775		break;
1776	}
1777
1778	return NOTIFY_OK;
1779}
1780#endif /* CONFIG_HOTPLUG_CPU */
1781
1782int
1783xfs_icsb_init_counters(
1784	xfs_mount_t	*mp)
1785{
1786	xfs_icsb_cnts_t *cntp;
1787	int		i;
1788
1789	mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
1790	if (mp->m_sb_cnts == NULL)
1791		return -ENOMEM;
1792
1793#ifdef CONFIG_HOTPLUG_CPU
1794	mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1795	mp->m_icsb_notifier.priority = 0;
1796	register_hotcpu_notifier(&mp->m_icsb_notifier);
1797#endif /* CONFIG_HOTPLUG_CPU */
1798
1799	for_each_online_cpu(i) {
1800		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1801		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1802	}
1803
1804	mutex_init(&mp->m_icsb_mutex);
1805
1806	/*
1807	 * start with all counters disabled so that the
1808	 * initial balance kicks us off correctly
1809	 */
1810	mp->m_icsb_counters = -1;
1811	return 0;
1812}
1813
1814void
1815xfs_icsb_reinit_counters(
1816	xfs_mount_t	*mp)
1817{
1818	xfs_icsb_lock(mp);
1819	/*
1820	 * start with all counters disabled so that the
1821	 * initial balance kicks us off correctly
1822	 */
1823	mp->m_icsb_counters = -1;
1824	xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
1825	xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
1826	xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
1827	xfs_icsb_unlock(mp);
1828}
1829
1830STATIC void
1831xfs_icsb_destroy_counters(
1832	xfs_mount_t	*mp)
1833{
1834	if (mp->m_sb_cnts) {
1835		unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1836		free_percpu(mp->m_sb_cnts);
1837	}
1838	mutex_destroy(&mp->m_icsb_mutex);
1839}
1840
1841STATIC_INLINE void
1842xfs_icsb_lock_cntr(
1843	xfs_icsb_cnts_t	*icsbp)
1844{
1845	while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
1846		ndelay(1000);
1847	}
1848}
1849
1850STATIC_INLINE void
1851xfs_icsb_unlock_cntr(
1852	xfs_icsb_cnts_t	*icsbp)
1853{
1854	clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
1855}
1856
1857
1858STATIC_INLINE void
1859xfs_icsb_lock_all_counters(
1860	xfs_mount_t	*mp)
1861{
1862	xfs_icsb_cnts_t *cntp;
1863	int		i;
1864
1865	for_each_online_cpu(i) {
1866		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1867		xfs_icsb_lock_cntr(cntp);
1868	}
1869}
1870
1871STATIC_INLINE void
1872xfs_icsb_unlock_all_counters(
1873	xfs_mount_t	*mp)
1874{
1875	xfs_icsb_cnts_t *cntp;
1876	int		i;
1877
1878	for_each_online_cpu(i) {
1879		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1880		xfs_icsb_unlock_cntr(cntp);
1881	}
1882}
1883
STATIC void
xfs_icsb_count(
	xfs_mount_t	*mp,
	xfs_icsb_cnts_t	*cnt,
	int		flags)
{
	xfs_icsb_cnts_t *cntp;
	int		i;

	memset(cnt, 0, sizeof(xfs_icsb_cnts_t));

	if (!(flags & XFS_ICSB_LAZY_COUNT))
		xfs_icsb_lock_all_counters(mp);

	for_each_online_cpu(i) {
		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
		cnt->icsb_icount += cntp->icsb_icount;
		cnt->icsb_ifree += cntp->icsb_ifree;
		cnt->icsb_fdblocks += cntp->icsb_fdblocks;
	}

	if (!(flags & XFS_ICSB_LAZY_COUNT))
		xfs_icsb_unlock_all_counters(mp);
}

STATIC int
xfs_icsb_counter_disabled(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field)
{
	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
	return test_bit(field, &mp->m_icsb_counters);
}

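/*
 * Disable a counter: fold the per-cpu values for this field back into
 * the in-core superblock, which then holds the single authoritative
 * value until the counter is re-enabled by a rebalance.
 */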
STATIC int
xfs_icsb_disable_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field)
{
	xfs_icsb_cnts_t	cnt;

	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

	/*
	 * If we are already disabled, then there is nothing to do
	 * here. We check before locking all the counters to avoid
	 * the expensive lock operation when being called in the
	 * slow path and the counter is already disabled. This is
	 * safe because the only time we set or clear this state is under
	 * the m_icsb_mutex.
	 */
	if (xfs_icsb_counter_disabled(mp, field))
		return 0;

	xfs_icsb_lock_all_counters(mp);
	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
		/* drain back to superblock */

		xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
		switch (field) {
		case XFS_SBS_ICOUNT:
			mp->m_sb.sb_icount = cnt.icsb_icount;
			break;
		case XFS_SBS_IFREE:
			mp->m_sb.sb_ifree = cnt.icsb_ifree;
			break;
		case XFS_SBS_FDBLOCKS:
			mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
			break;
		default:
			BUG();
		}
	}

	xfs_icsb_unlock_all_counters(mp);

	return 0;
}

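/*
 * Enable a counter: give each online CPU an equal share of @count,
 * hand any division residual to the first CPU visited, then clear the
 * disabled bit for this field.
 */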
STATIC void
xfs_icsb_enable_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	uint64_t	count,
	uint64_t	resid)
{
	xfs_icsb_cnts_t	*cntp;
	int		i;

	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

	xfs_icsb_lock_all_counters(mp);
	for_each_online_cpu(i) {
		cntp = per_cpu_ptr(mp->m_sb_cnts, i);
		switch (field) {
		case XFS_SBS_ICOUNT:
			cntp->icsb_icount = count + resid;
			break;
		case XFS_SBS_IFREE:
			cntp->icsb_ifree = count + resid;
			break;
		case XFS_SBS_FDBLOCKS:
			cntp->icsb_fdblocks = count + resid;
			break;
		default:
			BUG();
			break;
		}
		resid = 0;
	}
	clear_bit(field, &mp->m_icsb_counters);
	xfs_icsb_unlock_all_counters(mp);
}

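/*
 * Fold the current per-cpu counts into the in-core superblock for
 * every counter that is still enabled; disabled counters are already
 * accurate in the superblock. The superblock lock is taken here unless
 * the caller passes XFS_ICSB_SB_LOCKED to say it already holds it.
 */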
void
xfs_icsb_sync_counters_flags(
	xfs_mount_t	*mp,
	int		flags)
{
	xfs_icsb_cnts_t	cnt;
	int		s;

	/* take the superblock lock unless the caller already holds it */
	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
		s = XFS_SB_LOCK(mp);

	xfs_icsb_count(mp, &cnt, flags);

	/* update mp->m_sb from the aggregated per-cpu counts */
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
		mp->m_sb.sb_icount = cnt.icsb_icount;
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
		mp->m_sb.sb_ifree = cnt.icsb_ifree;
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
		mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;

	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
		XFS_SB_UNLOCK(mp, s);
}

/*
 * Accurate update of the in-core superblock from the per-cpu counters.
 */
STATIC void
xfs_icsb_sync_counters(
	xfs_mount_t	*mp)
{
	xfs_icsb_sync_counters_flags(mp, 0);
}

/*
 * Balance and enable/disable counters as necessary.
 *
 * Thresholds for re-enabling counters are somewhat magic. The inode
 * counts are chosen to match a single on-disk inode allocation chunk
 * per CPU, and the free block count is set far enough above zero that
 * we don't thrash when we get near ENOSPC. We also need to supply a
 * minimum we require per CPU to prevent looping endlessly when
 * xfs_alloc_space asks for more than will be distributed to a single
 * CPU but each CPU has enough blocks to be reenabled.
 *
 * Note that we can be called when counters are already disabled.
 * xfs_icsb_disable_counter() optimises the counter locking in this case to
 * prevent locking every per-cpu counter needlessly.
 */

#define XFS_ICSB_INO_CNTR_REENABLE	(uint64_t)64
#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
		(uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
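/*
 * Worked example of the split below (illustrative numbers only, and
 * assuming XFS_ALLOC_SET_ASIDE() evaluates to 0 for simplicity): with
 * sb_fdblocks = 8195 and 4 online CPUs, do_div() leaves count = 2048
 * with resid = 3, so the first CPU is seeded with 2051 blocks and the
 * other three with 2048 each; as 2048 >= 512 the counter is
 * re-enabled. With sb_fdblocks = 1000, count = 250 falls below the
 * 512-block threshold, so the counter stays disabled and the
 * superblock value remains authoritative.
 */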
STATIC void
xfs_icsb_balance_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t  field,
	int		flags,
	int		min_per_cpu)
{
	uint64_t	count, resid;
	int		weight = num_online_cpus();
	int		s;
	uint64_t	min = (uint64_t)min_per_cpu;

	if (!(flags & XFS_ICSB_SB_LOCKED))
		s = XFS_SB_LOCK(mp);

	/* disable the counter, draining its value back to the superblock */
	xfs_icsb_disable_counter(mp, field);

	/* split the count across CPUs - the first CPU gets the residual */
	switch (field) {
	case XFS_SBS_ICOUNT:
		count = mp->m_sb.sb_icount;
		resid = do_div(count, weight);
		if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
			goto out;
		break;
	case XFS_SBS_IFREE:
		count = mp->m_sb.sb_ifree;
		resid = do_div(count, weight);
		if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
			goto out;
		break;
	case XFS_SBS_FDBLOCKS:
		count = mp->m_sb.sb_fdblocks;
		resid = do_div(count, weight);
		if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
			goto out;
		break;
	default:
		BUG();
		count = resid = 0;	/* quiet, gcc */
		break;
	}

	xfs_icsb_enable_counter(mp, field, count, resid);
out:
	if (!(flags & XFS_ICSB_SB_LOCKED))
		XFS_SB_UNLOCK(mp, s);
}

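/*
 * Modify a per-cpu counter by @delta. The fast path updates only this
 * CPU's value under its bit lock. If the counter is disabled, or the
 * local value would go negative, we fall back to a slow path that is
 * serialised by m_icsb_mutex: either apply the change to the global
 * superblock counter directly, or rebalance and retry the fast path.
 */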
int
xfs_icsb_modify_counters(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int64_t		delta,
	int		rsvd)
{
	xfs_icsb_cnts_t	*icsbp;
	long long	lcounter;	/* long counter for 64 bit fields */
	int		cpu, ret = 0, s;

	might_sleep();
again:
	cpu = get_cpu();
	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);

	/*
	 * if the counter is disabled, go to slow path
	 */
	if (unlikely(xfs_icsb_counter_disabled(mp, field)))
		goto slow_path;
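	/*
	 * Recheck with the per-cpu lock held: the counter may have been
	 * disabled between the unlocked check above and taking the lock.
	 */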
	xfs_icsb_lock_cntr(icsbp);
	if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
		xfs_icsb_unlock_cntr(icsbp);
		goto slow_path;
	}

	switch (field) {
	case XFS_SBS_ICOUNT:
		lcounter = icsbp->icsb_icount;
		lcounter += delta;
		if (unlikely(lcounter < 0))
			goto balance_counter;
		icsbp->icsb_icount = lcounter;
		break;

	case XFS_SBS_IFREE:
		lcounter = icsbp->icsb_ifree;
		lcounter += delta;
		if (unlikely(lcounter < 0))
			goto balance_counter;
		icsbp->icsb_ifree = lcounter;
		break;

	case XFS_SBS_FDBLOCKS:
		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);

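		/*
		 * Deduct the set-aside blocks before applying the delta
		 * so the underflow check below treats them as
		 * unavailable, then add them back when storing the
		 * result.
		 */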
		lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
		lcounter += delta;
		if (unlikely(lcounter < 0))
			goto balance_counter;
		icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
		break;
	default:
		BUG();
		break;
	}
	xfs_icsb_unlock_cntr(icsbp);
	put_cpu();
	return 0;

slow_path:
	put_cpu();

	/*
	 * Serialise with a mutex so we don't burn lots of cpu on
	 * the superblock lock. We still need to hold the superblock
	 * lock, however, when we modify the global structures.
	 */
	xfs_icsb_lock(mp);

	/*
	 * Now running atomically.
	 *
	 * If the counter is enabled, someone has beaten us to rebalancing.
	 * Drop the lock and try again in the fast path....
	 */
	if (!xfs_icsb_counter_disabled(mp, field)) {
		xfs_icsb_unlock(mp);
		goto again;
	}

	/*
	 * The counter is currently disabled. Because we are
	 * running atomically here, we know a rebalance cannot
	 * be in progress. Hence we can go straight to operating
	 * on the global superblock. We do not call xfs_mod_incore_sb()
	 * here even though we need to get the SB_LOCK. Doing so
	 * will cause us to re-enter this function and deadlock.
	 * Hence we get the SB_LOCK ourselves and then call
	 * xfs_mod_incore_sb_unlocked() as the unlocked path operates
	 * directly on the global counters.
	 */
	s = XFS_SB_LOCK(mp);
	ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
	XFS_SB_UNLOCK(mp, s);

	/*
	 * Now that we've modified the global superblock, we
	 * may be able to re-enable the distributed counters
	 * (e.g. lots of space just got freed). After that
	 * we are done.
	 */
	if (ret != ENOSPC)
		xfs_icsb_balance_counter(mp, field, 0, 0);
	xfs_icsb_unlock(mp);
	return ret;

balance_counter:
	xfs_icsb_unlock_cntr(icsbp);
	put_cpu();

	/*
	 * We may have multiple threads here if multiple per-cpu
	 * counters run dry at the same time. This will mean we can
	 * do more balances than strictly necessary but it is not
	 * the common slowpath case.
	 */
	xfs_icsb_lock(mp);

	/*
	 * Now running atomically.
	 *
	 * This will leave the counter in the correct state for future
	 * accesses. After the rebalance, we simply try again and our retry
	 * will either succeed through the fast path or slow path without
	 * another balance operation being required.
	 */
	xfs_icsb_balance_counter(mp, field, 0, delta);
	xfs_icsb_unlock(mp);
	goto again;
}

#endif /* HAVE_PERCPU_SB */