/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t  page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 *	XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in a 64-bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC macro returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define	VPH_TABLE_SIZE  (8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
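
/*
 * A minimal worked example (hypothetical numbers, assuming 8K pages): with
 * ncpu = 64 and 32 GB of memory, pp_per_mb = 128 and npg / pp_per_mb = 32768,
 * so
 *
 *	size = MAX(128, MIN(32768, 2 * 64 * 64)) = 8192
 *
 * which is already a power of two; the rounding step leaves it there, the
 * function returns 13, and the platform ends up with 1 << 13 = 8192 pse
 * mutexes.
 */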

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
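
/*
 * A minimal sketch (not lifted from a real caller) of the lookup-then-lock
 * pattern this interface is designed for; page_hash_search() is only a
 * stand-in for whatever hash lookup the caller uses.  The page hash mutex is
 * passed in as `lock', so a failed page_lock() means we may have blocked and
 * must look the page up again before retrying:
 *
 *	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
 *	mutex_enter(phm);
 *	for (;;) {
 *		pp = page_hash_search(vp, off);
 *		if (pp == NULL || page_lock(pp, SE_SHARED, phm, P_RECLAIM))
 *			break;
 *		(page_lock() blocked; it dropped and reacquired phm, so
 *		pp may be stale - look the page up again)
 *	}
 *	mutex_exit(phm);
 */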

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags     p_selock/SE_EWANTED  Action
 * ----------- -------------- -------------------  ---------
 * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
 * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
 * SE_EXCL        none         any lock/any        deny
 * SE_SHARED      n/a [2]        shared/0          grant
 * SE_SHARED      n/a [2]      unlocked/0          grant
 * SE_SHARED      n/a            shared/1          deny
 * SE_SHARED      n/a          unlocked/1          deny
 * SE_SHARED      n/a              excl/any        deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value.  This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation. If this ever becomes an issue, a priority or
 *   FIFO mechanism should also be implemented. Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has the SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 *   es & 1: page_lookup_create will attempt page relocation
 *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 *       memory thread); this prevents reader-starvation of waiting
 *       writer thread(s) by giving priority to writers over readers.
 *   es & SE_RETIRED: caller wants to lock pages even if they are
 *       retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, so force the upgrade now.
			 * Again, if the page is no longer free we will fail
			 * to acquire p_selock and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
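
/*
 * A minimal sketch (hypothetical caller, in the spirit of the memory delete
 * thread) of how SE_EXCL_WANTED is meant to be paired with
 * page_lock_clr_exclwanted(); `cancelled' is only a stand-in for whatever
 * condition ends the retry loop:
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (cancelled) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(hz);
 *	}
 *	(lock obtained; our SE_EWANTED setting was cleared for us)
 */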

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access (SE_EWANTED
		 * is set), if the page is retired, or if a share lock is
		 * requested on a page that is slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code. The mere existence of this code is a vile hack that
 * has resulted from the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try to capture the page again, as we could recurse,
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page. Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
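
/*
 * A minimal sketch (not from a real caller) of the usual fallback when the
 * upgrade fails: drop the shared lock, take an exclusive lock, and re-verify
 * the page's identity, since it may have changed while the page was unlocked:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM)) {
 *			(lost the race; start over)
 *		}
 *		(re-check pp->p_vnode and pp->p_offset before proceeding)
 *	}
 */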

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}
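
/*
 * A minimal sketch of the intended bracket; it assumes the caller already
 * holds the page's shared or exclusive lock so that the page identity stays
 * stable across the i/o:
 *
 *	page_io_lock(pp);
 *	(start the physical i/o against pp and wait for it to complete)
 *	page_io_unlock(pp);
 */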

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the page's i/o lock is currently held, 0 if not.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}
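
/*
 * A minimal sketch (not from a real caller) of how the returned mutex is
 * used to keep a vnode's page list stable while walking it; it assumes the
 * usual circular v_pages list linked through p_vpnext/p_vpprev:
 *
 *	kmutex_t *vphm = page_vnode_mutex(vp);
 *	page_t *pp;
 *
 *	mutex_enter(vphm);
 *	if ((pp = vp->v_pages) != NULL) {
 *		do {
 *			(examine pp; its list linkage cannot change here)
 *			pp = pp->p_vpnext;
 *		} while (pp != vp->v_pages);
 *	}
 *	mutex_exit(vphm);
 */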

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take, and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before the hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e., non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about a p_szc increase
 * since the page is locked and not free.
 *
 * The hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease, if pp == rootpp then rootpp will
	 * always remain the root, i.e. we have the right root regardless
	 * of rootpp->p_szc.
	 * Otherwise, if the location of pp's root didn't change after we
	 * took the lock, we also have the right root; return the mutex
	 * hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}
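
/*
 * A minimal sketch of the intended usage: take the demote lock (if the page
 * is part of a large page), do whatever must not race with hat_page_demote(),
 * then drop it:
 *
 *	kmutex_t *mtx = page_szc_lock(pp);
 *
 *	(pp->p_szc is now stable against hat_page_demote())
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */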

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
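
/*
 * A minimal sketch (hypothetical caller) of how these wrappers are used when
 * walking the physical memory lists:
 *
 *	memlist_read_lock();
 *	(walk phys_install or phys_avail; neither list can change here)
 *	memlist_read_unlock();
 */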