1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26/*	 All Rights Reserved   */
27
28/*
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
32 *
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
36 */
37
38#ifndef	_VM_ANON_H
39#define	_VM_ANON_H
40
41#include <sys/cred.h>
42#include <sys/zone.h>
43#include <vm/seg.h>
44#include <vm/vpage.h>
45
46#ifdef	__cplusplus
47extern "C" {
48#endif
49
50/*
51 * VM - Anonymous pages.
52 */
53
/* Offset type used by the an_off and an_poff fields of struct anon. */
54typedef	unsigned long anoff_t;		/* anon offsets */
55
56/*
57 *	Each anonymous page, either in memory or in swap, has an anon structure.
58 * The structure (slot) provides a level of indirection between anonymous pages
59 * and their backing store.
60 *
61 *	(an_vp, an_off) names the vnode of the anonymous page for this slot.
62 *
63 * 	(an_pvp, an_poff) names the location of the physical backing store
64 * 	for the page this slot represents. If the name is null there is no
65 * 	associated physical store. The physical backing store location can
66 *	change while the slot is in use.
67 *
68 *	an_hash is a hash list of anon slots. The list is hashed by
69 * 	(an_vp, an_off) of the associated anonymous page and provides a
70 *	method of going from the name of an anonymous page to its
71 * 	associated anon slot.
72 *
73 *	an_refcnt holds a reference count which is the number of separate
74 * 	copies that will need to be created in case of copy-on-write.
75 *	A refcnt > 0 protects the existence of the slot. The refcnt is
76 * 	initialized to 1 when the anon slot is created in anon_alloc().
77 *	If a client obtains an anon slot and allows multiple threads to
78 * 	share it, then it is the client's responsibility to ensure that
79 *	it does not allow one thread to try to reference the slot at the
80 *	same time as another is trying to decrement the last count and
81 *	destroy the anon slot. E.g., the seg_vn segment type protects
82 *	against this with higher level locks.
83 */
84
85struct anon {
86	struct vnode *an_vp;	/* vnode naming the anon page */
87	struct vnode *an_pvp;	/* vnode of physical backing store; */
				/* null name means no physical store */
88	anoff_t an_off;		/* offset of anon page within an_vp */
89	anoff_t an_poff;	/* offset of backing store within an_pvp */
90	struct anon *an_hash;	/* link in hash list of anon slots, */
				/* hashed by (an_vp, an_off) */
91	int an_refcnt;		/* # of people sharing slot; set to 1 */
				/* when created in anon_alloc() */
92};
93
/*
 * Alignment of anon structure addresses: 1U << 4, i.e. 16 bytes.
 * AN_CACHE_ALIGN_LOG2 must stay equal to log2(AN_CACHE_ALIGN).
 */
94#define	AN_CACHE_ALIGN_LOG2	4	/* log2(AN_CACHE_ALIGN) */
95#define	AN_CACHE_ALIGN	(1U << AN_CACHE_ALIGN_LOG2) /* anon address aligned */
96						/* 16 bytes */
97
98
99#ifdef _KERNEL
100/*
101 * The swapinfo_lock protects:
102 *		swapinfo list
103 *		individual swapinfo structures
104 *
105 * The anoninfo_lock protects:
106 *		anoninfo counters
107 *
108 * The anonhash_lock protects:
109 *		anon hash lists
110 *		anon slot fields
111 *
112 * Fields in the anon slot which are read-only for the life of the slot
113 * (an_vp, an_off) do not require the anonhash_lock be held to access them.
114 * If you access a field without the anonhash_lock held you must be holding
115 * the slot with an_refcnt to make sure it isn't destroyed.
116 * To write (an_pvp, an_poff) in a given slot you must also hold the
117 * p_iolock of the anonymous page for slot.
118 */
/* Lock and condition-variable objects of the anon layer; only declared here. */
119extern kmutex_t anoninfo_lock;	/* protects anoninfo counters */
120extern kmutex_t swapinfo_lock;	/* protects swapinfo list and structures */
121extern pad_mutex_t *anonhash_lock;	/* array indexed through AH_MUTEX() */
122extern pad_mutex_t anon_array_lock[];	/* hash locks for anon array ranges */
123extern kcondvar_t anon_array_cv[];	/* cv's used with anon_array_lock[] */
124
125/*
126 * Global hash table to provide a function from (vp, off) -> ap
127 */
128extern size_t anon_hash_size;
129extern unsigned int anon_hash_shift;
130extern struct anon **anon_hash;
131#define	ANON_HASH_SIZE	anon_hash_size
132#define	ANON_HASHAVELEN	4
133/*
134 * Try to use as many bits of randomness from both vp and off as we can.
135 * This should help spreading evenly for a variety of workloads.  See comments
136 * for PAGE_HASH_FUNC for more explanation.
 *
 * ANON_HASH(vp, off) XORs shifted copies of off (page number and page
 * number >> anon_hash_shift) with shifted copies of the vp address, then
 * masks the result with (anon_hash_size - 1).
 *
 * NOTE(review): the final mask presumably relies on anon_hash_size being a
 * power of two -- confirm where anon_hash_size is set.
 * NOTE(review): AN_VPSHIFT and VNODE_ALIGN_LOG2 are not defined in this
 * header; the left shift count (anon_hash_shift - AN_VPSHIFT -
 * VNODE_ALIGN_LOG2) is assumed to be non-negative -- confirm.
137 */
138#define	ANON_HASH(vp, off)	\
139	(((((uintptr_t)(off) >> PAGESHIFT) ^ \
140		((uintptr_t)(off) >> (PAGESHIFT + anon_hash_shift))) ^ \
141		(((uintptr_t)(vp) >> 3) ^ \
142		((uintptr_t)(vp) >> (3 + anon_hash_shift)) ^ \
143		((uintptr_t)(vp) >> (3 + 2 * anon_hash_shift)) ^ \
144		((uintptr_t)(vp) << \
145		    (anon_hash_shift - AN_VPSHIFT - VNODE_ALIGN_LOG2)))) & \
146		(anon_hash_size - 1))
147
/*
 * AH_LOCK_SIZE is the mask base used to index anonhash_lock[].
 * AH_MUTEX(vp, off) yields a pointer to the pad_mutex member of the
 * anonhash_lock[] entry selected by ANON_HASH(vp, off) & (AH_LOCK_SIZE - 1).
 */
148#define	AH_LOCK_SIZE	(2 << NCPU_LOG2)
149
150#define	AH_MUTEX(vp, off)				\
151	(&anonhash_lock[(ANON_HASH((vp), (off)) &	\
152	    (AH_LOCK_SIZE - 1))].pad_mutex)
153
154#endif	/* _KERNEL */
155
156/*
157 * Declaration for the Global counters to accurately
158 * track the kernel foot print in memory.
159 */
/* Page counters (pgcnt_t); declared here, defined outside this header. */
160extern  pgcnt_t pages_locked;
161extern  pgcnt_t pages_claimed;
162extern  pgcnt_t pages_useclaim;
163extern  pgcnt_t obp_pages;
164
165/*
166 * Anonymous backing store accounting structure for swapctl.
167 *
168 * ani_max = maximum amount of swap space
169 *	(including potentially available physical memory)
170 * ani_free = amount of unallocated anonymous memory
171 *	(some of which might be reserved and including
172 *	potentially available physical memory)
173 * ani_resv = amount of claimed (reserved) anonymous memory
174 *
175 * The swap data can be acquired more efficiently through the
176 * kstats interface.
177 * Total slots currently available for reservation =
178 *	MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree)
179 */
180struct anoninfo {
181	pgcnt_t	ani_max;	/* maximum amount of swap space */
182	pgcnt_t	ani_free;	/* unallocated anonymous memory */
183	pgcnt_t	ani_resv;	/* claimed (reserved) anonymous memory */
184};
185
/*
 * Variant of struct anoninfo with the same field names but size32_t
 * members; only compiled when _SYSCALL32 is defined.
 */
186#ifdef _SYSCALL32
187struct anoninfo32 {
188	size32_t ani_max;
189	size32_t ani_free;
190	size32_t ani_resv;
191};
192#endif /* _SYSCALL32 */
193
194/*
195 * Define the NCPU pool of the ani_free counters. Update the counter
196 * of the cpu on which the thread is running and in every clock intr
197 * sync anoninfo.ani_free with the current total of all the NCPU entries.
198 */
199
/*
 * One counter bucket.  The pad member sizes the structure to 64 bytes
 * (ani_count plus 64 - sizeof (pgcnt_t) bytes of padding).
 */
200typedef	struct	ani_free {
201	pgcnt_t		ani_count;
202	uchar_t		pad[64 - sizeof (pgcnt_t)];
203			/* XXX 64 = cacheline size */
204} ani_free_t;
205
/* ANI_ADD() masks the bucket index with (ANI_MAX_POOL - 1). */
206#define	ANI_MAX_POOL	(NCPU_P2)
207extern	ani_free_t	*ani_free_pool;
208
/*
 * ANI_ADD(inc): atomically add inc to the ani_count of one ani_free_pool
 * bucket.  The bucket is chosen from the current CPU's cpu_seqid masked
 * with (ANI_MAX_POOL - 1).
 *
 * Since each CPU has its own bucket in ani_free_pool, there should be no
 * contention here.
 *
 * The body is wrapped in do { } while (0) so that "ANI_ADD(x);" is a single
 * statement and can be used safely as the unbraced body of an if/else.
 * inc is parenthesized so that an expression argument is passed intact.
 */
#define	ANI_ADD(inc)	do { \
	pgcnt_t	*ani_countp; \
	int	index; \
	index = (CPU->cpu_seqid & (ANI_MAX_POOL - 1)); \
	ani_countp = &ani_free_pool[index].ani_count; \
	atomic_add_long(ani_countp, (inc)); \
} while (0)
220
/*
 * NOTE(review): presumably folds the ani_free_pool buckets into
 * anoninfo.ani_free -- defined elsewhere, confirm against the definition.
 */
221extern void	set_anoninfo(void);
222
223/*
224 * Anon array pointers are allocated in chunks. Each chunk
225 * has PAGESIZE/sizeof(u_long *) of anon pointers.
226 * There are two levels of arrays for anon array pointers larger
227 * than a chunk. The first level points to anon array chunks.
228 * The second level consists of chunks of anon pointers.
229 *
230 * If anon array is smaller than a chunk then the whole anon array
231 * is created (memory is allocated for whole anon array).
232 * If anon array is larger than a chunk only first level array is
233 * allocated. Then other arrays (chunks) are allocated only when
234 * they are initialized with anon pointers.
235 */
/* Header of an anon pointer array (single level or two level). */
236struct anon_hdr {
237	kmutex_t serial_lock;	/* serialize array chunk allocation */
238	pgcnt_t	size;		/* number of pointers to (anon) pages */
239	void	**array_chunk;	/* pointers to anon pointers or chunks of */
240				/* anon pointers */
241	int	flags;		/* ANON_ALLOC_FORCE force preallocation of */
242				/* whole anon array	*/
243};
244
/*
 * ANON_PTRSHIFT is 3 under _LP64 and 2 otherwise.  ANON_PTRMASK clears the
 * low ANON_PTRSHIFT bits (~7 or ~3).
 */
#ifdef	_LP64
#define	ANON_PTRSHIFT	3
#else
#define	ANON_PTRSHIFT	2
#endif
#define	ANON_PTRMASK	(~((1 << ANON_PTRSHIFT) - 1))

/*
 * Chunk geometry derived from the page size:
 *	ANON_CHUNK_SIZE		PAGESIZE >> ANON_PTRSHIFT
 *	ANON_CHUNK_SHIFT	PAGESHIFT - ANON_PTRSHIFT
 *	ANON_CHUNK_OFF		ANON_CHUNK_SIZE - 1
 */
#define	ANON_CHUNK_SHIFT	(PAGESHIFT - ANON_PTRSHIFT)
#define	ANON_CHUNK_SIZE		(PAGESIZE >> ANON_PTRSHIFT)
#define	ANON_CHUNK_OFF		(ANON_CHUNK_SIZE - 1)
256
257/*
258 * Anon flags.
259 */
/* ANON_SLEEP is the zero value; the other flags are distinct single bits. */
260#define	ANON_SLEEP		0x0	/* ok to block */
261#define	ANON_NOSLEEP		0x1	/* non-blocking call */
262#define	ANON_ALLOC_FORCE	0x2	/* force single level anon array */
263#define	ANON_GROWDOWN		0x4	/* anon array should grow downward */
264
/* Forward declaration; this header only uses pointers to struct kshmid. */
265struct kshmid;
266
267/*
268 * The anon_map structure is used by various clients of the anon layer to
269 * manage anonymous memory.   When anonymous memory is shared,
270 * then the different clients sharing it will point to the
271 * same anon_map structure.  Also, if a segment is unmapped
272 * in the middle where an anon_map structure exists, the
273 * newly created segment will also share the anon_map structure,
274 * although the two segments will use different ranges of the
275 * anon array.  When mappings are private (or shared with
276 * a reference count of 1), an unmap operation will free up
277 * a range of anon slots in the array given by the anon_map
278 * structure.  Because of fragmentation due to this unmapping,
279 * we have to store the size of the anon array in the anon_map
280 * structure so that we can free everything when the reference
281 * count goes to zero.
282 *
283 * A new rangelock scheme is introduced to make the anon layer scale.
284 * A reader/writer lock per anon_amp and an array of system-wide hash
285 * locks, anon_array_lock[] are introduced to replace serial_lock and
286 * anonmap lock.  The writer lock is held when we want to single-thread
287 * the references to the anon array pointers or to
288 * anon_map's members, whereas the reader lock and anon_array_lock are
289 * held to allow multiple threads to reference different parts of
290 * anon array.  A global set of condition variables, anon_array_cv,
291 * are used with anon_array_lock[] to make the hold time of the locks
292 * short.
293 *
294 * szc is used to calculate the index of hash locks and cv's.  We
295 * could've just used seg->s_szc if not for the possible sharing of
296 * anon_amp between SYSV shared memory and ISM, so now we introduce
297 * szc in the anon_map structure.  For MAP_SHARED, the amp->szc is either
298 * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE
299 * the amp->szc could be anything in [0, page_num_pagesizes() - 1].
300 */
/*
 * anon_map (amp_t): state shared by the clients of one region of anonymous
 * memory.  a_rwlock guards the structure and its anon array.
 */
301typedef struct anon_map {
302	krwlock_t a_rwlock;	/* protect anon_map and anon array */
303	size_t	size;		/* size in bytes mapped by the anon array */
304	struct	anon_hdr *ahp; 	/* anon array header pointer, containing */
305				/* anon pointer array(s) */
306	size_t	swresv;		/* swap space reserved for this anon_map */
307	ulong_t	refcnt;		/* reference count on this structure */
308	ushort_t a_szc;		/* max szc among shared processes */
309	void	*locality;	/* lgroup locality info */
310	struct kshmid *a_sp;	/* kshmid if amp backs sysV, or NULL */
311	int	a_purgewait;	/* somebody waits for slocks to go away */
312	kcondvar_t a_purgecv;	/* cv for waiting for slocks to go away */
313	kmutex_t a_purgemtx;	/* mutex for anonmap_purge() */
314	spgcnt_t a_softlockcnt; /* number of pages locked in pcache */
315	kmutex_t a_pmtx;	/* protects amp's pcache list */
316	pcache_link_t a_phead;	/* head of amp's pcache list */
317} amp_t;
318
319#ifdef _KERNEL
320
/*
 * ANON_BUSY is bit 0 of the word that slot points to; the three macros
 * below test, set and clear that bit through the pointer.
 */
321#define	ANON_BUSY		0x1
322#define	ANON_ISBUSY(slot)	(*(slot) & ANON_BUSY)
323#define	ANON_SETBUSY(slot)	(*(slot) |= ANON_BUSY)
324#define	ANON_CLRBUSY(slot)	(*(slot) &= ~ANON_BUSY)
325
/*
 * NOTE(review): the sizeof claim on ANON_MAP_SHIFT cannot be checked from
 * this header alone -- confirm against the actual size of struct anon_map.
 */
326#define	ANON_MAP_SHIFT		6	/* log2(sizeof (struct anon_map)) */
327#define	ANON_ARRAY_SHIFT	7	/* log2(ANON_LOCKSIZE) */
328#define	ANON_LOCKSIZE		128
329
/* Thin wrappers over the rw_enter/rw_exit/RW_*_HELD primitives. */
330#define	ANON_LOCK_ENTER(lock, type)	rw_enter((lock), (type))
331#define	ANON_LOCK_EXIT(lock)		rw_exit((lock))
332#define	ANON_LOCK_HELD(lock)		RW_LOCK_HELD((lock))
333#define	ANON_READ_HELD(lock)		RW_READ_HELD((lock))
334#define	ANON_WRITE_HELD(lock)		RW_WRITE_HELD((lock))
335
/*
 * ANON_ARRAY_HASH(amp, idx): sum idx with idx shifted right by 1x, 2x and
 * 3x ANON_ARRAY_SHIFT, XOR with the amp address >> ANON_MAP_SHIFT, and keep
 * the low bits selected by (ANON_LOCKSIZE - 1).  idx is evaluated four times.
 */
336#define	ANON_ARRAY_HASH(amp, idx)\
337	((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\
338	((idx) >> (ANON_ARRAY_SHIFT << 1)) +\
339	((idx) >> (ANON_ARRAY_SHIFT + (ANON_ARRAY_SHIFT << 1)))) ^\
340	((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1))
341
/*
 * Handle passed to anon_array_enter(), anon_array_try_enter() and
 * anon_array_exit().
 * NOTE(review): field meanings below are inferred from the names and types
 * only -- confirm against the implementation.
 */
342typedef struct anon_sync_obj {
343	kmutex_t	*sync_mutex;	/* presumably an anon_array_lock entry */
344	kcondvar_t	*sync_cv;	/* presumably an anon_array_cv entry */
345	ulong_t		*sync_data;	/* presumably the anon array slot word */
346} anon_sync_obj_t;
347
348/*
349 * Anonymous backing store accounting structure for kernel.
350 * ani_max = total reservable slots on physical (disk-backed) swap
351 * ani_phys_resv = total phys slots reserved for use by clients
352 * ani_mem_resv = total mem slots reserved for use by clients
353 * ani_free = # unallocated physical slots + # of reserved unallocated
354 * memory slots
355 */
356
357/*
358 * Initial total swap slots available for reservation:
 * k_anoninfo.ani_max plus (availrmem - swapfs_minfree), where the latter
 * is cast to the signed spgcnt_t and clamped at 0.
359 */
360#define	TOTAL_AVAILABLE_SWAP \
361	(k_anoninfo.ani_max + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
362
363/*
364 * Swap slots currently available for reservation:
 * as above, but with k_anoninfo.ani_phys_resv subtracted from ani_max.
365 */
366#define	CURRENT_TOTAL_AVAILABLE_SWAP				\
367	((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +	\
368	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
369
/* Kernel accounting counters for anonymous backing store. */
370struct k_anoninfo {
371	pgcnt_t	ani_max;	/* total reservable slots on phys */
372					/* (disk) swap */
373	pgcnt_t	ani_free;	/* # of unallocated phys and mem slots */
374	pgcnt_t	ani_phys_resv;	/* # of reserved phys (disk) slots */
375	pgcnt_t	ani_mem_resv;	/* # of reserved mem slots */
376	pgcnt_t	ani_locked_swap; /* # of swap slots locked in reserved */
377				/* mem swap */
378};
379
/* The instance read by the TOTAL_AVAILABLE_SWAP macros. */
380extern	struct k_anoninfo k_anoninfo;
381
/*
 * Kernel entry points of the anon layer.  Only the prototypes are visible
 * in this header; the grouping comments below are based on names and
 * argument types and should be confirmed against the definitions.
 */

/* Slot allocation, duplication and freeing. */
382extern void	anon_init(void);
383extern struct	anon *anon_alloc(struct vnode *, anoff_t);
384extern void	anon_dup(struct anon_hdr *, ulong_t,
385		    struct anon_hdr *, ulong_t, size_t);
386extern void	anon_dup_fill_holes(struct anon_hdr *, ulong_t,
387		    struct anon_hdr *, ulong_t, size_t, uint_t, int);
388extern int	anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *,
389		    ulong_t, struct vnode *, u_offset_t, size_t, uint_t,
390		    uint_t, struct vpage [], struct cred *);
391extern void	anon_free(struct anon_hdr *, ulong_t, size_t);
392extern void	anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t);
393extern void	anon_disclaim(struct anon_map *, ulong_t, size_t);
/* Page lookup and creation on behalf of a segment. */
394extern int	anon_getpage(struct anon **, uint_t *, struct page **,
395		    size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
396extern int	swap_getconpage(struct vnode *, u_offset_t, size_t,
397		    uint_t *, page_t *[], size_t, page_t *, uint_t *,
398		    spgcnt_t *, struct seg *, caddr_t,
399		    enum seg_rw, struct cred *);
400extern int	anon_map_getpages(struct anon_map *, ulong_t,
401		    uint_t, struct seg *, caddr_t, uint_t,
402		    uint_t *, page_t *[], uint_t *,
403		    struct vpage [], enum seg_rw, int, int, int, struct cred *);
404extern int	anon_map_privatepages(struct anon_map *, ulong_t,
405		    uint_t, struct seg *, caddr_t, uint_t,
406		    page_t *[], struct vpage [], int, int, struct cred *);
/* The int argument of anon_private presumably takes STEAL_PAGE/LOCK_PAGE. */
407extern struct	page *anon_private(struct anon **, struct seg *,
408		    caddr_t, uint_t, struct page *,
409		    int, struct cred *);
410extern struct	page *anon_zero(struct seg *, caddr_t,
411		    struct anon **, struct cred *);
412extern int	anon_map_createpages(struct anon_map *, ulong_t,
413		    size_t, struct page **,
414		    struct seg *, caddr_t,
415		    enum seg_rw, struct cred *);
416extern int	anon_map_demotepages(struct anon_map *, ulong_t,
417		    struct seg *, caddr_t, uint_t,
418		    struct vpage [], struct cred *);
419extern void	anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
/* Reservation primitives; the anon_resv* macros expand to calls of these. */
420extern int	anon_resvmem(size_t, boolean_t, zone_t *, int);
421extern void	anon_unresvmem(size_t, zone_t *);
/* anon_map lifetime. */
422extern struct	anon_map *anonmap_alloc(size_t, size_t, int);
423extern void	anonmap_free(struct anon_map *);
424extern void	anonmap_purge(struct anon_map *);
425extern void	anon_swap_free(struct anon *, struct page *);
426extern void	anon_decref(struct anon *);
427extern int	non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
428extern pgcnt_t	anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
429extern int	anon_swap_adjust(pgcnt_t);
430extern void	anon_swap_restore(pgcnt_t);
/* Anon array (struct anon_hdr) creation and slot access. */
431extern struct	anon_hdr *anon_create(pgcnt_t, int);
432extern void	anon_release(struct anon_hdr *, pgcnt_t);
433extern struct	anon *anon_get_ptr(struct anon_hdr *, ulong_t);
434extern ulong_t	*anon_get_slot(struct anon_hdr *, ulong_t);
435extern struct	anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *);
436extern int	anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int);
437extern int 	anon_copy_ptr(struct anon_hdr *, ulong_t,
438		    struct anon_hdr *, ulong_t, pgcnt_t, int);
439extern pgcnt_t	anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int);
/* Enter/exit pairs operating on an anon_sync_obj_t. */
440extern void	anon_array_enter(struct anon_map *, ulong_t,
441			anon_sync_obj_t *);
442extern int	anon_array_try_enter(struct anon_map *, ulong_t,
443			anon_sync_obj_t *);
444extern void	anon_array_exit(anon_sync_obj_t *);
445
446/*
447 * anon_resv checks to see if there is enough swap space to fulfill a
448 * request and if so, reserves the appropriate anonymous memory resources.
449 * anon_checkspace just checks to see if there is space to fulfill the request,
450 * without taking any resources.  Both return 1 if successful and 0 if not.
451 *
452 * Macros are provided as anon reservation is usually charged to the zone of
453 * the current process.  In some cases (such as anon reserved by tmpfs), a
454 * zone pointer is needed to charge the appropriate zone.
455 */
/*
 * Wrappers over anon_resvmem(size, <2nd>, zone, <4th>) and
 * anon_unresvmem(size, zone).  The variants without a zone argument use
 * curproc->p_zone.
 *
 *	macro			2nd arg	4th arg
 *	anon_resv		1	1
 *	anon_resv_zone		1	1
 *	anon_checkspace		0	0
 *	anon_try_resv_zone	1	0
 *
 * NOTE(review): the 2nd argument appears to select "reserve" (1) versus
 * "check only" (0); the meaning of the 4th argument is not visible in this
 * header -- confirm against anon_resvmem().
 *
 * Every macro argument is parenthesized in the expansion.
 */
#define	anon_unresv(size)		\
	anon_unresvmem((size), curproc->p_zone)
#define	anon_unresv_zone(size, zone)	anon_unresvmem((size), (zone))
#define	anon_resv(size)			\
	anon_resvmem((size), 1, curproc->p_zone, 1)
#define	anon_resv_zone(size, zone)	anon_resvmem((size), 1, (zone), 1)
#define	anon_checkspace(size, zone)	anon_resvmem((size), 0, (zone), 0)
#define	anon_try_resv_zone(size, zone)	anon_resvmem((size), 1, (zone), 0)
463
464/*
465 * Flags to anon_private
466 */
/* Distinct single bits, so both flags can be passed together. */
467#define	STEAL_PAGE	0x1	/* page can be stolen */
468#define	LOCK_PAGE	0x2	/* page must be ``logically'' locked */
469
470/*
471 * SEGKP ANON pages that are locked are assumed to be LWP stack pages
472 * and thus count towards the user pages locked count.
473 * This value is protected by the same lock as availrmem.
474 */
475extern pgcnt_t anon_segkp_pages_locked;
476
/* Bit mask tested by ANON_PRINT() when ANON_DEBUG is defined. */
477extern int anon_debug;
478
#ifdef ANON_DEBUG

/* Bits that can be set in anon_debug to enable a class of messages. */
#define	A_ANON	0x01
#define	A_RESV	0x02
#define	A_MRESV	0x04

/*
 * vararg-like debugging macro.
 *
 * f is a mask of A_* bits; printf_args is a complete, parenthesized printf
 * argument list, e.g. ANON_PRINT(A_RESV, ("resv %lu\n", n));
 * The message is printed only when (anon_debug & (f)) is non-zero.
 *
 * The expansion is wrapped in do { } while (0) so that the macro is a single
 * statement: a caller's "else" following an unbraced "if (...) ANON_PRINT()"
 * can no longer bind to the if inside the macro.  f is parenthesized so that
 * a mask such as (A_ANON | A_RESV) is tested as a whole.
 */
#define	ANON_PRINT(f, printf_args) \
	do { \
		if (anon_debug & (f)) \
			printf printf_args; \
	} while (0)

#else	/* ANON_DEBUG */

/* Without ANON_DEBUG the macro expands to nothing; arguments are dropped. */
#define	ANON_PRINT(f, printf_args)

#endif	/* ANON_DEBUG */
495
496#endif	/* _KERNEL */
497
498#ifdef	__cplusplus
499}
500#endif
501
502#endif	/* _VM_ANON_H */
503