1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/errno.h>
30#include <sys/kmem.h>
31#include <sys/vnode.h>
32#include <sys/vfs_opreg.h>
33#include <sys/swap.h>
34#include <sys/sysmacros.h>
35#include <sys/buf.h>
36#include <sys/callb.h>
37#include <sys/debug.h>
38#include <vm/seg.h>
39#include <sys/fs/swapnode.h>
40#include <fs/fs_subr.h>
41#include <sys/cmn_err.h>
42#include <sys/mem_config.h>
43#include <sys/atomic.h>
44
/*
 * Vnode operations template for swapfs, defined outside this file.
 * swapinit() hands it to vn_make_ops() to build swap_vnodeops.
 */
extern const fs_operation_def_t swap_vnodeops_template[];
46
47/*
48 * swapfs_minfree is the amount of physical memory (actually remaining
49 * availrmem) that we want to keep free for the rest of the system.  This
50 * means that swapfs can only grow to availrmem - swapfs_minfree.  This
 * can be set either to a constant value or to a certain percentage of
 * installed physical memory. It is set in swapinit().
53 *
54 * Users who want to change the amount of memory that can be used as swap
55 * space should do so by setting swapfs_desfree at boot time,
56 * not swapfs_minfree.
57 */
58
/*
 * swapfs memory-reservation tunables.  The values present when swapinit()
 * runs are saved by swapfs_recalc_save_initial(); a saved value of zero
 * makes swapfs_recalc() substitute a computed default.
 */
pgcnt_t swapfs_desfree = 0;
pgcnt_t swapfs_minfree = 0;
pgcnt_t swapfs_reserve = 0;
62
/*
 * Debug variable, present only when SWAPFS_DEBUG is defined.  It is not
 * referenced in this file; presumably it is consulted by the SWAPFS_PRINT
 * macro -- TODO confirm against sys/fs/swapnode.h.
 */
#ifdef SWAPFS_DEBUG
int swapfs_debug;
#endif /* SWAPFS_DEBUG */
66
67
/* Count of swapfs vnodes created so far by swapfs_getvp(). */
static int swapfs_vpcount;
/*
 * Held while creating an entry in swap_vnodes and while updating the
 * async request lists (sw_pendlist, sw_freelist) and sw_pending_size.
 */
static kmutex_t swapfs_lock;
/*
 * sw_ar is the array of async request structures allocated in swapinit();
 * sw_pendlist and sw_freelist are the heads of the pending and free lists,
 * both singly linked through a_next.
 */
static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;

/* Table of MAX_SWAP_VNODES vnode pointers, filled in by swapfs_getvp(). */
static struct vnode **swap_vnodes;	/* ptr's to swap vnodes */

/* Registers the memory-configuration callbacks; defined later in the file. */
static void swap_init_mem_config(void);

/*
 * Snapshot of the three tunables taken by swapfs_recalc_save_initial();
 * swapfs_recalc() starts from these values on every recalculation.
 */
static pgcnt_t initial_swapfs_desfree;
static pgcnt_t initial_swapfs_minfree;
static pgcnt_t initial_swapfs_reserve;

/* vfs_sync operation for swapfs; installed by swapinit(). */
static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
81
82static void
83swapfs_recalc_save_initial(void)
84{
85	initial_swapfs_desfree = swapfs_desfree;
86	initial_swapfs_minfree = swapfs_minfree;
87	initial_swapfs_reserve = swapfs_reserve;
88}
89
90static int
91swapfs_recalc(pgcnt_t pgs)
92{
93	pgcnt_t new_swapfs_desfree;
94	pgcnt_t new_swapfs_minfree;
95	pgcnt_t new_swapfs_reserve;
96
97	new_swapfs_desfree = initial_swapfs_desfree;
98	new_swapfs_minfree = initial_swapfs_minfree;
99	new_swapfs_reserve = initial_swapfs_reserve;
100
101	if (new_swapfs_desfree == 0)
102		new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
103
104	if (new_swapfs_minfree == 0) {
105		/*
106		 * We set this lower than we'd like here, 2Mb, because we
107		 * always boot on swapfs. It's up to a safer value,
108		 * swapfs_desfree, when/if we add physical swap devices
109		 * in swapadd(). Users who want to change the amount of
110		 * memory that can be used as swap space should do so by
111		 * setting swapfs_desfree at boot time, not swapfs_minfree.
112		 * However, swapfs_minfree is tunable by install as a
113		 * workaround for bugid 1147463.
114		 */
115		new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
116	}
117
118	/*
119	 * priv processes can reserve memory as swap as long as availrmem
120	 * remains greater than swapfs_minfree; in the case of non-priv
121	 * processes, memory can be reserved as swap only if availrmem
122	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
123	 * swapfs_reserve amount of memswap is not available to non-priv
124	 * processes. This protects daemons such as automounter dying
125	 * as a result of application processes eating away almost entire
126	 * membased swap. This safeguard becomes useless if apps are run
127	 * with root access.
128	 *
129	 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
130	 * is greater up to the limit of 128 MB.
131	 */
132	if (new_swapfs_reserve == 0)
133		new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
134		    MAX(btopr(4 * 1024 * 1024), pgs >> 7));
135
136	/* Test basic numeric viability. */
137	if (new_swapfs_minfree > pgs)
138		return (0);
139
140	/* Equivalent test to anon_resvmem() check. */
141	if (availrmem < new_swapfs_minfree) {
142		/*
143		 * If ism pages are being used, then there must be agreement
144		 * between these two policies.
145		 */
146		if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
147			new_swapfs_minfree = segspt_minfree;
148		} else {
149			return (0);
150		}
151	}
152
153	swapfs_desfree = new_swapfs_desfree;
154	swapfs_minfree = new_swapfs_minfree;
155	swapfs_reserve = new_swapfs_reserve;
156
157	return (1);
158}
159
160/*ARGSUSED1*/
161int
162swapinit(int fstype, char *name)
163{							/* reserve for mp */
164	ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
165	int i, error;
166
167	static const fs_operation_def_t swap_vfsops[] = {
168		VFSNAME_SYNC, { .vfs_sync = swap_sync },
169		NULL, NULL
170	};
171
172	SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
173	mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);
174
175	swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
176	    KM_SLEEP);
177
178	swapfs_recalc_save_initial();
179	if (!swapfs_recalc(physmem))
180		cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
181		    swapfs_minfree, physmem);
182
183	/*
184	 * Arrange for a callback on memory size change.
185	 */
186	swap_init_mem_config();
187
188	sw_ar = (struct async_reqs *)
189	    kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);
190
191	error = vfs_setfsops(fstype, swap_vfsops, NULL);
192	if (error != 0) {
193		cmn_err(CE_WARN, "swapinit: bad vfs ops template");
194		return (error);
195	}
196
197	error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
198	if (error != 0) {
199		(void) vfs_freevfsops_by_type(fstype);
200		cmn_err(CE_WARN, "swapinit: bad vnode ops template");
201		return (error);
202	}
203	sw_freelist = sw_ar;
204	for (i = 0; i < sw_freelist_size - 1; i++)
205		sw_ar[i].a_next = &sw_ar[i + 1];
206
207	return (0);
208}
209
210/*
211 * Get a swapfs vnode corresponding to the specified identifier.
212 */
213struct vnode *
214swapfs_getvp(ulong_t vidx)
215{
216	struct vnode *vp;
217
218	vp = swap_vnodes[vidx];
219	if (vp) {
220		return (vp);
221	}
222
223	mutex_enter(&swapfs_lock);
224	vp = swap_vnodes[vidx];
225	if (vp == NULL) {
226		vp = vn_alloc(KM_SLEEP);
227		vn_setops(vp, swap_vnodeops);
228		vp->v_type = VREG;
229		vp->v_flag |= (VISSWAP|VISSWAPFS);
230		swap_vnodes[vidx] = vp;
231		swapfs_vpcount++;
232	}
233	mutex_exit(&swapfs_lock);
234	return (vp);
235}
236
/*
 * Not referenced anywhere in this file; presumably read or written
 * elsewhere in swapfs -- TODO confirm its purpose.
 */
int swap_lo;
238
239/*ARGSUSED*/
240static int
241swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
242{
243	struct vnode *vp;
244	int i;
245
246	if (!(flag & SYNC_ALL))
247		return (1);
248
249	/*
250	 * assumes that we are the only one left to access this so that
251	 * no need to use swapfs_lock (since it's staticly defined)
252	 */
253	for (i = 0; i < MAX_SWAP_VNODES; i++) {
254		vp = swap_vnodes[i];
255		if (vp) {
256			VN_HOLD(vp);
257			(void) VOP_PUTPAGE(vp, (offset_t)0, 0,
258			    (B_ASYNC | B_FREE), kcred, NULL);
259			VN_RELE(vp);
260		}
261	}
262	return (0);
263}
264
/*
 * Defined outside this file.  Adjusted by PAGESIZE under swapfs_lock each
 * time a request is added to or removed from the pending list.
 */
extern int sw_pending_size;
266
267/*
268 * Take an async request off the pending queue
269 */
270struct async_reqs *
271sw_getreq()
272{
273	struct async_reqs *arg;
274
275	mutex_enter(&swapfs_lock);
276	arg = sw_pendlist;
277	if (arg) {
278		sw_pendlist = arg->a_next;
279		arg->a_next = NULL;
280		sw_pending_size -= PAGESIZE;
281	}
282	ASSERT(sw_pending_size >= 0);
283	mutex_exit(&swapfs_lock);
284	return (arg);
285}
286
287/*
288 * Put an async request on the pending queue
289 */
290void
291sw_putreq(struct async_reqs *arg)
292{
293	/* Hold onto it */
294	VN_HOLD(arg->a_vp);
295
296	mutex_enter(&swapfs_lock);
297	arg->a_next = sw_pendlist;
298	sw_pendlist = arg;
299	sw_pending_size += PAGESIZE;
300	mutex_exit(&swapfs_lock);
301}
302
303/*
304 * Put an async request back on the pending queue
305 */
306void
307sw_putbackreq(struct async_reqs *arg)
308{
309	mutex_enter(&swapfs_lock);
310	arg->a_next = sw_pendlist;
311	sw_pendlist = arg;
312	sw_pending_size += PAGESIZE;
313	mutex_exit(&swapfs_lock);
314}
315
316/*
317 * Take an async request structure off the free list
318 */
319struct async_reqs *
320sw_getfree()
321{
322	struct async_reqs *arg;
323
324	mutex_enter(&swapfs_lock);
325	arg = sw_freelist;
326	if (arg) {
327		sw_freelist = arg->a_next;
328		arg->a_next = NULL;
329	}
330	mutex_exit(&swapfs_lock);
331	return (arg);
332}
333
334/*
335 * Put an async request structure on the free list
336 */
337void
338sw_putfree(struct async_reqs *arg)
339{
340	/* Release our hold - should have locked the page by now */
341	VN_RELE(arg->a_vp);
342
343	mutex_enter(&swapfs_lock);
344	arg->a_next = sw_freelist;
345	sw_freelist = arg;
346	mutex_exit(&swapfs_lock);
347}
348
/*
 * Running total of the page counts passed to swap_mem_config_pre_del()
 * whose matching swap_mem_config_post_del() has not yet run.  It is
 * subtracted from physmem whenever the tunables are recalculated.
 */
static pgcnt_t swapfs_pending_delete;
350
351/*ARGSUSED*/
352static void
353swap_mem_config_post_add(
354	void *arg,
355	pgcnt_t delta_swaps)
356{
357	(void) swapfs_recalc(physmem - swapfs_pending_delete);
358}
359
360/*ARGSUSED*/
361static int
362swap_mem_config_pre_del(
363	void *arg,
364	pgcnt_t delta_swaps)
365{
366	pgcnt_t nv;
367
368	nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
369	if (!swapfs_recalc(physmem - nv)) {
370		/*
371		 * Tidy-up is done by the call to post_del which
372		 * is always made.
373		 */
374		cmn_err(CE_NOTE, "Memory operation refused to ensure system "
375		    "doesn't deadlock due to excessive consumption by swapfs.");
376		return (EBUSY);
377	}
378	return (0);
379}
380
381/*ARGSUSED*/
382static void
383swap_mem_config_post_del(
384	void *arg,
385	pgcnt_t delta_swaps,
386	int cancelled)
387{
388	pgcnt_t nv;
389
390	nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
391	(void) swapfs_recalc(physmem - nv);
392}
393
/*
 * Callback vector that swap_init_mem_config() passes to
 * kphysm_setup_func_register(): the version constant followed by the
 * post-add, pre-delete and post-delete handlers defined above.
 */
static kphysm_setup_vector_t swap_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	swap_mem_config_post_add,
	swap_mem_config_pre_del,
	swap_mem_config_post_del,
};
400
401static void
402swap_init_mem_config(void)
403{
404	int ret;
405
406	ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
407	ASSERT(ret == 0);
408}
409