swap_pager.c (92029) swap_pager.c (92654)
1/*
2 * Copyright (c) 1998 Matthew Dillon,
3 * Copyright (c) 1994 John S. Dyson
4 * Copyright (c) 1990 University of Utah.
5 * Copyright (c) 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * New Swap System
41 * Matthew Dillon
42 *
43 * Radix Bitmap 'blists'.
44 *
45 * - The new swapper uses the new radix bitmap code. This should scale
46 * to arbitrarily small or arbitrarily large swap spaces and an almost
47 * arbitrary degree of fragmentation.
48 *
49 * Features:
50 *
51 * - on the fly reallocation of swap during putpages. The new system
52 * does not try to keep previously allocated swap blocks for dirty
53 * pages.
54 *
55 * - on the fly deallocation of swap
56 *
57 * - No more garbage collection required. Unnecessarily allocated swap
58 * blocks only exist for dirty vm_page_t's now and these are already
59 * cycled (in a high-load system) by the pager. We also do on-the-fly
60 * removal of invalidated swap blocks when a page is destroyed
61 * or renamed.
62 *
63 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
64 *
65 * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
66 *
67 * $FreeBSD: head/sys/vm/swap_pager.c 92029 2002-03-10 21:52:48Z eivind $
67 * $FreeBSD: head/sys/vm/swap_pager.c 92654 2002-03-19 09:11:49Z jeff $
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/conf.h>
73#include <sys/kernel.h>
74#include <sys/proc.h>
75#include <sys/bio.h>
76#include <sys/buf.h>
77#include <sys/vnode.h>
78#include <sys/malloc.h>
79#include <sys/vmmeter.h>
80#include <sys/sysctl.h>
81#include <sys/blist.h>
82#include <sys/lock.h>
83#include <sys/sx.h>
84#include <sys/vmmeter.h>
85
86#ifndef MAX_PAGEOUT_CLUSTER
87#define MAX_PAGEOUT_CLUSTER 16
88#endif
89
90#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
91
92#include "opt_swap.h"
93#include <vm/vm.h>
94#include <vm/pmap.h>
95#include <vm/vm_map.h>
96#include <vm/vm_kern.h>
97#include <vm/vm_object.h>
98#include <vm/vm_page.h>
99#include <vm/vm_pager.h>
100#include <vm/vm_pageout.h>
101#include <vm/vm_zone.h>
102#include <vm/swap_pager.h>
103#include <vm/vm_extern.h>
104
105#define SWM_FREE 0x02 /* free, period */
106#define SWM_POP 0x04 /* pop out */
107
108/*
109 * vm_swap_size is in page-sized chunks now. It was DEV_BSIZE'd chunks
110 * in the old system.
111 */
112extern int vm_swap_size; /* number of free swap blocks, in pages */
113
114int swap_pager_full; /* swap space exhaustion (task killing) */
115static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
116static int nsw_rcount; /* free read buffers */
117static int nsw_wcount_sync; /* limit write buffers / synchronous */
118static int nsw_wcount_async; /* limit write buffers / asynchronous */
119static int nsw_wcount_async_max;/* assigned maximum */
120static int nsw_cluster_max; /* maximum VOP I/O allowed */
121
122struct blist *swapblist;
123static struct swblock **swhash;
124static int swhash_mask;
125static int swap_async_max = 4; /* maximum in-progress async I/O's */
126static struct sx sw_alloc_sx;
127
128/* from vm_swap.c */
129extern struct vnode *swapdev_vp;
130extern struct swdevt *swdevt;
131extern int nswdev;
132
133SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
134 CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
135
136#define BLK2DEVIDX(blk) (nswdev > 1 ? blk / dmmax % nswdev : 0)
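/*
 * Example (with the default MAX_PAGEOUT_CLUSTER of 16, dmmax is 32):
 * with nswdev == 2, swap block 100 maps to device (100 / 32) % 2 == 1
 * and block 140 maps to device (140 / 32) % 2 == 0, so consecutive
 * dmmax-sized runs of blocks alternate between the swap devices.
 */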
137
138/*
139 * "named" and "unnamed" anon region objects. Try to reduce the overhead
140 * of searching a named list by hashing it just a little.
141 */
142
143#define NOBJLISTS 8
144
145#define NOBJLIST(handle) \
146 (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
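/*
 * For example, a (hypothetical) handle of 0x0804f230 selects list
 * (0x0804f230 >> 4) & 7 == 3; the shift discards the low bits that
 * tend to be identical for aligned, pointer-sized handles.
 */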
147
148static struct mtx sw_alloc_mtx; /* protect list manipulation */
149static struct pagerlst swap_pager_object_list[NOBJLISTS];
150struct pagerlst swap_pager_un_object_list;
151vm_zone_t swap_zone;
152
153/*
154 * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
155 * calls hooked from other parts of the VM system and do not appear here.
156 * (see vm/swap_pager.h).
157 */
158static vm_object_t
159 swap_pager_alloc __P((void *handle, vm_ooffset_t size,
160 vm_prot_t prot, vm_ooffset_t offset));
161static void swap_pager_dealloc __P((vm_object_t object));
162static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
163static void swap_pager_init __P((void));
164static void swap_pager_unswapped __P((vm_page_t));
165static void swap_pager_strategy __P((vm_object_t, struct bio *));
166
167struct pagerops swappagerops = {
168 swap_pager_init, /* early system initialization of pager */
169 swap_pager_alloc, /* allocate an OBJT_SWAP object */
170 swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
171 swap_pager_getpages, /* pagein */
172 swap_pager_putpages, /* pageout */
173 swap_pager_haspage, /* get backing store status for page */
174 swap_pager_unswapped, /* remove swap related to page */
175 swap_pager_strategy /* pager strategy call */
176};
177
178static struct buf *getchainbuf(struct bio *bp, struct vnode *vp, int flags);
179static void flushchainbuf(struct buf *nbp);
180static void waitchainbuf(struct bio *bp, int count, int done);
181
182/*
183 * dmmax is in page-sized chunks with the new swap system. It was
184 * dev-bsized chunks in the old. dmmax is always a power of 2.
185 *
186 * swap_*() routines are externally accessible. swp_*() routines are
187 * internal.
188 */
189int dmmax;
190static int dmmax_mask;
191int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
192int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
193
194SYSCTL_INT(_vm, OID_AUTO, dmmax,
195 CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
196
197static __inline void swp_sizecheck __P((void));
198static void swp_pager_sync_iodone __P((struct buf *bp));
199static void swp_pager_async_iodone __P((struct buf *bp));
200
201/*
202 * Swap bitmap functions
203 */
204static __inline void swp_pager_freeswapspace __P((daddr_t blk, int npages));
205static __inline daddr_t swp_pager_getswapspace __P((int npages));
206
207/*
208 * Metadata functions
209 */
210static void swp_pager_meta_build __P((vm_object_t, vm_pindex_t, daddr_t));
211static void swp_pager_meta_free __P((vm_object_t, vm_pindex_t, daddr_t));
212static void swp_pager_meta_free_all __P((vm_object_t));
213static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int));
214
215/*
216 * SWP_SIZECHECK() - update swap_pager_full indication
217 *
218 * update the swap_pager_almost_full indication and warn when we are
219 * about to run out of swap space, using lowat/hiwat hysteresis.
220 *
221 * Clear swap_pager_full ( task killing ) indication when lowat is met.
222 *
223 * No restrictions on call
224 * This routine may not block.
225 * This routine must be called at splvm()
226 */
227static __inline void
228swp_sizecheck()
229{
230 GIANT_REQUIRED;
231
232 if (vm_swap_size < nswap_lowat) {
233 if (swap_pager_almost_full == 0) {
234 printf("swap_pager: out of swap space\n");
235 swap_pager_almost_full = 1;
236 }
237 } else {
238 swap_pager_full = 0;
239 if (vm_swap_size > nswap_hiwat)
240 swap_pager_almost_full = 0;
241 }
242}
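/*
 * Illustration of the hysteresis above: with the defaults
 * nswap_lowat = 128 and nswap_hiwat = 512, the "out of swap space"
 * warning fires once free swap drops below 128 pages and is only
 * re-armed after free swap climbs back above 512 pages.
 */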
243
244/*
245 * SWAP_PAGER_INIT() - initialize the swap pager!
246 *
247 * Expected to be started from system init. NOTE: This code is run
248 * before much else so be careful what you depend on. Most of the VM
249 * system has yet to be initialized at this point.
250 */
251static void
252swap_pager_init()
253{
254 /*
255 * Initialize object lists
256 */
257 int i;
258
259 for (i = 0; i < NOBJLISTS; ++i)
260 TAILQ_INIT(&swap_pager_object_list[i]);
261 TAILQ_INIT(&swap_pager_un_object_list);
262 mtx_init(&sw_alloc_mtx, "swap_pager list", MTX_DEF);
263
264 /*
265 * Device Stripe, in PAGE_SIZE'd blocks
266 */
267 dmmax = SWB_NPAGES * 2;
268 dmmax_mask = ~(dmmax - 1);
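	/*
	 * With the default MAX_PAGEOUT_CLUSTER of 16 this yields a stripe
	 * of dmmax = 32 pages (128K with 4K pages), and two swap blocks
	 * lie in the same stripe iff ((blk1 ^ blk2) & dmmax_mask) == 0.
	 */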
269}
270
271/*
272 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
273 *
274 * Expected to be started from pageout process once, prior to entering
275 * its main loop.
276 */
277void
278swap_pager_swap_init()
279{
280 int n, n2;
281
282 /*
283 * Number of in-transit swap bp operations. Don't
284 * exhaust the pbufs completely. Make sure we
285 * initialize workable values (0 will work for hysteresis
286 * but it isn't very efficient).
287 *
288 * The nsw_cluster_max is constrained by the bp->b_pages[]
289 * array (MAXPHYS/PAGE_SIZE) and our locally defined
290 * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
291 * constrained by the swap device interleave stripe size.
292 *
293 * Currently we hardwire nsw_wcount_async to 4. This limit is
294 * designed to prevent other I/O from having high latencies due to
295 * our pageout I/O. The value 4 works well for one or two active swap
296 * devices but is probably a little low if you have more. Even so,
297 * a higher value would probably generate only a limited improvement
298 * with three or four active swap devices since the system does not
299 * typically have to pageout at extreme bandwidths. We will want
 300 	 * at least 2 per swap device, and 4 is a pretty good value if you
301 * have one NFS swap device due to the command/ack latency over NFS.
302 * So it all works out pretty well.
303 */
304 nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
305
306 mtx_lock(&pbuf_mtx);
307 nsw_rcount = (nswbuf + 1) / 2;
308 nsw_wcount_sync = (nswbuf + 3) / 4;
309 nsw_wcount_async = 4;
310 nsw_wcount_async_max = nsw_wcount_async;
311 mtx_unlock(&pbuf_mtx);
312
313 /*
314 * Initialize our zone. Right now I'm just guessing on the number
315 * we need based on the number of pages in the system. Each swblock
316 * can hold 16 pages, so this is probably overkill. This reservation
317 * is typically limited to around 70MB by default.
318 */
319 n = cnt.v_page_count;
320 if (maxswzone && n > maxswzone / sizeof(struct swblock))
321 n = maxswzone / sizeof(struct swblock);
322 n2 = n;
 323 	swap_zone = zinit(
 324 	    "SWAPMETA",
 325 	    sizeof(struct swblock),
 326 	    n,
 327 	    ZONE_INTERRUPT,
 328 	    1
 329 	);
 330 	do {
 331 		if (swap_zone != NULL)
 323 	do {
 324 		swap_zone = zinit(
 325 		    "SWAPMETA",
 326 		    sizeof(struct swblock),
 327 		    n,
 328 		    ZONE_INTERRUPT,
 329 		    1
 330 		);
 331 		if (uma_zone_set_obj(swap_zone, NULL, n))
332 break;
333 /*
334 * if the allocation failed, try a zone two thirds the
335 * size of the previous attempt.
336 */
337 n -= ((n + 2) / 3);
338 } while (n > 0);
339 if (swap_zone == NULL)
340 panic("failed to zinit swap_zone.");
341 if (n2 != n)
342 printf("Swap zone entries reduced from %d to %d.\n", n2, n);
343 n2 = n;
344
345 /*
346 * Initialize our meta-data hash table. The swapper does not need to
347 * be quite as efficient as the VM system, so we do not use an
348 * oversized hash table.
349 *
350 * n: size of hash table, must be power of 2
351 * swhash_mask: hash table index mask
352 */
353 for (n = 1; n < n2 / 8; n *= 2)
354 ;
355 swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
356 swhash_mask = n - 1;
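	/*
	 * For example, with n2 = 100000 swblock entries the loop above
	 * stops at n = 16384 (the first power of 2 >= n2 / 8 = 12500),
	 * giving a 16384-bucket table and swhash_mask = 0x3fff.
	 */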
357}
358
359/*
360 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
361 * its metadata structures.
362 *
363 * This routine is called from the mmap and fork code to create a new
364 * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
365 * and then converting it with swp_pager_meta_build().
366 *
367 * This routine may block in vm_object_allocate() and create a named
368 * object lookup race, so we must interlock. We must also run at
369 * splvm() for the object lookup to handle races with interrupts, but
370 * we do not have to maintain splvm() in between the lookup and the
371 * add because (I believe) it is not possible to attempt to create
372 * a new swap object w/handle when a default object with that handle
373 * already exists.
374 */
375static vm_object_t
376swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
377 vm_ooffset_t offset)
378{
379 vm_object_t object;
380
381 GIANT_REQUIRED;
382
383 if (handle) {
384 /*
385 * Reference existing named region or allocate new one. There
386 * should not be a race here against swp_pager_meta_build()
387 * as called from vm_page_remove() in regards to the lookup
388 * of the handle.
389 */
390 sx_xlock(&sw_alloc_sx);
391 object = vm_pager_object_lookup(NOBJLIST(handle), handle);
392
393 if (object != NULL) {
394 vm_object_reference(object);
395 } else {
396 object = vm_object_allocate(OBJT_DEFAULT,
397 OFF_TO_IDX(offset + PAGE_MASK + size));
398 object->handle = handle;
399
400 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
401 }
402 sx_xunlock(&sw_alloc_sx);
403 } else {
404 object = vm_object_allocate(OBJT_DEFAULT,
405 OFF_TO_IDX(offset + PAGE_MASK + size));
406
407 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
408 }
409
410 return (object);
411}
412
413/*
414 * SWAP_PAGER_DEALLOC() - remove swap metadata from object
415 *
416 * The swap backing for the object is destroyed. The code is
417 * designed such that we can reinstantiate it later, but this
418 * routine is typically called only when the entire object is
419 * about to be destroyed.
420 *
421 * This routine may block, but no longer does.
422 *
423 * The object must be locked or unreferenceable.
424 */
425static void
426swap_pager_dealloc(object)
427 vm_object_t object;
428{
429 int s;
430
431 GIANT_REQUIRED;
432
433 /*
434 * Remove from list right away so lookups will fail if we block for
435 * pageout completion.
436 */
437 mtx_lock(&sw_alloc_mtx);
438 if (object->handle == NULL) {
439 TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
440 } else {
441 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
442 }
443 mtx_unlock(&sw_alloc_mtx);
444
445 vm_object_pip_wait(object, "swpdea");
446
447 /*
448 * Free all remaining metadata. We only bother to free it from
449 * the swap meta data. We do not attempt to free swapblk's still
450 * associated with vm_page_t's for this object. We do not care
451 * if paging is still in progress on some objects.
452 */
453 s = splvm();
454 swp_pager_meta_free_all(object);
455 splx(s);
456}
457
458/************************************************************************
459 * SWAP PAGER BITMAP ROUTINES *
460 ************************************************************************/
461
462/*
463 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
464 *
465 * Allocate swap for the requested number of pages. The starting
466 * swap block number (a page index) is returned or SWAPBLK_NONE
467 * if the allocation failed.
468 *
469 * Also has the side effect of advising that somebody made a mistake
470 * when they configured swap and didn't configure enough.
471 *
472 * Must be called at splvm() to avoid races with bitmap frees from
473 * vm_page_remove() aka swap_pager_page_removed().
474 *
475 * This routine may not block
476 * This routine must be called at splvm().
477 */
478static __inline daddr_t
479swp_pager_getswapspace(npages)
480 int npages;
481{
482 daddr_t blk;
483
484 GIANT_REQUIRED;
485
486 if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
487 if (swap_pager_full != 2) {
488 printf("swap_pager_getswapspace: failed\n");
489 swap_pager_full = 2;
490 swap_pager_almost_full = 1;
491 }
492 } else {
493 vm_swap_size -= npages;
494 /* per-swap area stats */
495 swdevt[BLK2DEVIDX(blk)].sw_used += npages;
496 swp_sizecheck();
497 }
498 return (blk);
499}
500
501/*
502 * SWP_PAGER_FREESWAPSPACE() - free raw swap space
503 *
504 * This routine returns the specified swap blocks back to the bitmap.
505 *
506 * Note: This routine may not block (it could in the old swap code),
507 * and through the use of the new blist routines it does not block.
508 *
509 * We must be called at splvm() to avoid races with bitmap frees from
510 * vm_page_remove() aka swap_pager_page_removed().
511 *
512 * This routine may not block
513 * This routine must be called at splvm().
514 */
515static __inline void
516swp_pager_freeswapspace(blk, npages)
517 daddr_t blk;
518 int npages;
519{
520 GIANT_REQUIRED;
521
522 blist_free(swapblist, blk, npages);
523 vm_swap_size += npages;
524 /* per-swap area stats */
525 swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
526 swp_sizecheck();
527}
528
529/*
530 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
531 * range within an object.
532 *
533 * This is a globally accessible routine.
534 *
535 * This routine removes swapblk assignments from swap metadata.
536 *
537 * The external callers of this routine typically have already destroyed
538 * or renamed vm_page_t's associated with this range in the object so
539 * we should be ok.
540 *
541 * This routine may be called at any spl. We up our spl to splvm temporarily
542 * in order to perform the metadata removal.
543 */
544void
545swap_pager_freespace(object, start, size)
546 vm_object_t object;
547 vm_pindex_t start;
548 vm_size_t size;
549{
550 int s = splvm();
551
552 GIANT_REQUIRED;
553 swp_pager_meta_free(object, start, size);
554 splx(s);
555}
556
557/*
558 * SWAP_PAGER_RESERVE() - reserve swap blocks in object
559 *
560 * Assigns swap blocks to the specified range within the object. The
 561 	 *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
562 *
563 * Returns 0 on success, -1 on failure.
564 */
565int
566swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
567{
568 int s;
569 int n = 0;
570 daddr_t blk = SWAPBLK_NONE;
571 vm_pindex_t beg = start; /* save start index */
572
573 s = splvm();
574 while (size) {
575 if (n == 0) {
576 n = BLIST_MAX_ALLOC;
577 while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
578 n >>= 1;
579 if (n == 0) {
580 swp_pager_meta_free(object, beg, start - beg);
581 splx(s);
582 return (-1);
583 }
584 }
585 }
586 swp_pager_meta_build(object, start, blk);
587 --size;
588 ++start;
589 ++blk;
590 --n;
591 }
592 swp_pager_meta_free(object, start, n);
593 splx(s);
594 return (0);
595}
596
597/*
598 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
599 * and destroy the source.
600 *
601 * Copy any valid swapblks from the source to the destination. In
602 * cases where both the source and destination have a valid swapblk,
603 * we keep the destination's.
604 *
605 * This routine is allowed to block. It may block allocating metadata
606 * indirectly through swp_pager_meta_build() or if paging is still in
607 * progress on the source.
608 *
609 * This routine can be called at any spl
610 *
611 * XXX vm_page_collapse() kinda expects us not to block because we
612 * supposedly do not need to allocate memory, but for the moment we
613 * *may* have to get a little memory from the zone allocator, but
614 * it is taken from the interrupt memory. We should be ok.
615 *
616 * The source object contains no vm_page_t's (which is just as well)
617 *
618 * The source object is of type OBJT_SWAP.
619 *
620 * The source and destination objects must be locked or
621 * inaccessible (XXX are they ?)
622 */
623void
624swap_pager_copy(srcobject, dstobject, offset, destroysource)
625 vm_object_t srcobject;
626 vm_object_t dstobject;
627 vm_pindex_t offset;
628 int destroysource;
629{
630 vm_pindex_t i;
631 int s;
632
633 GIANT_REQUIRED;
634
635 s = splvm();
636 /*
637 * If destroysource is set, we remove the source object from the
638 * swap_pager internal queue now.
639 */
640 if (destroysource) {
641 mtx_lock(&sw_alloc_mtx);
642 if (srcobject->handle == NULL) {
643 TAILQ_REMOVE(
644 &swap_pager_un_object_list,
645 srcobject,
646 pager_object_list
647 );
648 } else {
649 TAILQ_REMOVE(
650 NOBJLIST(srcobject->handle),
651 srcobject,
652 pager_object_list
653 );
654 }
655 mtx_unlock(&sw_alloc_mtx);
656 }
657
658 /*
659 * transfer source to destination.
660 */
661 for (i = 0; i < dstobject->size; ++i) {
662 daddr_t dstaddr;
663
664 /*
665 * Locate (without changing) the swapblk on the destination,
666 * unless it is invalid in which case free it silently, or
667 * if the destination is a resident page, in which case the
668 * source is thrown away.
669 */
670 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
671
672 if (dstaddr == SWAPBLK_NONE) {
673 /*
674 * Destination has no swapblk and is not resident,
675 * copy source.
676 */
677 daddr_t srcaddr;
678
679 srcaddr = swp_pager_meta_ctl(
680 srcobject,
681 i + offset,
682 SWM_POP
683 );
684
685 if (srcaddr != SWAPBLK_NONE)
686 swp_pager_meta_build(dstobject, i, srcaddr);
687 } else {
688 /*
689 * Destination has valid swapblk or it is represented
690 * by a resident page. We destroy the sourceblock.
691 */
692
693 swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
694 }
695 }
696
697 /*
698 * Free left over swap blocks in source.
699 *
 700 	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
701 * double-remove the object from the swap queues.
702 */
703 if (destroysource) {
704 swp_pager_meta_free_all(srcobject);
705 /*
706 * Reverting the type is not necessary, the caller is going
707 * to destroy srcobject directly, but I'm doing it here
708 * for consistency since we've removed the object from its
709 * queues.
710 */
711 srcobject->type = OBJT_DEFAULT;
712 }
713 splx(s);
714}
715
716/*
717 * SWAP_PAGER_HASPAGE() - determine if we have good backing store for
718 * the requested page.
719 *
720 * We determine whether good backing store exists for the requested
721 * page and return TRUE if it does, FALSE if it doesn't.
722 *
723 * If TRUE, we also try to determine how much valid, contiguous backing
724 * store exists before and after the requested page within a reasonable
725 * distance. We do not try to restrict it to the swap device stripe
726 * (that is handled in getpages/putpages). It probably isn't worth
727 * doing here.
728 */
729boolean_t
730swap_pager_haspage(object, pindex, before, after)
731 vm_object_t object;
732 vm_pindex_t pindex;
733 int *before;
734 int *after;
735{
736 daddr_t blk0;
737 int s;
738
739 /*
740 * do we have good backing store at the requested index ?
741 */
742 s = splvm();
743 blk0 = swp_pager_meta_ctl(object, pindex, 0);
744
745 if (blk0 == SWAPBLK_NONE) {
746 splx(s);
747 if (before)
748 *before = 0;
749 if (after)
750 *after = 0;
751 return (FALSE);
752 }
753
754 /*
755 * find backwards-looking contiguous good backing store
756 */
757 if (before != NULL) {
758 int i;
759
760 for (i = 1; i < (SWB_NPAGES/2); ++i) {
761 daddr_t blk;
762
763 if (i > pindex)
764 break;
765 blk = swp_pager_meta_ctl(object, pindex - i, 0);
766 if (blk != blk0 - i)
767 break;
768 }
769 *before = (i - 1);
770 }
771
772 /*
773 * find forward-looking contiguous good backing store
774 */
775 if (after != NULL) {
776 int i;
777
778 for (i = 1; i < (SWB_NPAGES/2); ++i) {
779 daddr_t blk;
780
781 blk = swp_pager_meta_ctl(object, pindex + i, 0);
782 if (blk != blk0 + i)
783 break;
784 }
785 *after = (i - 1);
786 }
787 splx(s);
788 return (TRUE);
789}
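/*
 * Example of the scans above: if the page at pindex is backed by swap
 * block 1000 and blocks 998, 999 and 1001 back the neighboring
 * pindexes (with no larger contiguous run), the routine returns TRUE
 * with *before = 2 and *after = 1.
 */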
790
791/*
792 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
793 *
794 * This removes any associated swap backing store, whether valid or
795 * not, from the page.
796 *
797 * This routine is typically called when a page is made dirty, at
798 * which point any associated swap can be freed. MADV_FREE also
799 * calls us in a special-case situation
800 *
801 * NOTE!!! If the page is clean and the swap was valid, the caller
802 * should make the page dirty before calling this routine. This routine
803 * does NOT change the m->dirty status of the page. Also: MADV_FREE
804 * depends on it.
805 *
806 * This routine may not block
807 * This routine must be called at splvm()
808 */
809static void
810swap_pager_unswapped(m)
811 vm_page_t m;
812{
813 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
814}
815
816/*
817 * SWAP_PAGER_STRATEGY() - read, write, free blocks
818 *
819 * This implements the vm_pager_strategy() interface to swap and allows
820 * other parts of the system to directly access swap as backing store
821 * through vm_objects of type OBJT_SWAP. This is intended to be a
822 * cacheless interface ( i.e. caching occurs at higher levels ).
823 * Therefore we do not maintain any resident pages. All I/O goes
824 * directly to and from the swap device.
825 *
826 * Note that b_blkno is scaled for PAGE_SIZE
827 *
828 * We currently attempt to run I/O synchronously or asynchronously as
 829 	 *	the caller requests.  This isn't perfect because we lose error
830 * sequencing when we run multiple ops in parallel to satisfy a request.
831 * But this is swap, so we let it all hang out.
832 */
833static void
834swap_pager_strategy(vm_object_t object, struct bio *bp)
835{
836 vm_pindex_t start;
837 int count;
838 int s;
839 char *data;
840 struct buf *nbp = NULL;
841
842 GIANT_REQUIRED;
843
844 /* XXX: KASSERT instead ? */
845 if (bp->bio_bcount & PAGE_MASK) {
846 biofinish(bp, NULL, EINVAL);
847 printf("swap_pager_strategy: bp %p blk %d size %d, not page bounded\n", bp, (int)bp->bio_pblkno, (int)bp->bio_bcount);
848 return;
849 }
850
851 /*
852 * Clear error indication, initialize page index, count, data pointer.
853 */
854 bp->bio_error = 0;
855 bp->bio_flags &= ~BIO_ERROR;
856 bp->bio_resid = bp->bio_bcount;
857 *(u_int *) &bp->bio_driver1 = 0;
858
859 start = bp->bio_pblkno;
860 count = howmany(bp->bio_bcount, PAGE_SIZE);
861 data = bp->bio_data;
862
863 s = splvm();
864
865 /*
866 * Deal with BIO_DELETE
867 */
868 if (bp->bio_cmd == BIO_DELETE) {
869 /*
870 * FREE PAGE(s) - destroy underlying swap that is no longer
871 * needed.
872 */
873 swp_pager_meta_free(object, start, count);
874 splx(s);
875 bp->bio_resid = 0;
876 biodone(bp);
877 return;
878 }
879
880 /*
881 * Execute read or write
882 */
883 while (count > 0) {
884 daddr_t blk;
885
886 /*
887 * Obtain block. If block not found and writing, allocate a
888 * new block and build it into the object.
889 */
890
891 blk = swp_pager_meta_ctl(object, start, 0);
892 if ((blk == SWAPBLK_NONE) && (bp->bio_cmd == BIO_WRITE)) {
893 blk = swp_pager_getswapspace(1);
894 if (blk == SWAPBLK_NONE) {
895 bp->bio_error = ENOMEM;
896 bp->bio_flags |= BIO_ERROR;
897 break;
898 }
899 swp_pager_meta_build(object, start, blk);
900 }
901
902 /*
903 * Do we have to flush our current collection? Yes if:
904 *
905 * - no swap block at this index
906 * - swap block is not contiguous
 907 	 *	- we cross a physical disk boundary in the
908 * stripe.
909 */
910 if (
911 nbp && (nbp->b_blkno + btoc(nbp->b_bcount) != blk ||
912 ((nbp->b_blkno ^ blk) & dmmax_mask)
913 )
914 ) {
915 splx(s);
916 if (bp->bio_cmd == BIO_READ) {
917 ++cnt.v_swapin;
918 cnt.v_swappgsin += btoc(nbp->b_bcount);
919 } else {
920 ++cnt.v_swapout;
921 cnt.v_swappgsout += btoc(nbp->b_bcount);
922 nbp->b_dirtyend = nbp->b_bcount;
923 }
924 flushchainbuf(nbp);
925 s = splvm();
926 nbp = NULL;
927 }
928
929 /*
930 * Add new swapblk to nbp, instantiating nbp if necessary.
931 * Zero-fill reads are able to take a shortcut.
932 */
933 if (blk == SWAPBLK_NONE) {
934 /*
935 * We can only get here if we are reading. Since
936 * we are at splvm() we can safely modify b_resid,
937 * even if chain ops are in progress.
938 */
939 bzero(data, PAGE_SIZE);
940 bp->bio_resid -= PAGE_SIZE;
941 } else {
942 if (nbp == NULL) {
943 nbp = getchainbuf(bp, swapdev_vp, B_ASYNC);
944 nbp->b_blkno = blk;
945 nbp->b_bcount = 0;
946 nbp->b_data = data;
947 }
948 nbp->b_bcount += PAGE_SIZE;
949 }
950 --count;
951 ++start;
952 data += PAGE_SIZE;
953 }
954
955 /*
956 * Flush out last buffer
957 */
958 splx(s);
959
960 if (nbp) {
961 if (nbp->b_iocmd == BIO_READ) {
962 ++cnt.v_swapin;
963 cnt.v_swappgsin += btoc(nbp->b_bcount);
964 } else {
965 ++cnt.v_swapout;
966 cnt.v_swappgsout += btoc(nbp->b_bcount);
967 nbp->b_dirtyend = nbp->b_bcount;
968 }
969 flushchainbuf(nbp);
970 /* nbp = NULL; */
971 }
972 /*
973 * Wait for completion.
974 */
975 waitchainbuf(bp, 0, 1);
976}
977
978/*
979 * SWAP_PAGER_GETPAGES() - bring pages in from swap
980 *
981 * Attempt to retrieve (m, count) pages from backing store, but make
982 * sure we retrieve at least m[reqpage]. We try to load in as large
983 * a chunk surrounding m[reqpage] as is contiguous in swap and which
984 * belongs to the same object.
985 *
986 * The code is designed for asynchronous operation and
987 * immediate-notification of 'reqpage' but tends not to be
988 * used that way. Please do not optimize-out this algorithmic
989 * feature, I intend to improve on it in the future.
990 *
991 * The parent has a single vm_object_pip_add() reference prior to
992 * calling us and we should return with the same.
993 *
994 * The parent has BUSY'd the pages. We should return with 'm'
995 * left busy, but the others adjusted.
996 */
997static int
998swap_pager_getpages(object, m, count, reqpage)
999 vm_object_t object;
1000 vm_page_t *m;
1001 int count, reqpage;
1002{
1003 struct buf *bp;
1004 vm_page_t mreq;
1005 int s;
1006 int i;
1007 int j;
1008 daddr_t blk;
1009 vm_offset_t kva;
1010 vm_pindex_t lastpindex;
1011
1012 GIANT_REQUIRED;
1013
1014 mreq = m[reqpage];
1015
1016 if (mreq->object != object) {
1017 panic("swap_pager_getpages: object mismatch %p/%p",
1018 object,
1019 mreq->object
1020 );
1021 }
1022 /*
1023 * Calculate range to retrieve. The pages have already been assigned
1024 * their swapblks. We require a *contiguous* range that falls entirely
1025 * within a single device stripe. If we do not supply it, bad things
1026 * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
1027 * loops are set up such that the case(s) are handled implicitly.
1028 *
1029 * The swp_*() calls must be made at splvm(). vm_page_free() does
1030 * not need to be, but it will go a little faster if it is.
1031 */
1032 s = splvm();
1033 blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1034
1035 for (i = reqpage - 1; i >= 0; --i) {
1036 daddr_t iblk;
1037
1038 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
1039 if (blk != iblk + (reqpage - i))
1040 break;
1041 if ((blk ^ iblk) & dmmax_mask)
1042 break;
1043 }
1044 ++i;
1045
1046 for (j = reqpage + 1; j < count; ++j) {
1047 daddr_t jblk;
1048
1049 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
1050 if (blk != jblk - (j - reqpage))
1051 break;
1052 if ((blk ^ jblk) & dmmax_mask)
1053 break;
1054 }
1055
1056 /*
1057 * free pages outside our collection range. Note: we never free
1058 * mreq, it must remain busy throughout.
1059 */
1060 {
1061 int k;
1062
1063 for (k = 0; k < i; ++k)
1064 vm_page_free(m[k]);
1065 for (k = j; k < count; ++k)
1066 vm_page_free(m[k]);
1067 }
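	/*
	 * For example, with count = 8 and reqpage = 4: if only pages
	 * 2..6 have contiguous swap blocks within one stripe, the scans
	 * above give i = 2 and j = 7, pages 0, 1 and 7 are freed here,
	 * and a single 5-page read is built below.
	 */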
1068 splx(s);
1069
1070
1071 /*
1072 * Return VM_PAGER_FAIL if we have nothing to do. Return mreq
1073 * still busy, but the others unbusied.
1074 */
1075 if (blk == SWAPBLK_NONE)
1076 return (VM_PAGER_FAIL);
1077
1078 /*
1079 * Get a swap buffer header to perform the IO
1080 */
1081 bp = getpbuf(&nsw_rcount);
1082 kva = (vm_offset_t) bp->b_data;
1083
1084 /*
1085 * map our page(s) into kva for input
1086 *
1087 * NOTE: B_PAGING is set by pbgetvp()
1088 */
1089 pmap_qenter(kva, m + i, j - i);
1090
1091 bp->b_iocmd = BIO_READ;
1092 bp->b_iodone = swp_pager_async_iodone;
1093 bp->b_rcred = crhold(thread0.td_ucred);
1094 bp->b_wcred = crhold(thread0.td_ucred);
1095 bp->b_data = (caddr_t) kva;
1096 bp->b_blkno = blk - (reqpage - i);
1097 bp->b_bcount = PAGE_SIZE * (j - i);
1098 bp->b_bufsize = PAGE_SIZE * (j - i);
1099 bp->b_pager.pg_reqpage = reqpage - i;
1100
1101 {
1102 int k;
1103
1104 for (k = i; k < j; ++k) {
1105 bp->b_pages[k - i] = m[k];
1106 vm_page_flag_set(m[k], PG_SWAPINPROG);
1107 }
1108 }
1109 bp->b_npages = j - i;
1110
1111 pbgetvp(swapdev_vp, bp);
1112
1113 cnt.v_swapin++;
1114 cnt.v_swappgsin += bp->b_npages;
1115
1116 /*
1117 * We still hold the lock on mreq, and our automatic completion routine
1118 * does not remove it.
1119 */
1120 vm_object_pip_add(mreq->object, bp->b_npages);
1121 lastpindex = m[j-1]->pindex;
1122
1123 /*
1124 * perform the I/O. NOTE!!! bp cannot be considered valid after
1125 * this point because we automatically release it on completion.
1126 * Instead, we look at the one page we are interested in which we
1127 * still hold a lock on even through the I/O completion.
1128 *
1129 * The other pages in our m[] array are also released on completion,
1130 * so we cannot assume they are valid anymore either.
1131 *
1132 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1133 */
1134 BUF_KERNPROC(bp);
1135 BUF_STRATEGY(bp);
1136
1137 /*
1138 * wait for the page we want to complete. PG_SWAPINPROG is always
1139 * cleared on completion. If an I/O error occurs, SWAPBLK_NONE
1140 * is set in the meta-data.
1141 */
1142 s = splvm();
1143 while ((mreq->flags & PG_SWAPINPROG) != 0) {
1144 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
1145 cnt.v_intrans++;
1146 if (tsleep(mreq, PSWP, "swread", hz*20)) {
1147 printf(
1148 "swap_pager: indefinite wait buffer: device:"
1149 " %s, blkno: %ld, size: %ld\n",
1150 devtoname(bp->b_dev), (long)bp->b_blkno,
1151 bp->b_bcount
1152 );
1153 }
1154 }
1155 splx(s);
1156
1157 /*
1158 * mreq is left busied after completion, but all the other pages
1159 * are freed. If we had an unrecoverable read error the page will
1160 * not be valid.
1161 */
1162 if (mreq->valid != VM_PAGE_BITS_ALL) {
1163 return (VM_PAGER_ERROR);
1164 } else {
1165 return (VM_PAGER_OK);
1166 }
1167
1168 /*
1169 * A final note: in a low swap situation, we cannot deallocate swap
1170 * and mark a page dirty here because the caller is likely to mark
1171 * the page clean when we return, causing the page to possibly revert
1172 * to all-zero's later.
1173 */
1174}
1175
1176/*
1177 * swap_pager_putpages:
1178 *
1179 * Assign swap (if necessary) and initiate I/O on the specified pages.
1180 *
1181 * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
1182 * are automatically converted to SWAP objects.
1183 *
1184 * In a low memory situation we may block in VOP_STRATEGY(), but the new
1185 * vm_page reservation system coupled with properly written VFS devices
1186 * should ensure that no low-memory deadlock occurs. This is an area
1187 * which needs work.
1188 *
1189 * The parent has N vm_object_pip_add() references prior to
1190 * calling us and will remove references for rtvals[] that are
1191 * not set to VM_PAGER_PEND. We need to remove the rest on I/O
1192 * completion.
1193 *
1194 * The parent has soft-busy'd the pages it passes us and will unbusy
 1195 	 *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1196 * We need to unbusy the rest on I/O completion.
1197 */
1198void
1199swap_pager_putpages(object, m, count, sync, rtvals)
1200 vm_object_t object;
1201 vm_page_t *m;
1202 int count;
1203 boolean_t sync;
1204 int *rtvals;
1205{
1206 int i;
1207 int n = 0;
1208
1209 GIANT_REQUIRED;
1210 if (count && m[0]->object != object) {
 1211 		panic("swap_pager_putpages: object mismatch %p/%p",
1212 object,
1213 m[0]->object
1214 );
1215 }
1216 /*
1217 * Step 1
1218 *
1219 * Turn object into OBJT_SWAP
1220 * check for bogus sysops
1221 * force sync if not pageout process
1222 */
1223 if (object->type != OBJT_SWAP)
1224 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
1225
1226 if (curproc != pageproc)
1227 sync = TRUE;
1228
1229 /*
1230 * Step 2
1231 *
1232 * Update nsw parameters from swap_async_max sysctl values.
1233 * Do not let the sysop crash the machine with bogus numbers.
1234 */
1235 mtx_lock(&pbuf_mtx);
1236 if (swap_async_max != nsw_wcount_async_max) {
1237 int n;
1238 int s;
1239
1240 /*
1241 * limit range
1242 */
1243 if ((n = swap_async_max) > nswbuf / 2)
1244 n = nswbuf / 2;
1245 if (n < 1)
1246 n = 1;
1247 swap_async_max = n;
1248
1249 /*
1250 * Adjust difference ( if possible ). If the current async
1251 * count is too low, we may not be able to make the adjustment
1252 * at this time.
1253 */
1254 s = splvm();
1255 n -= nsw_wcount_async_max;
1256 if (nsw_wcount_async + n >= 0) {
1257 nsw_wcount_async += n;
1258 nsw_wcount_async_max += n;
1259 wakeup(&nsw_wcount_async);
1260 }
1261 splx(s);
1262 }
1263 mtx_unlock(&pbuf_mtx);
1264
1265 /*
1266 * Step 3
1267 *
1268 * Assign swap blocks and issue I/O. We reallocate swap on the fly.
1269 * The page is left dirty until the pageout operation completes
1270 * successfully.
1271 */
1272 for (i = 0; i < count; i += n) {
1273 int s;
1274 int j;
1275 struct buf *bp;
1276 daddr_t blk;
1277
1278 /*
1279 * Maximum I/O size is limited by a number of factors.
1280 */
1281 n = min(BLIST_MAX_ALLOC, count - i);
1282 n = min(n, nsw_cluster_max);
1283
1284 s = splvm();
1285
1286 /*
1287 * Get biggest block of swap we can. If we fail, fall
1288 * back and try to allocate a smaller block. Don't go
1289 * overboard trying to allocate space if it would overly
1290 * fragment swap.
1291 */
1292 while (
1293 (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
1294 n > 4
1295 ) {
1296 n >>= 1;
1297 }
1298 if (blk == SWAPBLK_NONE) {
1299 for (j = 0; j < n; ++j)
1300 rtvals[i+j] = VM_PAGER_FAIL;
1301 splx(s);
1302 continue;
1303 }
1304
1305 /*
1306 * The I/O we are constructing cannot cross a physical
 1307 		 * disk boundary in the swap stripe.  Note:  we are still
1308 * at splvm().
1309 */
1310 if ((blk ^ (blk + n)) & dmmax_mask) {
1311 j = ((blk + dmmax) & dmmax_mask) - blk;
1312 swp_pager_freeswapspace(blk + j, n - j);
1313 n = j;
1314 }
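		/*
		 * For example, with dmmax = 32: blk = 30 and n = 8 would
		 * cross the stripe at block 32, so j = 2, blocks 32..37
		 * are returned to the bitmap and the write is clamped to
		 * n = 2 pages.
		 */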
1315
1316 /*
1317 * All I/O parameters have been satisfied, build the I/O
1318 * request and assign the swap space.
1319 *
1320 * NOTE: B_PAGING is set by pbgetvp()
1321 */
1322 if (sync == TRUE) {
1323 bp = getpbuf(&nsw_wcount_sync);
1324 } else {
1325 bp = getpbuf(&nsw_wcount_async);
1326 bp->b_flags = B_ASYNC;
1327 }
1328 bp->b_iocmd = BIO_WRITE;
1329 bp->b_spc = NULL; /* not used, but NULL-out anyway */
1330
1331 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
1332
1333 bp->b_rcred = crhold(thread0.td_ucred);
1334 bp->b_wcred = crhold(thread0.td_ucred);
1335 bp->b_bcount = PAGE_SIZE * n;
1336 bp->b_bufsize = PAGE_SIZE * n;
1337 bp->b_blkno = blk;
1338
1339 pbgetvp(swapdev_vp, bp);
1340
1341 for (j = 0; j < n; ++j) {
1342 vm_page_t mreq = m[i+j];
1343
1344 swp_pager_meta_build(
1345 mreq->object,
1346 mreq->pindex,
1347 blk + j
1348 );
1349 vm_page_dirty(mreq);
1350 rtvals[i+j] = VM_PAGER_OK;
1351
1352 vm_page_flag_set(mreq, PG_SWAPINPROG);
1353 bp->b_pages[j] = mreq;
1354 }
1355 bp->b_npages = n;
1356 /*
1357 * Must set dirty range for NFS to work.
1358 */
1359 bp->b_dirtyoff = 0;
1360 bp->b_dirtyend = bp->b_bcount;
1361
1362 cnt.v_swapout++;
1363 cnt.v_swappgsout += bp->b_npages;
1364 swapdev_vp->v_numoutput++;
1365
1366 splx(s);
1367
1368 /*
1369 * asynchronous
1370 *
1371 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1372 */
1373 if (sync == FALSE) {
1374 bp->b_iodone = swp_pager_async_iodone;
1375 BUF_KERNPROC(bp);
1376 BUF_STRATEGY(bp);
1377
1378 for (j = 0; j < n; ++j)
1379 rtvals[i+j] = VM_PAGER_PEND;
 1380 			/* restart outer loop */
1381 continue;
1382 }
1383
1384 /*
1385 * synchronous
1386 *
1387 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1388 */
1389 bp->b_iodone = swp_pager_sync_iodone;
1390 BUF_STRATEGY(bp);
1391
1392 /*
1393 * Wait for the sync I/O to complete, then update rtvals.
1394 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1395 * our async completion routine at the end, thus avoiding a
1396 * double-free.
1397 */
1398 s = splbio();
1399 while ((bp->b_flags & B_DONE) == 0) {
1400 tsleep(bp, PVM, "swwrt", 0);
1401 }
1402 for (j = 0; j < n; ++j)
1403 rtvals[i+j] = VM_PAGER_PEND;
1404 /*
1405 * Now that we are through with the bp, we can call the
1406 * normal async completion, which frees everything up.
1407 */
1408 swp_pager_async_iodone(bp);
1409 splx(s);
1410 }
1411}
1412
1413/*
 1414 	 *	swp_pager_sync_iodone:
1415 *
1416 * Completion routine for synchronous reads and writes from/to swap.
 1417 	 *	We just mark the bp as complete and wake up anyone waiting on it.
1418 *
1419 * This routine may not block. This routine is called at splbio() or better.
1420 */
1421static void
1422swp_pager_sync_iodone(bp)
1423 struct buf *bp;
1424{
1425 bp->b_flags |= B_DONE;
1426 bp->b_flags &= ~B_ASYNC;
1427 wakeup(bp);
1428}
1429
1430/*
1431 * swp_pager_async_iodone:
1432 *
1433 * Completion routine for asynchronous reads and writes from/to swap.
1434 * Also called manually by synchronous code to finish up a bp.
1435 *
1436 * For READ operations, the pages are PG_BUSY'd. For WRITE operations,
1437 * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY
1438 * unbusy all pages except the 'main' request page. For WRITE
1439 * operations, we vm_page_t->busy'd unbusy all pages ( we can do this
1440 * because we marked them all VM_PAGER_PEND on return from putpages ).
1441 *
1442 * This routine may not block.
1443 * This routine is called at splbio() or better
1444 *
1445 * We up ourselves to splvm() as required for various vm_page related
1446 * calls.
1447 */
1448static void
1449swp_pager_async_iodone(bp)
1450 struct buf *bp;
1451{
1452 int s;
1453 int i;
1454 vm_object_t object = NULL;
1455
1456 GIANT_REQUIRED;
1457 bp->b_flags |= B_DONE;
1458
1459 /*
1460 * report error
1461 */
1462 if (bp->b_ioflags & BIO_ERROR) {
1463 printf(
1464 "swap_pager: I/O error - %s failed; blkno %ld,"
 1465 		    " size %ld, error %d\n",
1466 ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
1467 (long)bp->b_blkno,
1468 (long)bp->b_bcount,
1469 bp->b_error
1470 );
1471 }
1472
1473 /*
1474 * set object, raise to splvm().
1475 */
1476 if (bp->b_npages)
1477 object = bp->b_pages[0]->object;
1478 s = splvm();
1479
1480 /*
1481 * remove the mapping for kernel virtual
1482 */
1483 pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
1484
1485 /*
1486 * cleanup pages. If an error occurs writing to swap, we are in
1487 * very serious trouble. If it happens to be a disk error, though,
1488 * we may be able to recover by reassigning the swap later on. So
1489 * in this case we remove the m->swapblk assignment for the page
 1490 	 * but do not free it in the rlist.  The erroneous block(s) are thus
1491 * never reallocated as swap. Redirty the page and continue.
1492 */
1493 for (i = 0; i < bp->b_npages; ++i) {
1494 vm_page_t m = bp->b_pages[i];
1495
1496 vm_page_flag_clear(m, PG_SWAPINPROG);
1497
1498 if (bp->b_ioflags & BIO_ERROR) {
1499 /*
1500 * If an error occurs I'd love to throw the swapblk
1501 * away without freeing it back to swapspace, so it
1502 * can never be used again. But I can't from an
1503 * interrupt.
1504 */
1505 if (bp->b_iocmd == BIO_READ) {
1506 /*
1507 * When reading, reqpage needs to stay
1508 * locked for the parent, but all other
1509 * pages can be freed. We still want to
1510 * wakeup the parent waiting on the page,
1511 * though. ( also: pg_reqpage can be -1 and
1512 * not match anything ).
1513 *
1514 * We have to wake specifically requested pages
1515 * up too because we cleared PG_SWAPINPROG and
1516 * someone may be waiting for that.
1517 *
1518 * NOTE: for reads, m->dirty will probably
1519 * be overridden by the original caller of
1520 * getpages so don't play cute tricks here.
1521 *
1522 * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
1523 * AS THIS MESSES WITH object->memq, and it is
1524 * not legal to mess with object->memq from an
1525 * interrupt.
1526 */
1527 m->valid = 0;
1528 vm_page_flag_clear(m, PG_ZERO);
1529 if (i != bp->b_pager.pg_reqpage)
1530 vm_page_free(m);
1531 else
1532 vm_page_flash(m);
1533 /*
1534 * If i == bp->b_pager.pg_reqpage, do not wake
1535 * the page up. The caller needs to.
1536 */
1537 } else {
1538 /*
1539 * If a write error occurs, reactivate page
1540 * so it doesn't clog the inactive list,
1541 * then finish the I/O.
1542 */
1543 vm_page_dirty(m);
1544 vm_page_activate(m);
1545 vm_page_io_finish(m);
1546 }
1547 } else if (bp->b_iocmd == BIO_READ) {
1548 /*
1549 * For read success, clear dirty bits. Nobody should
1550 * have this page mapped but don't take any chances,
1551 * make sure the pmap modify bits are also cleared.
1552 *
1553 * NOTE: for reads, m->dirty will probably be
1554 * overridden by the original caller of getpages so
1555 * we cannot set them in order to free the underlying
1556 * swap in a low-swap situation. I don't think we'd
1557 * want to do that anyway, but it was an optimization
1558 * that existed in the old swapper for a time before
1559 * it got ripped out due to precisely this problem.
1560 *
1561 * clear PG_ZERO in page.
1562 *
1563 * If not the requested page then deactivate it.
1564 *
1565 * Note that the requested page, reqpage, is left
1566 * busied, but we still have to wake it up. The
1567 * other pages are released (unbusied) by
1568 * vm_page_wakeup(). We do not set reqpage's
1569 * valid bits here, it is up to the caller.
1570 */
1571 pmap_clear_modify(m);
1572 m->valid = VM_PAGE_BITS_ALL;
1573 vm_page_undirty(m);
1574 vm_page_flag_clear(m, PG_ZERO);
1575
1576 /*
1577 * We have to wake specifically requested pages
1578 * up too because we cleared PG_SWAPINPROG and
1579 * could be waiting for it in getpages. However,
1580 * be sure to not unbusy getpages specifically
1581 * requested page - getpages expects it to be
1582 * left busy.
1583 */
1584 if (i != bp->b_pager.pg_reqpage) {
1585 vm_page_deactivate(m);
1586 vm_page_wakeup(m);
1587 } else {
1588 vm_page_flash(m);
1589 }
1590 } else {
1591 /*
1592 * For write success, clear the modify and dirty
1593 * status, then finish the I/O ( which decrements the
1594 * busy count and possibly wakes waiters up ).
1595 */
1596 pmap_clear_modify(m);
1597 vm_page_undirty(m);
1598 vm_page_io_finish(m);
1599 if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
1600 vm_page_protect(m, VM_PROT_READ);
1601 }
1602 }
1603
1604 /*
1605 * adjust pip. NOTE: the original parent may still have its own
1606 * pip refs on the object.
1607 */
1608 if (object)
1609 vm_object_pip_wakeupn(object, bp->b_npages);
1610
1611 /*
1612 * release the physical I/O buffer
1613 */
1614 relpbuf(
1615 bp,
1616 ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
1617 ((bp->b_flags & B_ASYNC) ?
1618 &nsw_wcount_async :
1619 &nsw_wcount_sync
1620 )
1621 )
1622 );
1623 splx(s);
1624}
1625
1626/************************************************************************
1627 * SWAP META DATA *
1628 ************************************************************************
1629 *
1630 * These routines manipulate the swap metadata stored in the
1631 * OBJT_SWAP object. All swp_*() routines must be called at
1632 * splvm() because swap can be freed up by the low level vm_page
1633 * code which might be called from interrupts beyond what splbio() covers.
1634 *
1635 * Swap metadata is implemented with a global hash and not directly
1636 * linked into the object. Instead the object simply contains
1637 * appropriate tracking counters.
1638 */
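/*
 * Editorial sketch (not part of the original source): the hash entries
 * manipulated below are struct swblock's, each covering SWAP_META_PAGES
 * contiguous page indices of a single object.  Judging from the fields
 * referenced in this file, the layout is roughly:
 *
 *	struct swblock {
 *		struct swblock	*swb_hnext;			(hash chain link)
 *		vm_object_t	swb_object;			(owning object)
 *		vm_pindex_t	swb_index;			(base page index)
 *		int		swb_count;			(valid entries)
 *		daddr_t		swb_pages[SWAP_META_PAGES];	(swap block numbers)
 *	};
 *
 * The authoritative definition lives in the swap pager header, not here.
 */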
1639
1640/*
1641 * SWP_PAGER_HASH() - hash swap meta data
1642 *
1643 * This is an inline helper function which hashes the swapblk given
1644 * the object and page index. It returns a pointer to a pointer
1645 * to the object, or a pointer to a NULL pointer if it could not
1646 * find a swapblk.
1647 *
1648 * This routine must be called at splvm().
1649 */
1650static __inline struct swblock **
1651swp_pager_hash(vm_object_t object, vm_pindex_t index)
1652{
1653 struct swblock **pswap;
1654 struct swblock *swap;
1655
1656 index &= ~SWAP_META_MASK;
1657 pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
1658 while ((swap = *pswap) != NULL) {
1659 if (swap->swb_object == object &&
1660 swap->swb_index == index
1661 ) {
1662 break;
1663 }
1664 pswap = &swap->swb_hnext;
1665 }
1666 return (pswap);
1667}
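/*
 * Editorial usage sketch (mirrors swp_pager_meta_free() and
 * swp_pager_meta_ctl() below): returning a pointer-to-pointer lets a
 * caller both test for a hit and unlink the entry from its hash chain
 * in place, without re-walking the chain:
 *
 *	pswap = swp_pager_hash(object, index);
 *	if ((swap = *pswap) != NULL) {
 *		... use or clear swap->swb_pages[index & SWAP_META_MASK] ...
 *		if (--swap->swb_count == 0) {
 *			*pswap = swap->swb_hnext;
 *			zfree(swap_zone, swap);
 *		}
 *	}
 */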
1668
1669/*
1670 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
1671 *
1672 * We first convert the object to a swap object if it is a default
1673 * object.
1674 *
1675 * The specified swapblk is added to the object's swap metadata. If
1676 * the swapblk is not valid, it is freed instead. Any previously
1677 * assigned swapblk is freed.
1678 *
1679 * This routine must be called at splvm(), except when used to convert
1680 * an OBJT_DEFAULT object into an OBJT_SWAP object.
1681 */
1682static void
1683swp_pager_meta_build(
1684 vm_object_t object,
1685 vm_pindex_t index,
1686 daddr_t swapblk
1687) {
1688 struct swblock *swap;
1689 struct swblock **pswap;
1690
1691 GIANT_REQUIRED;
1692 /*
1693 * Convert default object to swap object if necessary
1694 */
1695 if (object->type != OBJT_SWAP) {
1696 object->type = OBJT_SWAP;
1697 object->un_pager.swp.swp_bcount = 0;
1698
1699 mtx_lock(&sw_alloc_mtx);
1700 if (object->handle != NULL) {
1701 TAILQ_INSERT_TAIL(
1702 NOBJLIST(object->handle),
1703 object,
1704 pager_object_list
1705 );
1706 } else {
1707 TAILQ_INSERT_TAIL(
1708 &swap_pager_un_object_list,
1709 object,
1710 pager_object_list
1711 );
1712 }
1713 mtx_unlock(&sw_alloc_mtx);
1714 }
1715
1716 /*
1717 * Locate hash entry. If not found create, but if we aren't adding
1718 * anything just return. If we run out of space in the map we wait
1719 * and, since the hash table may have changed, retry.
1720 */
1721retry:
1722 pswap = swp_pager_hash(object, index);
1723
1724 if ((swap = *pswap) == NULL) {
1725 int i;
1726
1727 if (swapblk == SWAPBLK_NONE)
1728 return;
1729
1730 swap = *pswap = zalloc(swap_zone);
1731 if (swap == NULL) {
1732 VM_WAIT;
1733 goto retry;
1734 }
1735 swap->swb_hnext = NULL;
1736 swap->swb_object = object;
1737 swap->swb_index = index & ~SWAP_META_MASK;
1738 swap->swb_count = 0;
1739
1740 ++object->un_pager.swp.swp_bcount;
1741
1742 for (i = 0; i < SWAP_META_PAGES; ++i)
1743 swap->swb_pages[i] = SWAPBLK_NONE;
1744 }
1745
1746 /*
1747 * Delete prior contents of metadata
1748 */
1749 index &= SWAP_META_MASK;
1750
1751 if (swap->swb_pages[index] != SWAPBLK_NONE) {
1752 swp_pager_freeswapspace(swap->swb_pages[index], 1);
1753 --swap->swb_count;
1754 }
1755
1756 /*
1757 * Enter block into metadata
1758 */
1759 swap->swb_pages[index] = swapblk;
1760 if (swapblk != SWAPBLK_NONE)
1761 ++swap->swb_count;
1762}
1763
1764/*
1765 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
1766 *
1767 * The requested range of blocks is freed, with any associated swap
1768 * returned to the swap bitmap.
1769 *
1770 * This routine will free swap metadata structures as they are cleaned
1771 * out. This routine does *NOT* operate on swap metadata associated
1772 * with resident pages.
1773 *
1774 * This routine must be called at splvm()
1775 */
1776static void
1777swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
1778{
1779 GIANT_REQUIRED;
1780
1781 if (object->type != OBJT_SWAP)
1782 return;
1783
1784 while (count > 0) {
1785 struct swblock **pswap;
1786 struct swblock *swap;
1787
1788 pswap = swp_pager_hash(object, index);
1789
1790 if ((swap = *pswap) != NULL) {
1791 daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
1792
1793 if (v != SWAPBLK_NONE) {
1794 swp_pager_freeswapspace(v, 1);
1795 swap->swb_pages[index & SWAP_META_MASK] =
1796 SWAPBLK_NONE;
1797 if (--swap->swb_count == 0) {
1798 *pswap = swap->swb_hnext;
1799 zfree(swap_zone, swap);
1800 --object->un_pager.swp.swp_bcount;
1801 }
1802 }
1803 --count;
1804 ++index;
1805 } else {
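			/* No swblock covers this index; skip ahead to the next swblock boundary. */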
1806 int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
1807 count -= n;
1808 index += n;
1809 }
1810 }
1811}
1812
1813/*
1814 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
1815 *
1816 * This routine locates and destroys all swap metadata associated with
1817 * an object.
1818 *
1819 * This routine must be called at splvm()
1820 */
1821static void
1822swp_pager_meta_free_all(vm_object_t object)
1823{
1824 daddr_t index = 0;
1825
1826 GIANT_REQUIRED;
1827
1828 if (object->type != OBJT_SWAP)
1829 return;
1830
1831 while (object->un_pager.swp.swp_bcount) {
1832 struct swblock **pswap;
1833 struct swblock *swap;
1834
1835 pswap = swp_pager_hash(object, index);
1836 if ((swap = *pswap) != NULL) {
1837 int i;
1838
1839 for (i = 0; i < SWAP_META_PAGES; ++i) {
1840 daddr_t v = swap->swb_pages[i];
1841 if (v != SWAPBLK_NONE) {
1842 --swap->swb_count;
1843 swp_pager_freeswapspace(v, 1);
1844 }
1845 }
1846 if (swap->swb_count != 0)
1847 panic("swp_pager_meta_free_all: swb_count != 0");
1848 *pswap = swap->swb_hnext;
1849 zfree(swap_zone, swap);
1850 --object->un_pager.swp.swp_bcount;
1851 }
1852 index += SWAP_META_PAGES;
1853 if (index > 0x20000000)
1854 panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
1855 }
1856}
1857
1858/*
1859 * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
1860 *
1861 * This routine is capable of looking up, popping, or freeing
1862 * swapblk assignments in the swap meta data or in the vm_page_t.
1863 * The routine typically returns the swapblk being looked-up, or popped,
1864 * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
1865 * was invalid. This routine will automatically free any invalid
1866 * meta-data swapblks.
1867 *
1868 * It is not possible to store invalid swapblks in the swap meta data
1869 * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
1870 *
1871 * When acting on a busy resident page and paging is in progress, we
1872 * have to wait until paging is complete but otherwise can act on the
1873 * busy page.
1874 *
1875 * This routine must be called at splvm().
1876 *
1877 * SWM_FREE remove and free swap block from metadata
1878 * SWM_POP remove from meta data but do not free.. pop it out
1879 */
1880static daddr_t
1881swp_pager_meta_ctl(
1882 vm_object_t object,
1883 vm_pindex_t index,
1884 int flags
1885) {
1886 struct swblock **pswap;
1887 struct swblock *swap;
1888 daddr_t r1;
1889
1890 GIANT_REQUIRED;
1891 /*
1892 * The meta data only exists if the object is OBJT_SWAP
1893 * and even then might not be allocated yet.
1894 */
1895 if (object->type != OBJT_SWAP)
1896 return (SWAPBLK_NONE);
1897
1898 r1 = SWAPBLK_NONE;
1899 pswap = swp_pager_hash(object, index);
1900
1901 if ((swap = *pswap) != NULL) {
1902 index &= SWAP_META_MASK;
1903 r1 = swap->swb_pages[index];
1904
1905 if (r1 != SWAPBLK_NONE) {
1906 if (flags & SWM_FREE) {
1907 swp_pager_freeswapspace(r1, 1);
1908 r1 = SWAPBLK_NONE;
1909 }
1910 if (flags & (SWM_FREE|SWM_POP)) {
1911 swap->swb_pages[index] = SWAPBLK_NONE;
1912 if (--swap->swb_count == 0) {
1913 *pswap = swap->swb_hnext;
1914 zfree(swap_zone, swap);
1915 --object->un_pager.swp.swp_bcount;
1916 }
1917 }
1918 }
1919 }
1920 return (r1);
1921}
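/*
 * Editorial usage sketch (mirrors swap_pager_copy() elsewhere in this
 * file): SWM_POP migrates a block between objects without releasing the
 * underlying swap, while SWM_FREE discards it outright:
 *
 *	srcaddr = swp_pager_meta_ctl(srcobject, i + offset, SWM_POP);
 *	if (srcaddr != SWAPBLK_NONE)
 *		swp_pager_meta_build(dstobject, i, srcaddr);
 *	...
 *	swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
 */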
1922
1923/********************************************************
1924 * CHAINING FUNCTIONS *
1925 ********************************************************
1926 *
1927 * These functions support recursion of I/O operations
1928 * on bp's, typically by chaining one or more 'child' bp's
1929 * to the parent. Synchronous, asynchronous, and semi-synchronous
1930 * chaining is possible.
1931 */
1932
1933/*
1934 * vm_pager_chain_iodone:
1935 *
1936 * io completion routine for child bp. Currently we fudge a bit
1937 * on dealing with b_resid. Since users of these routines may issue
1938 * multiple children simultaneously, sequencing of the error can be lost.
1939 */
1940static void
1941vm_pager_chain_iodone(struct buf *nbp)
1942{
1943 struct bio *bp;
1944 u_int *count;
1945
1946 bp = nbp->b_caller1;
1947 count = (u_int *)&(bp->bio_driver1);
1948 if (bp != NULL) {
1949 if (nbp->b_ioflags & BIO_ERROR) {
1950 bp->bio_flags |= BIO_ERROR;
1951 bp->bio_error = nbp->b_error;
1952 } else if (nbp->b_resid != 0) {
1953 bp->bio_flags |= BIO_ERROR;
1954 bp->bio_error = EINVAL;
1955 } else {
1956 bp->bio_resid -= nbp->b_bcount;
1957 }
1958 nbp->b_caller1 = NULL;
1959 --(*count);
1960 if (bp->bio_flags & BIO_FLAG1) {
1961 bp->bio_flags &= ~BIO_FLAG1;
1962 wakeup(bp);
1963 }
1964 }
1965 nbp->b_flags |= B_DONE;
1966 nbp->b_flags &= ~B_ASYNC;
1967 relpbuf(nbp, NULL);
1968}
1969
1970/*
1971 * getchainbuf:
1972 *
1973 * Obtain a physical buffer and chain it to its parent buffer. When
1974 * I/O completes, the parent buffer will be B_SIGNAL'd. Errors are
1975 * automatically propagated to the parent
1976 */
1977struct buf *
1978getchainbuf(struct bio *bp, struct vnode *vp, int flags)
1979{
1980 struct buf *nbp;
1981 u_int *count;
1982
1983 GIANT_REQUIRED;
1984 nbp = getpbuf(NULL);
1985 count = (u_int *)&(bp->bio_driver1);
1986
1987 nbp->b_caller1 = bp;
1988 ++(*count);
1989
1990 if (*count > 4)
1991 waitchainbuf(bp, 4, 0);
1992
1993 nbp->b_iocmd = bp->bio_cmd;
1994 nbp->b_ioflags = 0;
1995 nbp->b_flags = flags;
1996 nbp->b_rcred = crhold(thread0.td_ucred);
1997 nbp->b_wcred = crhold(thread0.td_ucred);
1998 nbp->b_iodone = vm_pager_chain_iodone;
1999
2000 if (vp)
2001 pbgetvp(vp, nbp);
2002 return (nbp);
2003}
2004
2005void
2006flushchainbuf(struct buf *nbp)
2007{
2008 GIANT_REQUIRED;
2009 if (nbp->b_bcount) {
2010 nbp->b_bufsize = nbp->b_bcount;
2011 if (nbp->b_iocmd == BIO_WRITE)
2012 nbp->b_dirtyend = nbp->b_bcount;
2013 BUF_KERNPROC(nbp);
2014 BUF_STRATEGY(nbp);
2015 } else {
2016 bufdone(nbp);
2017 }
2018}
2019
2020static void
2021waitchainbuf(struct bio *bp, int limit, int done)
2022{
2023 int s;
2024 u_int *count;
2025
2026 GIANT_REQUIRED;
2027 count = (u_int *)&(bp->bio_driver1);
2028 s = splbio();
2029 while (*count > limit) {
2030 bp->bio_flags |= BIO_FLAG1;
2031 tsleep(bp, PRIBIO + 4, "bpchain", 0);
2032 }
2033 if (done) {
2034 if (bp->bio_resid != 0 && !(bp->bio_flags & BIO_ERROR)) {
2035 bp->bio_flags |= BIO_ERROR;
2036 bp->bio_error = EINVAL;
2037 }
2038 biodone(bp);
2039 }
2040 splx(s);
2041}
2042
332 break;
333 /*
334 * if the allocation failed, try a zone two thirds the
335 * size of the previous attempt.
336 */
337 n -= ((n + 2) / 3);
338 } while (n > 0);
339 if (swap_zone == NULL)
340 panic("failed to zinit swap_zone.");
341 if (n2 != n)
342 printf("Swap zone entries reduced from %d to %d.\n", n2, n);
343 n2 = n;
344
345 /*
346 * Initialize our meta-data hash table. The swapper does not need to
347 * be quite as efficient as the VM system, so we do not use an
348 * oversized hash table.
349 *
350 * n: size of hash table, must be power of 2
351 * swhash_mask: hash table index mask
352 */
353 for (n = 1; n < n2 / 8; n *= 2)
354 ;
355 swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
356 swhash_mask = n - 1;
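	/*
	 * Editorial example (illustrative numbers only): with n2 == 32768
	 * swblock entries the loop above stops at n == 4096 buckets, so
	 * swhash_mask == 0x0fff.
	 */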
357}
358
359/*
360 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
361 * its metadata structures.
362 *
363 * This routine is called from the mmap and fork code to create a new
364 * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
365 * and then converting it with swp_pager_meta_build().
366 *
367 * This routine may block in vm_object_allocate() and create a named
368 * object lookup race, so we must interlock. We must also run at
369 * splvm() for the object lookup to handle races with interrupts, but
370 * we do not have to maintain splvm() in between the lookup and the
371 * add because (I believe) it is not possible to attempt to create
372 * a new swap object w/handle when a default object with that handle
373 * already exists.
374 */
375static vm_object_t
376swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
377 vm_ooffset_t offset)
378{
379 vm_object_t object;
380
381 GIANT_REQUIRED;
382
383 if (handle) {
384 /*
385 * Reference existing named region or allocate new one. There
386 * should not be a race here against swp_pager_meta_build()
387 * as called from vm_page_remove() in regards to the lookup
388 * of the handle.
389 */
390 sx_xlock(&sw_alloc_sx);
391 object = vm_pager_object_lookup(NOBJLIST(handle), handle);
392
393 if (object != NULL) {
394 vm_object_reference(object);
395 } else {
396 object = vm_object_allocate(OBJT_DEFAULT,
397 OFF_TO_IDX(offset + PAGE_MASK + size));
398 object->handle = handle;
399
400 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
401 }
402 sx_xunlock(&sw_alloc_sx);
403 } else {
404 object = vm_object_allocate(OBJT_DEFAULT,
405 OFF_TO_IDX(offset + PAGE_MASK + size));
406
407 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
408 }
409
410 return (object);
411}
412
413/*
414 * SWAP_PAGER_DEALLOC() - remove swap metadata from object
415 *
416 * The swap backing for the object is destroyed. The code is
417 * designed such that we can reinstantiate it later, but this
418 * routine is typically called only when the entire object is
419 * about to be destroyed.
420 *
421 * This routine is allowed to block, but it no longer does.
422 *
423 * The object must be locked or unreferenceable.
424 */
425static void
426swap_pager_dealloc(object)
427 vm_object_t object;
428{
429 int s;
430
431 GIANT_REQUIRED;
432
433 /*
434 * Remove from list right away so lookups will fail if we block for
435 * pageout completion.
436 */
437 mtx_lock(&sw_alloc_mtx);
438 if (object->handle == NULL) {
439 TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
440 } else {
441 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
442 }
443 mtx_unlock(&sw_alloc_mtx);
444
445 vm_object_pip_wait(object, "swpdea");
446
447 /*
448 * Free all remaining metadata. We only bother to free it from
449 * the swap meta data. We do not attempt to free swapblk's still
450 * associated with vm_page_t's for this object. We do not care
451 * if paging is still in progress on some objects.
452 */
453 s = splvm();
454 swp_pager_meta_free_all(object);
455 splx(s);
456}
457
458/************************************************************************
459 * SWAP PAGER BITMAP ROUTINES *
460 ************************************************************************/
461
462/*
463 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
464 *
465 * Allocate swap for the requested number of pages. The starting
466 * swap block number (a page index) is returned or SWAPBLK_NONE
467 * if the allocation failed.
468 *
469 * Also has the side effect of advising that somebody made a mistake
470 * when they configured swap and didn't configure enough.
471 *
472 * Must be called at splvm() to avoid races with bitmap frees from
473 * vm_page_remove() aka swap_pager_page_removed().
474 *
475 * This routine may not block
476 * This routine must be called at splvm().
477 */
478static __inline daddr_t
479swp_pager_getswapspace(npages)
480 int npages;
481{
482 daddr_t blk;
483
484 GIANT_REQUIRED;
485
486 if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
487 if (swap_pager_full != 2) {
488 printf("swap_pager_getswapspace: failed\n");
489 swap_pager_full = 2;
490 swap_pager_almost_full = 1;
491 }
492 } else {
493 vm_swap_size -= npages;
494 /* per-swap area stats */
495 swdevt[BLK2DEVIDX(blk)].sw_used += npages;
496 swp_sizecheck();
497 }
498 return (blk);
499}
500
501/*
502 * SWP_PAGER_FREESWAPSPACE() - free raw swap space
503 *
504 * This routine returns the specified swap blocks back to the bitmap.
505 *
506 * Note: This routine may not block (it could in the old swap code),
507 * and through the use of the new blist routines it does not block.
508 *
509 * We must be called at splvm() to avoid races with bitmap frees from
510 * vm_page_remove() aka swap_pager_page_removed().
511 *
512 * This routine may not block
513 * This routine must be called at splvm().
514 */
515static __inline void
516swp_pager_freeswapspace(blk, npages)
517 daddr_t blk;
518 int npages;
519{
520 GIANT_REQUIRED;
521
522 blist_free(swapblist, blk, npages);
523 vm_swap_size += npages;
524 /* per-swap area stats */
525 swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
526 swp_sizecheck();
527}
528
529/*
530 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
531 * range within an object.
532 *
533 * This is a globally accessible routine.
534 *
535 * This routine removes swapblk assignments from swap metadata.
536 *
537 * The external callers of this routine typically have already destroyed
538 * or renamed vm_page_t's associated with this range in the object so
539 * we should be ok.
540 *
541 * This routine may be called at any spl. We up our spl to splvm temporarily
542 * in order to perform the metadata removal.
543 */
544void
545swap_pager_freespace(object, start, size)
546 vm_object_t object;
547 vm_pindex_t start;
548 vm_size_t size;
549{
550 int s = splvm();
551
552 GIANT_REQUIRED;
553 swp_pager_meta_free(object, start, size);
554 splx(s);
555}
556
557/*
558 * SWAP_PAGER_RESERVE() - reserve swap blocks in object
559 *
560 * Assigns swap blocks to the specified range within the object. The
561 * swap blocks are not zeroed. Any previous swap assignment is destroyed.
562 *
563 * Returns 0 on success, -1 on failure.
564 */
565int
566swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
567{
568 int s;
569 int n = 0;
570 daddr_t blk = SWAPBLK_NONE;
571 vm_pindex_t beg = start; /* save start index */
572
573 s = splvm();
574 while (size) {
575 if (n == 0) {
576 n = BLIST_MAX_ALLOC;
577 while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
578 n >>= 1;
579 if (n == 0) {
580 swp_pager_meta_free(object, beg, start - beg);
581 splx(s);
582 return (-1);
583 }
584 }
585 }
586 swp_pager_meta_build(object, start, blk);
587 --size;
588 ++start;
589 ++blk;
590 --n;
591 }
592 swp_pager_meta_free(object, start, n);
593 splx(s);
594 return (0);
595}
596
597/*
598 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
599 * and destroy the source.
600 *
601 * Copy any valid swapblks from the source to the destination. In
602 * cases where both the source and destination have a valid swapblk,
603 * we keep the destination's.
604 *
605 * This routine is allowed to block. It may block allocating metadata
606 * indirectly through swp_pager_meta_build() or if paging is still in
607 * progress on the source.
608 *
609 * This routine can be called at any spl
610 *
611 * XXX vm_page_collapse() kinda expects us not to block because we
612 * supposedly do not need to allocate memory, but for the moment we
613 * *may* have to get a little memory from the zone allocator, but
614 * it is taken from the interrupt memory. We should be ok.
615 *
616 * The source object contains no vm_page_t's (which is just as well)
617 *
618 * The source object is of type OBJT_SWAP.
619 *
620 * The source and destination objects must be locked or
621 * inaccessible (XXX are they ?)
622 */
623void
624swap_pager_copy(srcobject, dstobject, offset, destroysource)
625 vm_object_t srcobject;
626 vm_object_t dstobject;
627 vm_pindex_t offset;
628 int destroysource;
629{
630 vm_pindex_t i;
631 int s;
632
633 GIANT_REQUIRED;
634
635 s = splvm();
636 /*
637 * If destroysource is set, we remove the source object from the
638 * swap_pager internal queue now.
639 */
640 if (destroysource) {
641 mtx_lock(&sw_alloc_mtx);
642 if (srcobject->handle == NULL) {
643 TAILQ_REMOVE(
644 &swap_pager_un_object_list,
645 srcobject,
646 pager_object_list
647 );
648 } else {
649 TAILQ_REMOVE(
650 NOBJLIST(srcobject->handle),
651 srcobject,
652 pager_object_list
653 );
654 }
655 mtx_unlock(&sw_alloc_mtx);
656 }
657
658 /*
659 * transfer source to destination.
660 */
661 for (i = 0; i < dstobject->size; ++i) {
662 daddr_t dstaddr;
663
664 /*
665 * Locate (without changing) the swapblk on the destination,
666 * unless it is invalid in which case free it silently, or
667 * if the destination is a resident page, in which case the
668 * source is thrown away.
669 */
670 dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
671
672 if (dstaddr == SWAPBLK_NONE) {
673 /*
674 * Destination has no swapblk and is not resident,
675 * copy source.
676 */
677 daddr_t srcaddr;
678
679 srcaddr = swp_pager_meta_ctl(
680 srcobject,
681 i + offset,
682 SWM_POP
683 );
684
685 if (srcaddr != SWAPBLK_NONE)
686 swp_pager_meta_build(dstobject, i, srcaddr);
687 } else {
688 /*
689 * Destination has valid swapblk or it is represented
690 * by a resident page. We destroy the sourceblock.
691 */
692
693 swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
694 }
695 }
696
697 /*
698 * Free left over swap blocks in source.
699 *
700 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
701 * double-remove the object from the swap queues.
702 */
703 if (destroysource) {
704 swp_pager_meta_free_all(srcobject);
705 /*
706 * Reverting the type is not necessary, the caller is going
707 * to destroy srcobject directly, but I'm doing it here
708 * for consistency since we've removed the object from its
709 * queues.
710 */
711 srcobject->type = OBJT_DEFAULT;
712 }
713 splx(s);
714}
715
716/*
717 * SWAP_PAGER_HASPAGE() - determine if we have good backing store for
718 * the requested page.
719 *
720 * We determine whether good backing store exists for the requested
721 * page and return TRUE if it does, FALSE if it doesn't.
722 *
723 * If TRUE, we also try to determine how much valid, contiguous backing
724 * store exists before and after the requested page within a reasonable
725 * distance. We do not try to restrict it to the swap device stripe
726 * (that is handled in getpages/putpages). It probably isn't worth
727 * doing here.
728 */
729boolean_t
730swap_pager_haspage(object, pindex, before, after)
731 vm_object_t object;
732 vm_pindex_t pindex;
733 int *before;
734 int *after;
735{
736 daddr_t blk0;
737 int s;
738
739 /*
740 * do we have good backing store at the requested index ?
741 */
742 s = splvm();
743 blk0 = swp_pager_meta_ctl(object, pindex, 0);
744
745 if (blk0 == SWAPBLK_NONE) {
746 splx(s);
747 if (before)
748 *before = 0;
749 if (after)
750 *after = 0;
751 return (FALSE);
752 }
753
754 /*
755 * find backwards-looking contiguous good backing store
756 */
757 if (before != NULL) {
758 int i;
759
760 for (i = 1; i < (SWB_NPAGES/2); ++i) {
761 daddr_t blk;
762
763 if (i > pindex)
764 break;
765 blk = swp_pager_meta_ctl(object, pindex - i, 0);
766 if (blk != blk0 - i)
767 break;
768 }
769 *before = (i - 1);
770 }
771
772 /*
773 * find forward-looking contiguous good backing store
774 */
775 if (after != NULL) {
776 int i;
777
778 for (i = 1; i < (SWB_NPAGES/2); ++i) {
779 daddr_t blk;
780
781 blk = swp_pager_meta_ctl(object, pindex + i, 0);
782 if (blk != blk0 + i)
783 break;
784 }
785 *after = (i - 1);
786 }
787 splx(s);
788 return (TRUE);
789}
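/*
 * Editorial example (illustrative numbers only): if contiguous swap
 * blocks back pindex-2 through pindex+3, this returns TRUE with
 * *before == 2 and *after == 3, capped at SWB_NPAGES/2 - 1 in each
 * direction by the scan limits above.
 */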
790
791/*
792 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
793 *
794 * This removes any associated swap backing store, whether valid or
795 * not, from the page.
796 *
797 * This routine is typically called when a page is made dirty, at
798 * which point any associated swap can be freed. MADV_FREE also
799 * calls us in a special-case situation.
800 *
801 * NOTE!!! If the page is clean and the swap was valid, the caller
802 * should make the page dirty before calling this routine. This routine
803 * does NOT change the m->dirty status of the page. Also: MADV_FREE
804 * depends on it.
805 *
806 * This routine may not block
807 * This routine must be called at splvm()
808 */
809static void
810swap_pager_unswapped(m)
811 vm_page_t m;
812{
813 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
814}
815
816/*
817 * SWAP_PAGER_STRATEGY() - read, write, free blocks
818 *
819 * This implements the vm_pager_strategy() interface to swap and allows
820 * other parts of the system to directly access swap as backing store
821 * through vm_objects of type OBJT_SWAP. This is intended to be a
822 * cacheless interface ( i.e. caching occurs at higher levels ).
823 * Therefore we do not maintain any resident pages. All I/O goes
824 * directly to and from the swap device.
825 *
826 * Note that b_blkno is scaled for PAGE_SIZE
827 *
828 * We currently attempt to run I/O synchronously or asynchronously as
829 * the caller requests. This isn't perfect because we lose error
830 * sequencing when we run multiple ops in parallel to satisfy a request.
831 * But this is swap, so we let it all hang out.
832 */
833static void
834swap_pager_strategy(vm_object_t object, struct bio *bp)
835{
836 vm_pindex_t start;
837 int count;
838 int s;
839 char *data;
840 struct buf *nbp = NULL;
841
842 GIANT_REQUIRED;
843
844 /* XXX: KASSERT instead ? */
845 if (bp->bio_bcount & PAGE_MASK) {
846 biofinish(bp, NULL, EINVAL);
847 printf("swap_pager_strategy: bp %p blk %d size %d, not page bounded\n", bp, (int)bp->bio_pblkno, (int)bp->bio_bcount);
848 return;
849 }
850
851 /*
852 * Clear error indication, initialize page index, count, data pointer.
853 */
854 bp->bio_error = 0;
855 bp->bio_flags &= ~BIO_ERROR;
856 bp->bio_resid = bp->bio_bcount;
857 *(u_int *) &bp->bio_driver1 = 0;
858
859 start = bp->bio_pblkno;
860 count = howmany(bp->bio_bcount, PAGE_SIZE);
861 data = bp->bio_data;
862
863 s = splvm();
864
865 /*
866 * Deal with BIO_DELETE
867 */
868 if (bp->bio_cmd == BIO_DELETE) {
869 /*
870 * FREE PAGE(s) - destroy underlying swap that is no longer
871 * needed.
872 */
873 swp_pager_meta_free(object, start, count);
874 splx(s);
875 bp->bio_resid = 0;
876 biodone(bp);
877 return;
878 }
879
880 /*
881 * Execute read or write
882 */
883 while (count > 0) {
884 daddr_t blk;
885
886 /*
887 * Obtain block. If block not found and writing, allocate a
888 * new block and build it into the object.
889 */
890
891 blk = swp_pager_meta_ctl(object, start, 0);
892 if ((blk == SWAPBLK_NONE) && (bp->bio_cmd == BIO_WRITE)) {
893 blk = swp_pager_getswapspace(1);
894 if (blk == SWAPBLK_NONE) {
895 bp->bio_error = ENOMEM;
896 bp->bio_flags |= BIO_ERROR;
897 break;
898 }
899 swp_pager_meta_build(object, start, blk);
900 }
901
902 /*
903 * Do we have to flush our current collection? Yes if:
904 *
905 * - no swap block at this index
906 * - swap block is not contiguous
907 * - we cross a physical disk boundary in the
908 * stripe.
909 */
910 if (
911 nbp && (nbp->b_blkno + btoc(nbp->b_bcount) != blk ||
912 ((nbp->b_blkno ^ blk) & dmmax_mask)
913 )
914 ) {
915 splx(s);
916 if (bp->bio_cmd == BIO_READ) {
917 ++cnt.v_swapin;
918 cnt.v_swappgsin += btoc(nbp->b_bcount);
919 } else {
920 ++cnt.v_swapout;
921 cnt.v_swappgsout += btoc(nbp->b_bcount);
922 nbp->b_dirtyend = nbp->b_bcount;
923 }
924 flushchainbuf(nbp);
925 s = splvm();
926 nbp = NULL;
927 }
928
929 /*
930 * Add new swapblk to nbp, instantiating nbp if necessary.
931 * Zero-fill reads are able to take a shortcut.
932 */
933 if (blk == SWAPBLK_NONE) {
934 /*
935 * We can only get here if we are reading. Since
936 * we are at splvm() we can safely modify b_resid,
937 * even if chain ops are in progress.
938 */
939 bzero(data, PAGE_SIZE);
940 bp->bio_resid -= PAGE_SIZE;
941 } else {
942 if (nbp == NULL) {
943 nbp = getchainbuf(bp, swapdev_vp, B_ASYNC);
944 nbp->b_blkno = blk;
945 nbp->b_bcount = 0;
946 nbp->b_data = data;
947 }
948 nbp->b_bcount += PAGE_SIZE;
949 }
950 --count;
951 ++start;
952 data += PAGE_SIZE;
953 }
954
955 /*
956 * Flush out last buffer
957 */
958 splx(s);
959
960 if (nbp) {
961 if (nbp->b_iocmd == BIO_READ) {
962 ++cnt.v_swapin;
963 cnt.v_swappgsin += btoc(nbp->b_bcount);
964 } else {
965 ++cnt.v_swapout;
966 cnt.v_swappgsout += btoc(nbp->b_bcount);
967 nbp->b_dirtyend = nbp->b_bcount;
968 }
969 flushchainbuf(nbp);
970 /* nbp = NULL; */
971 }
972 /*
973 * Wait for completion.
974 */
975 waitchainbuf(bp, 0, 1);
976}
977
978/*
979 * SWAP_PAGER_GETPAGES() - bring pages in from swap
980 *
981 * Attempt to retrieve (m, count) pages from backing store, but make
982 * sure we retrieve at least m[reqpage]. We try to load in as large
983 * a chunk surrounding m[reqpage] as is contiguous in swap and which
984 * belongs to the same object.
985 *
986 * The code is designed for asynchronous operation and
987 * immediate-notification of 'reqpage' but tends not to be
988 * used that way. Please do not optimize-out this algorithmic
989 * feature, I intend to improve on it in the future.
990 *
991 * The parent has a single vm_object_pip_add() reference prior to
992 * calling us and we should return with the same.
993 *
994 * The parent has BUSY'd the pages. We should return with 'm'
995 * left busy, but the others adjusted.
996 */
997static int
998swap_pager_getpages(object, m, count, reqpage)
999 vm_object_t object;
1000 vm_page_t *m;
1001 int count, reqpage;
1002{
1003 struct buf *bp;
1004 vm_page_t mreq;
1005 int s;
1006 int i;
1007 int j;
1008 daddr_t blk;
1009 vm_offset_t kva;
1010 vm_pindex_t lastpindex;
1011
1012 GIANT_REQUIRED;
1013
1014 mreq = m[reqpage];
1015
1016 if (mreq->object != object) {
1017 panic("swap_pager_getpages: object mismatch %p/%p",
1018 object,
1019 mreq->object
1020 );
1021 }
1022 /*
1023 * Calculate range to retrieve. The pages have already been assigned
1024 * their swapblks. We require a *contiguous* range that falls entirely
1025 * within a single device stripe. If we do not supply it, bad things
1026 * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
1027 * loops are set up such that the case(s) are handled implicitly.
1028 *
1029 * The swp_*() calls must be made at splvm(). vm_page_free() does
1030 * not need to be, but it will go a little faster if it is.
1031 */
1032 s = splvm();
1033 blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1034
1035 for (i = reqpage - 1; i >= 0; --i) {
1036 daddr_t iblk;
1037
1038 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
1039 if (blk != iblk + (reqpage - i))
1040 break;
1041 if ((blk ^ iblk) & dmmax_mask)
1042 break;
1043 }
1044 ++i;
1045
1046 for (j = reqpage + 1; j < count; ++j) {
1047 daddr_t jblk;
1048
1049 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
1050 if (blk != jblk - (j - reqpage))
1051 break;
1052 if ((blk ^ jblk) & dmmax_mask)
1053 break;
1054 }
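	/*
	 * Editorial note: at this point [i, j) brackets the largest run of
	 * pages around reqpage whose swap blocks are contiguous and fall in
	 * the same device stripe; everything outside that range is freed
	 * below and only m[i..j-1] is read in.
	 */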
1055
1056 /*
1057 * free pages outside our collection range. Note: we never free
1058 * mreq, it must remain busy throughout.
1059 */
1060 {
1061 int k;
1062
1063 for (k = 0; k < i; ++k)
1064 vm_page_free(m[k]);
1065 for (k = j; k < count; ++k)
1066 vm_page_free(m[k]);
1067 }
1068 splx(s);
1069
1070
1071 /*
1072 * Return VM_PAGER_FAIL if we have nothing to do. Return mreq
1073 * still busy, but the others unbusied.
1074 */
1075 if (blk == SWAPBLK_NONE)
1076 return (VM_PAGER_FAIL);
1077
1078 /*
1079 * Get a swap buffer header to perform the IO
1080 */
1081 bp = getpbuf(&nsw_rcount);
1082 kva = (vm_offset_t) bp->b_data;
1083
1084 /*
1085 * map our page(s) into kva for input
1086 *
1087 * NOTE: B_PAGING is set by pbgetvp()
1088 */
1089 pmap_qenter(kva, m + i, j - i);
1090
1091 bp->b_iocmd = BIO_READ;
1092 bp->b_iodone = swp_pager_async_iodone;
1093 bp->b_rcred = crhold(thread0.td_ucred);
1094 bp->b_wcred = crhold(thread0.td_ucred);
1095 bp->b_data = (caddr_t) kva;
1096 bp->b_blkno = blk - (reqpage - i);
1097 bp->b_bcount = PAGE_SIZE * (j - i);
1098 bp->b_bufsize = PAGE_SIZE * (j - i);
1099 bp->b_pager.pg_reqpage = reqpage - i;
1100
1101 {
1102 int k;
1103
1104 for (k = i; k < j; ++k) {
1105 bp->b_pages[k - i] = m[k];
1106 vm_page_flag_set(m[k], PG_SWAPINPROG);
1107 }
1108 }
1109 bp->b_npages = j - i;
1110
1111 pbgetvp(swapdev_vp, bp);
1112
1113 cnt.v_swapin++;
1114 cnt.v_swappgsin += bp->b_npages;
1115
1116 /*
1117 * We still hold the lock on mreq, and our automatic completion routine
1118 * does not remove it.
1119 */
1120 vm_object_pip_add(mreq->object, bp->b_npages);
1121 lastpindex = m[j-1]->pindex;
1122
1123 /*
1124 * perform the I/O. NOTE!!! bp cannot be considered valid after
1125 * this point because we automatically release it on completion.
1126 * Instead, we look at the one page we are interested in which we
1127 * still hold a lock on even through the I/O completion.
1128 *
1129 * The other pages in our m[] array are also released on completion,
1130 * so we cannot assume they are valid anymore either.
1131 *
1132 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1133 */
1134 BUF_KERNPROC(bp);
1135 BUF_STRATEGY(bp);
1136
1137 /*
1138 * wait for the page we want to complete. PG_SWAPINPROG is always
1139 * cleared on completion. If an I/O error occurs, SWAPBLK_NONE
1140 * is set in the meta-data.
1141 */
1142 s = splvm();
1143 while ((mreq->flags & PG_SWAPINPROG) != 0) {
1144 vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
1145 cnt.v_intrans++;
1146 if (tsleep(mreq, PSWP, "swread", hz*20)) {
1147 printf(
1148 "swap_pager: indefinite wait buffer: device:"
1149 " %s, blkno: %ld, size: %ld\n",
1150 devtoname(bp->b_dev), (long)bp->b_blkno,
1151 bp->b_bcount
1152 );
1153 }
1154 }
1155 splx(s);
1156
1157 /*
1158 * mreq is left busied after completion, but all the other pages
1159 * are freed. If we had an unrecoverable read error the page will
1160 * not be valid.
1161 */
1162 if (mreq->valid != VM_PAGE_BITS_ALL) {
1163 return (VM_PAGER_ERROR);
1164 } else {
1165 return (VM_PAGER_OK);
1166 }
1167
1168 /*
1169 * A final note: in a low swap situation, we cannot deallocate swap
1170 * and mark a page dirty here because the caller is likely to mark
1171 * the page clean when we return, causing the page to possibly revert
1172 * to all-zero's later.
1173 */
1174}
1175
1176/*
1177 * swap_pager_putpages:
1178 *
1179 * Assign swap (if necessary) and initiate I/O on the specified pages.
1180 *
1181 * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
1182 * are automatically converted to SWAP objects.
1183 *
1184 * In a low memory situation we may block in VOP_STRATEGY(), but the new
1185 * vm_page reservation system coupled with properly written VFS devices
1186 * should ensure that no low-memory deadlock occurs. This is an area
1187 * which needs work.
1188 *
1189 * The parent has N vm_object_pip_add() references prior to
1190 * calling us and will remove references for rtvals[] that are
1191 * not set to VM_PAGER_PEND. We need to remove the rest on I/O
1192 * completion.
1193 *
1194 * The parent has soft-busy'd the pages it passes us and will unbusy
1195 * those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1196 * We need to unbusy the rest on I/O completion.
1197 */
1198void
1199swap_pager_putpages(object, m, count, sync, rtvals)
1200 vm_object_t object;
1201 vm_page_t *m;
1202 int count;
1203 boolean_t sync;
1204 int *rtvals;
1205{
1206 int i;
1207 int n = 0;
1208
1209 GIANT_REQUIRED;
1210 if (count && m[0]->object != object) {
1211 panic("swap_pager_putpages: object mismatch %p/%p",
1212 object,
1213 m[0]->object
1214 );
1215 }
1216 /*
1217 * Step 1
1218 *
1219 * Turn object into OBJT_SWAP
1220 * check for bogus sysops
1221 * force sync if not pageout process
1222 */
1223 if (object->type != OBJT_SWAP)
1224 swp_pager_meta_build(object, 0, SWAPBLK_NONE);
1225
1226 if (curproc != pageproc)
1227 sync = TRUE;
1228
1229 /*
1230 * Step 2
1231 *
1232 * Update nsw parameters from swap_async_max sysctl values.
1233 * Do not let the sysop crash the machine with bogus numbers.
1234 */
1235 mtx_lock(&pbuf_mtx);
1236 if (swap_async_max != nsw_wcount_async_max) {
1237 int n;
1238 int s;
1239
1240 /*
1241 * limit range
1242 */
1243 if ((n = swap_async_max) > nswbuf / 2)
1244 n = nswbuf / 2;
1245 if (n < 1)
1246 n = 1;
1247 swap_async_max = n;
1248
1249 /*
1250 * Adjust difference ( if possible ). If the current async
1251 * count is too low, we may not be able to make the adjustment
1252 * at this time.
1253 */
1254 s = splvm();
1255 n -= nsw_wcount_async_max;
1256 if (nsw_wcount_async + n >= 0) {
1257 nsw_wcount_async += n;
1258 nsw_wcount_async_max += n;
1259 wakeup(&nsw_wcount_async);
1260 }
1261 splx(s);
1262 }
1263 mtx_unlock(&pbuf_mtx);
1264
1265 /*
1266 * Step 3
1267 *
1268 * Assign swap blocks and issue I/O. We reallocate swap on the fly.
1269 * The page is left dirty until the pageout operation completes
1270 * successfully.
1271 */
1272 for (i = 0; i < count; i += n) {
1273 int s;
1274 int j;
1275 struct buf *bp;
1276 daddr_t blk;
1277
1278 /*
1279 * Maximum I/O size is limited by a number of factors.
1280 */
1281 n = min(BLIST_MAX_ALLOC, count - i);
1282 n = min(n, nsw_cluster_max);
1283
1284 s = splvm();
1285
1286 /*
1287 * Get biggest block of swap we can. If we fail, fall
1288 * back and try to allocate a smaller block. Don't go
1289 * overboard trying to allocate space if it would overly
1290 * fragment swap.
1291 */
1292 while (
1293 (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
1294 n > 4
1295 ) {
1296 n >>= 1;
1297 }
1298 if (blk == SWAPBLK_NONE) {
1299 for (j = 0; j < n; ++j)
1300 rtvals[i+j] = VM_PAGER_FAIL;
1301 splx(s);
1302 continue;
1303 }
1304
1305 /*
1306 * The I/O we are constructing cannot cross a physical
1307 * disk boundary in the swap stripe. Note: we are still
1308 * at splvm().
1309 */
1310 if ((blk ^ (blk + n)) & dmmax_mask) {
1311 j = ((blk + dmmax) & dmmax_mask) - blk;
1312 swp_pager_freeswapspace(blk + j, n - j);
1313 n = j;
1314 }
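		/*
		 * Editorial example (assuming dmmax_mask == ~(dmmax - 1)):
		 * with a dmmax of 16 pages, blk == 27 and n == 8 the run
		 * would cross the stripe boundary at 32, so j == 5; the
		 * trailing 3 blocks are returned and the I/O is trimmed to
		 * 5 pages.
		 */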
1315
1316 /*
1317 * All I/O parameters have been satisfied, build the I/O
1318 * request and assign the swap space.
1319 *
1320 * NOTE: B_PAGING is set by pbgetvp()
1321 */
1322 if (sync == TRUE) {
1323 bp = getpbuf(&nsw_wcount_sync);
1324 } else {
1325 bp = getpbuf(&nsw_wcount_async);
1326 bp->b_flags = B_ASYNC;
1327 }
1328 bp->b_iocmd = BIO_WRITE;
1329 bp->b_spc = NULL; /* not used, but NULL-out anyway */
1330
1331 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
1332
1333 bp->b_rcred = crhold(thread0.td_ucred);
1334 bp->b_wcred = crhold(thread0.td_ucred);
1335 bp->b_bcount = PAGE_SIZE * n;
1336 bp->b_bufsize = PAGE_SIZE * n;
1337 bp->b_blkno = blk;
1338
1339 pbgetvp(swapdev_vp, bp);
1340
1341 for (j = 0; j < n; ++j) {
1342 vm_page_t mreq = m[i+j];
1343
1344 swp_pager_meta_build(
1345 mreq->object,
1346 mreq->pindex,
1347 blk + j
1348 );
1349 vm_page_dirty(mreq);
1350 rtvals[i+j] = VM_PAGER_OK;
1351
1352 vm_page_flag_set(mreq, PG_SWAPINPROG);
1353 bp->b_pages[j] = mreq;
1354 }
1355 bp->b_npages = n;
1356 /*
1357 * Must set dirty range for NFS to work.
1358 */
1359 bp->b_dirtyoff = 0;
1360 bp->b_dirtyend = bp->b_bcount;
1361
1362 cnt.v_swapout++;
1363 cnt.v_swappgsout += bp->b_npages;
1364 swapdev_vp->v_numoutput++;
1365
1366 splx(s);
1367
1368 /*
1369 * asynchronous
1370 *
1371 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1372 */
1373 if (sync == FALSE) {
1374 bp->b_iodone = swp_pager_async_iodone;
1375 BUF_KERNPROC(bp);
1376 BUF_STRATEGY(bp);
1377
1378 for (j = 0; j < n; ++j)
1379 rtvals[i+j] = VM_PAGER_PEND;
1380 /* restart outer loop */
1381 continue;
1382 }
1383
1384 /*
1385 * synchronous
1386 *
1387 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
1388 */
1389 bp->b_iodone = swp_pager_sync_iodone;
1390 BUF_STRATEGY(bp);
1391
1392 /*
1393 * Wait for the sync I/O to complete, then update rtvals.
1394 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1395 * our async completion routine at the end, thus avoiding a
1396 * double-free.
1397 */
1398 s = splbio();
1399 while ((bp->b_flags & B_DONE) == 0) {
1400 tsleep(bp, PVM, "swwrt", 0);
1401 }
1402 for (j = 0; j < n; ++j)
1403 rtvals[i+j] = VM_PAGER_PEND;
1404 /*
1405 * Now that we are through with the bp, we can call the
1406 * normal async completion, which frees everything up.
1407 */
1408 swp_pager_async_iodone(bp);
1409 splx(s);
1410 }
1411}
1412
1413/*
1414 * swap_pager_sync_iodone:
1415 *
1416 * Completion routine for synchronous reads and writes from/to swap.
1417 * We just mark the bp as complete and wake up anyone waiting on it.
1418 *
1419 * This routine may not block. This routine is called at splbio() or better.
1420 */
1421static void
1422swp_pager_sync_iodone(bp)
1423 struct buf *bp;
1424{
1425 bp->b_flags |= B_DONE;
1426 bp->b_flags &= ~B_ASYNC;
1427 wakeup(bp);
1428}
1429
1430/*
1431 * swp_pager_async_iodone:
1432 *
1433 * Completion routine for asynchronous reads and writes from/to swap.
1434 * Also called manually by synchronous code to finish up a bp.
1435 *
1436 * For READ operations, the pages are PG_BUSY'd. For WRITE operations,
1437 * the pages are vm_page_t->busy'd. For READ operations we clear PG_BUSY
1438 * on all pages except the 'main' request page. For WRITE operations
1439 * we drop the vm_page_t->busy count on all pages ( we can do this
1440 * because we marked them all VM_PAGER_PEND on return from putpages ).
1441 *
1442 * This routine may not block.
1443 * This routine is called at splbio() or better
1444 *
1445 * We up ourselves to splvm() as required for various vm_page related
1446 * calls.
1447 */
1448static void
1449swp_pager_async_iodone(bp)
1450 struct buf *bp;
1451{
1452 int s;
1453 int i;
1454 vm_object_t object = NULL;
1455
1456 GIANT_REQUIRED;
1457 bp->b_flags |= B_DONE;
1458
1459 /*
1460 * report error
1461 */
1462 if (bp->b_ioflags & BIO_ERROR) {
1463 printf(
1464 "swap_pager: I/O error - %s failed; blkno %ld, "
1465 "size %ld, error %d\n",
1466 ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
1467 (long)bp->b_blkno,
1468 (long)bp->b_bcount,
1469 bp->b_error
1470 );
1471 }
1472
1473 /*
1474 * set object, raise to splvm().
1475 */
1476 if (bp->b_npages)
1477 object = bp->b_pages[0]->object;
1478 s = splvm();
1479
1480 /*
1481 * remove the mapping for kernel virtual
1482 */
1483 pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
1484
1485 /*
1486 * cleanup pages. If an error occurs writing to swap, we are in
1487 * very serious trouble. If it happens to be a disk error, though,
1488 * we may be able to recover by reassigning the swap later on. So
1489 * in this case we remove the m->swapblk assignment for the page
1490 * but do not free it in the rlist. The erroneous block(s) are thus
1491 * never reallocated as swap. Redirty the page and continue.
1492 */
1493 for (i = 0; i < bp->b_npages; ++i) {
1494 vm_page_t m = bp->b_pages[i];
1495
1496 vm_page_flag_clear(m, PG_SWAPINPROG);
1497
1498 if (bp->b_ioflags & BIO_ERROR) {
1499 /*
1500 * If an error occurs I'd love to throw the swapblk
1501 * away without freeing it back to swapspace, so it
1502 * can never be used again. But I can't from an
1503 * interrupt.
1504 */
1505 if (bp->b_iocmd == BIO_READ) {
1506 /*
1507 * When reading, reqpage needs to stay
1508 * locked for the parent, but all other
1509 * pages can be freed. We still want to
1510 * wakeup the parent waiting on the page,
1511 * though. ( also: pg_reqpage can be -1 and
1512 * not match anything ).
1513 *
1514 * We have to wake specifically requested pages
1515 * up too because we cleared PG_SWAPINPROG and
1516 * someone may be waiting for that.
1517 *
1518 * NOTE: for reads, m->dirty will probably
1519 * be overridden by the original caller of
1520 * getpages so don't play cute tricks here.
1521 *
1522 * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
1523 * AS THIS MESSES WITH object->memq, and it is
1524 * not legal to mess with object->memq from an
1525 * interrupt.
1526 */
1527 m->valid = 0;
1528 vm_page_flag_clear(m, PG_ZERO);
1529 if (i != bp->b_pager.pg_reqpage)
1530 vm_page_free(m);
1531 else
1532 vm_page_flash(m);
1533 /*
1534 * If i == bp->b_pager.pg_reqpage, do not wake
1535 * the page up. The caller needs to.
1536 */
1537 } else {
1538 /*
1539 * If a write error occurs, reactivate page
1540 * so it doesn't clog the inactive list,
1541 * then finish the I/O.
1542 */
1543 vm_page_dirty(m);
1544 vm_page_activate(m);
1545 vm_page_io_finish(m);
1546 }
1547 } else if (bp->b_iocmd == BIO_READ) {
1548 /*
1549 * For read success, clear dirty bits. Nobody should
1550 * have this page mapped but don't take any chances,
1551 * make sure the pmap modify bits are also cleared.
1552 *
1553 * NOTE: for reads, m->dirty will probably be
1554 * overridden by the original caller of getpages so
1555 * we cannot set them in order to free the underlying
1556 * swap in a low-swap situation. I don't think we'd
1557 * want to do that anyway, but it was an optimization
1558 * that existed in the old swapper for a time before
1559 * it got ripped out due to precisely this problem.
1560 *
1561 * clear PG_ZERO in page.
1562 *
1563 * If not the requested page then deactivate it.
1564 *
1565 * Note that the requested page, reqpage, is left
1566 * busied, but we still have to wake it up. The
1567 * other pages are released (unbusied) by
1568 * vm_page_wakeup(). We do not set reqpage's
1569 * valid bits here, it is up to the caller.
1570 */
1571 pmap_clear_modify(m);
1572 m->valid = VM_PAGE_BITS_ALL;
1573 vm_page_undirty(m);
1574 vm_page_flag_clear(m, PG_ZERO);
1575
1576 /*
1577 * We have to wake specifically requested pages
1578 * up too because we cleared PG_SWAPINPROG and
1579 * could be waiting for it in getpages. However,
1580 * be sure to not unbusy getpages specifically
1581 * requested page - getpages expects it to be
1582 * left busy.
1583 */
1584 if (i != bp->b_pager.pg_reqpage) {
1585 vm_page_deactivate(m);
1586 vm_page_wakeup(m);
1587 } else {
1588 vm_page_flash(m);
1589 }
1590 } else {
1591 /*
1592 * For write success, clear the modify and dirty
1593 * status, then finish the I/O ( which decrements the
1594 * busy count and possibly wakes waiters up ).
1595 */
1596 pmap_clear_modify(m);
1597 vm_page_undirty(m);
1598 vm_page_io_finish(m);
1599 if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
1600 vm_page_protect(m, VM_PROT_READ);
1601 }
1602 }
1603
1604 /*
1605 * adjust pip. NOTE: the original parent may still have its own
1606 * pip refs on the object.
1607 */
1608 if (object)
1609 vm_object_pip_wakeupn(object, bp->b_npages);
1610
1611 /*
1612 * release the physical I/O buffer
1613 */
1614 relpbuf(
1615 bp,
1616 ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
1617 ((bp->b_flags & B_ASYNC) ?
1618 &nsw_wcount_async :
1619 &nsw_wcount_sync
1620 )
1621 )
1622 );
1623 splx(s);
1624}
1625
1626/************************************************************************
1627 * SWAP META DATA *
1628 ************************************************************************
1629 *
1630 * These routines manipulate the swap metadata stored in the
1631 * OBJT_SWAP object. All swp_*() routines must be called at
1632 * splvm() because swap can be freed up by the low level vm_page
1633 * code which might be called from interrupts beyond what splbio() covers.
1634 *
1635 * Swap metadata is implemented with a global hash and not directly
1636 * linked into the object. Instead the object simply contains
1637 * appropriate tracking counters.
1638 */
1639
1640/*
1641 * SWP_PAGER_HASH() - hash swap meta data
1642 *
1643 * This is an inline helper function which hashes the swapblk given
1644 * the object and page index. It returns a pointer to a pointer
1645 * to the object, or a pointer to a NULL pointer if it could not
1646 * find a swapblk.
1647 *
1648 * This routine must be called at splvm().
1649 */
1650static __inline struct swblock **
1651swp_pager_hash(vm_object_t object, vm_pindex_t index)
1652{
1653 struct swblock **pswap;
1654 struct swblock *swap;
1655
1656 index &= ~SWAP_META_MASK;
1657 pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
1658 while ((swap = *pswap) != NULL) {
1659 if (swap->swb_object == object &&
1660 swap->swb_index == index
1661 ) {
1662 break;
1663 }
1664 pswap = &swap->swb_hnext;
1665 }
1666 return (pswap);
1667}
1668
1669/*
1670 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
1671 *
1672 * We first convert the object to a swap object if it is a default
1673 * object.
1674 *
1675 * The specified swapblk is added to the object's swap metadata. If
1676 * the swapblk is not valid, it is freed instead. Any previously
1677 * assigned swapblk is freed.
1678 *
1679 * This routine must be called at splvm(), except when used to convert
1680 * an OBJT_DEFAULT object into an OBJT_SWAP object.
1681 */
1682static void
1683swp_pager_meta_build(
1684 vm_object_t object,
1685 vm_pindex_t index,
1686 daddr_t swapblk
1687) {
1688 struct swblock *swap;
1689 struct swblock **pswap;
1690
1691 GIANT_REQUIRED;
1692 /*
1693 * Convert default object to swap object if necessary
1694 */
1695 if (object->type != OBJT_SWAP) {
1696 object->type = OBJT_SWAP;
1697 object->un_pager.swp.swp_bcount = 0;
1698
1699 mtx_lock(&sw_alloc_mtx);
1700 if (object->handle != NULL) {
1701 TAILQ_INSERT_TAIL(
1702 NOBJLIST(object->handle),
1703 object,
1704 pager_object_list
1705 );
1706 } else {
1707 TAILQ_INSERT_TAIL(
1708 &swap_pager_un_object_list,
1709 object,
1710 pager_object_list
1711 );
1712 }
1713 mtx_unlock(&sw_alloc_mtx);
1714 }
1715
1716 /*
1717 * Locate hash entry. If not found create, but if we aren't adding
1718 * anything just return. If we run out of space in the map we wait
1719 * and, since the hash table may have changed, retry.
1720 */
1721retry:
1722 pswap = swp_pager_hash(object, index);
1723
1724 if ((swap = *pswap) == NULL) {
1725 int i;
1726
1727 if (swapblk == SWAPBLK_NONE)
1728 return;
1729
1730 swap = *pswap = zalloc(swap_zone);
1731 if (swap == NULL) {
1732 VM_WAIT;
1733 goto retry;
1734 }
1735 swap->swb_hnext = NULL;
1736 swap->swb_object = object;
1737 swap->swb_index = index & ~SWAP_META_MASK;
1738 swap->swb_count = 0;
1739
1740 ++object->un_pager.swp.swp_bcount;
1741
1742 for (i = 0; i < SWAP_META_PAGES; ++i)
1743 swap->swb_pages[i] = SWAPBLK_NONE;
1744 }
1745
1746 /*
1747 * Delete prior contents of metadata
1748 */
1749 index &= SWAP_META_MASK;
1750
1751 if (swap->swb_pages[index] != SWAPBLK_NONE) {
1752 swp_pager_freeswapspace(swap->swb_pages[index], 1);
1753 --swap->swb_count;
1754 }
1755
1756 /*
1757 * Enter block into metadata
1758 */
1759 swap->swb_pages[index] = swapblk;
1760 if (swapblk != SWAPBLK_NONE)
1761 ++swap->swb_count;
1762}
1763
1764/*
1765 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
1766 *
1767 * The requested range of blocks is freed, with any associated swap
1768 * returned to the swap bitmap.
1769 *
1770 * This routine will free swap metadata structures as they are cleaned
1771 * out. This routine does *NOT* operate on swap metadata associated
1772 * with resident pages.
1773 *
1774 * This routine must be called at splvm()
1775 */
1776static void
1777swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
1778{
1779 GIANT_REQUIRED;
1780
1781 if (object->type != OBJT_SWAP)
1782 return;
1783
1784 while (count > 0) {
1785 struct swblock **pswap;
1786 struct swblock *swap;
1787
1788 pswap = swp_pager_hash(object, index);
1789
1790 if ((swap = *pswap) != NULL) {
1791 daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
1792
1793 if (v != SWAPBLK_NONE) {
1794 swp_pager_freeswapspace(v, 1);
1795 swap->swb_pages[index & SWAP_META_MASK] =
1796 SWAPBLK_NONE;
1797 if (--swap->swb_count == 0) {
1798 *pswap = swap->swb_hnext;
1799 zfree(swap_zone, swap);
1800 --object->un_pager.swp.swp_bcount;
1801 }
1802 }
1803 --count;
1804 ++index;
1805 } else {
1806 int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
1807 count -= n;
1808 index += n;
1809 }
1810 }
1811}
1812
1813/*
1814 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
1815 *
1816 * This routine locates and destroys all swap metadata associated with
1817 * an object.
1818 *
1819 * This routine must be called at splvm()
1820 */
1821static void
1822swp_pager_meta_free_all(vm_object_t object)
1823{
1824 daddr_t index = 0;
1825
1826 GIANT_REQUIRED;
1827
1828 if (object->type != OBJT_SWAP)
1829 return;
1830
1831 while (object->un_pager.swp.swp_bcount) {
1832 struct swblock **pswap;
1833 struct swblock *swap;
1834
1835 pswap = swp_pager_hash(object, index);
1836 if ((swap = *pswap) != NULL) {
1837 int i;
1838
1839 for (i = 0; i < SWAP_META_PAGES; ++i) {
1840 daddr_t v = swap->swb_pages[i];
1841 if (v != SWAPBLK_NONE) {
1842 --swap->swb_count;
1843 swp_pager_freeswapspace(v, 1);
1844 }
1845 }
1846 if (swap->swb_count != 0)
1847 panic("swp_pager_meta_free_all: swb_count != 0");
1848 *pswap = swap->swb_hnext;
1849 zfree(swap_zone, swap);
1850 --object->un_pager.swp.swp_bcount;
1851 }
1852 index += SWAP_META_PAGES;
1853 if (index > 0x20000000)
1854 panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
1855 }
1856}
1857
1858/*
1859 * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
1860 *
1861 * This routine is capable of looking up, popping, or freeing
1862 * swapblk assignments in the swap meta data or in the vm_page_t.
1863 * The routine typically returns the swapblk being looked-up, or popped,
1864 * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
1865 * was invalid. This routine will automatically free any invalid
1866 * meta-data swapblks.
1867 *
1868 * It is not possible to store invalid swapblks in the swap meta data
1869 * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
1870 *
1871 * When acting on a busy resident page and paging is in progress, we
1872 * have to wait until paging is complete but otherwise can act on the
1873 * busy page.
1874 *
1875 * This routine must be called at splvm().
1876 *
1877 * SWM_FREE remove and free swap block from metadata
1878 * SWM_POP remove from meta data but do not free; pop it out
1879 */
1880static daddr_t
1881swp_pager_meta_ctl(
1882 vm_object_t object,
1883 vm_pindex_t index,
1884 int flags
1885) {
1886 struct swblock **pswap;
1887 struct swblock *swap;
1888 daddr_t r1;
1889
1890 GIANT_REQUIRED;
1891 /*
1892 * The meta data only exists if the object is OBJT_SWAP
1893 * and even then might not be allocated yet.
1894 */
1895 if (object->type != OBJT_SWAP)
1896 return (SWAPBLK_NONE);
1897
1898 r1 = SWAPBLK_NONE;
1899 pswap = swp_pager_hash(object, index);
1900
1901 if ((swap = *pswap) != NULL) {
1902 index &= SWAP_META_MASK;
1903 r1 = swap->swb_pages[index];
1904
1905 if (r1 != SWAPBLK_NONE) {
1906 if (flags & SWM_FREE) {
1907 swp_pager_freeswapspace(r1, 1);
1908 r1 = SWAPBLK_NONE;
1909 }
1910 if (flags & (SWM_FREE|SWM_POP)) {
1911 swap->swb_pages[index] = SWAPBLK_NONE;
1912 if (--swap->swb_count == 0) {
1913 *pswap = swap->swb_hnext;
1914 zfree(swap_zone, swap);
1915 --object->un_pager.swp.swp_bcount;
1916 }
1917 }
1918 }
1919 }
1920 return (r1);
1921}
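
/*
 * Usage sketch (editorial illustration): the two common ways to drive the
 * routine above.  The wrapper names are hypothetical; only
 * swp_pager_meta_ctl() and the SWM_* flags come from this file.  Callers
 * must hold splvm().
 */
static daddr_t
example_lookup_swapblk(vm_object_t object, vm_pindex_t pindex)
{
	/* flags == 0: report the assignment without disturbing the metadata */
	return (swp_pager_meta_ctl(object, pindex, 0));
}

static void
example_forget_swapblk(vm_object_t object, vm_pindex_t pindex)
{
	/* SWM_FREE: tear out the assignment and return the block to the bitmap */
	(void)swp_pager_meta_ctl(object, pindex, SWM_FREE);
}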
1922
1923/********************************************************
1924 * CHAINING FUNCTIONS *
1925 ********************************************************
1926 *
1927 * These functions support recursion of I/O operations
1928 * on bp's, typically by chaining one or more 'child' bp's
1929 * to the parent. Synchronous, asynchronous, and semi-synchronous
1930 * chaining is possible.
1931 */
1932
1933/*
1934 * vm_pager_chain_iodone:
1935 *
1936 * I/O completion routine for a child bp. Currently we fudge a bit
1937 * on dealing with b_resid. Since users of these routines may issue
1938 * multiple children simultaneously, the ordering of errors can be lost.
1939 */
1940static void
1941vm_pager_chain_iodone(struct buf *nbp)
1942{
1943 struct bio *bp;
1944 u_int *count;
1945
1946 bp = nbp->b_caller1;
1947 if (bp != NULL) {
1948 count = (u_int *)&(bp->bio_driver1);
1949 if (nbp->b_ioflags & BIO_ERROR) {
1950 bp->bio_flags |= BIO_ERROR;
1951 bp->bio_error = nbp->b_error;
1952 } else if (nbp->b_resid != 0) {
1953 bp->bio_flags |= BIO_ERROR;
1954 bp->bio_error = EINVAL;
1955 } else {
1956 bp->bio_resid -= nbp->b_bcount;
1957 }
1958 nbp->b_caller1 = NULL;
1959 --(*count);
1960 if (bp->bio_flags & BIO_FLAG1) {
1961 bp->bio_flags &= ~BIO_FLAG1;
1962 wakeup(bp);
1963 }
1964 }
1965 nbp->b_flags |= B_DONE;
1966 nbp->b_flags &= ~B_ASYNC;
1967 relpbuf(nbp, NULL);
1968}
1969
1970/*
1971 * getchainbuf:
1972 *
1973 * Obtain a physical buffer and chain it to its parent buffer. When
1974 * I/O completes, a parent sleeping in waitchainbuf() is woken up.
1975 * Errors are automatically propagated to the parent.
1976 */
1977struct buf *
1978getchainbuf(struct bio *bp, struct vnode *vp, int flags)
1979{
1980 struct buf *nbp;
1981 u_int *count;
1982
1983 GIANT_REQUIRED;
1984 nbp = getpbuf(NULL);
1985 count = (u_int *)&(bp->bio_driver1);
1986
1987 nbp->b_caller1 = bp;
1988 ++(*count);
1989
1990 if (*count > 4)
1991 waitchainbuf(bp, 4, 0);
1992
1993 nbp->b_iocmd = bp->bio_cmd;
1994 nbp->b_ioflags = 0;
1995 nbp->b_flags = flags;
1996 nbp->b_rcred = crhold(thread0.td_ucred);
1997 nbp->b_wcred = crhold(thread0.td_ucred);
1998 nbp->b_iodone = vm_pager_chain_iodone;
1999
2000 if (vp)
2001 pbgetvp(vp, nbp);
2002 return (nbp);
2003}
2004
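/*
 * flushchainbuf:
 *
 *	Issue a child buffer obtained from getchainbuf().  A buffer with
 *	no data to transfer (b_bcount == 0) is completed immediately via
 *	bufdone() instead of being handed to BUF_STRATEGY().
 */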
2005void
2006flushchainbuf(struct buf *nbp)
2007{
2008 GIANT_REQUIRED;
2009 if (nbp->b_bcount) {
2010 nbp->b_bufsize = nbp->b_bcount;
2011 if (nbp->b_iocmd == BIO_WRITE)
2012 nbp->b_dirtyend = nbp->b_bcount;
2013 BUF_KERNPROC(nbp);
2014 BUF_STRATEGY(nbp);
2015 } else {
2016 bufdone(nbp);
2017 }
2018}
2019
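/*
 * waitchainbuf:
 *
 *	Wait until no more than 'limit' child buffers remain outstanding on
 *	the parent bio.  If 'done' is set, the parent is then completed with
 *	biodone(); a leftover bio_resid without an accompanying error is
 *	reported as EINVAL.
 */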
2020static void
2021waitchainbuf(struct bio *bp, int limit, int done)
2022{
2023 int s;
2024 u_int *count;
2025
2026 GIANT_REQUIRED;
2027 count = (u_int *)&(bp->bio_driver1);
2028 s = splbio();
2029 while (*count > limit) {
2030 bp->bio_flags |= BIO_FLAG1;
2031 tsleep(bp, PRIBIO + 4, "bpchain", 0);
2032 }
2033 if (done) {
2034 if (bp->bio_resid != 0 && !(bp->bio_flags & BIO_ERROR)) {
2035 bp->bio_flags |= BIO_ERROR;
2036 bp->bio_error = EINVAL;
2037 }
2038 biodone(bp);
2039 }
2040 splx(s);
2041}
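
/*
 * Usage sketch (editorial illustration): the intended life cycle of the
 * chaining API above, as a strategy-style routine would drive it.  The
 * function name, the vnode, and the elided per-chunk buffer setup are
 * hypothetical; getchainbuf(), flushchainbuf() and waitchainbuf() are the
 * routines defined above.
 */
static void
example_chained_io(struct bio *bp, struct vnode *vp)
{
	struct buf *nbp;

	/* one child per contiguous chunk; getchainbuf() throttles at 4 children */
	nbp = getchainbuf(bp, vp, B_ASYNC);
	/* ... fill in nbp->b_data, b_bcount and b_blkno for this chunk ... */
	flushchainbuf(nbp);		/* issue, or complete immediately if empty */

	/* repeat the above for each additional chunk, then drain and finish */
	waitchainbuf(bp, 0, 1);		/* sleeps until all children are done, then biodone(bp) */
}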
2042