1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 *	Default Pager.
59 *		Paging File Management.
60 */
61
62#include <mach/host_priv.h>
63#include <mach/memory_object_control.h>
64#include <mach/memory_object_server.h>
65#include <mach/upl.h>
66#include <default_pager/default_pager_internal.h>
67#include <default_pager/default_pager_alerts.h>
68#include <default_pager/default_pager_object_server.h>
69
70#include <ipc/ipc_types.h>
71#include <ipc/ipc_port.h>
72#include <ipc/ipc_space.h>
73
74#include <kern/kern_types.h>
75#include <kern/host.h>
76#include <kern/queue.h>
77#include <kern/counters.h>
78#include <kern/sched_prim.h>
79
80#include <vm/vm_kern.h>
81#include <vm/vm_pageout.h>
82#include <vm/vm_map.h>
83#include <vm/vm_object.h>
84#include <vm/vm_protos.h>
85
86
87/* todo - need large internal object support */
88
89/*
90 * ALLOC_STRIDE... the maximum number of bytes allocated from
91 * a swap file before moving on to the next swap file... if
92 * all swap files reside on a single disk, this value should
93 * be very large (this is the default assumption)... if the
94 * swap files are spread across multiple disks, than this value
95 * should be small (128 * 1024)...
96 *
97 * This should be determined dynamically in the future
98 */
99
100#define ALLOC_STRIDE  (1024 * 1024 * 1024)
101int physical_transfer_cluster_count = 0;
102
103#define VM_SUPER_CLUSTER	0x40000
104#define VM_SUPER_PAGES          (VM_SUPER_CLUSTER / PAGE_MIN_SIZE)
105
106/*
107 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
108 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
109 */
110#define VSTRUCT_MIN_CLSHIFT	0
111
112#define VSTRUCT_DEF_CLSHIFT	2
113int default_pager_clsize = 0;
114
115int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
116
117/* statistics */
118unsigned int clustered_writes[VM_SUPER_PAGES+1];
119unsigned int clustered_reads[VM_SUPER_PAGES+1];
120
121/*
122 * Globals used for asynchronous paging operations:
123 * 	vs_async_list:	head of list of to-be-completed I/O ops
124 *	async_num_queued: number of pages completed, but not yet
125 *		processed by async thread.
126 *	async_requests_out: number of pages of requests not completed.
127 */
128
129#if 0
130struct vs_async *vs_async_list;
131int	async_num_queued;
132int	async_requests_out;
133#endif
134
135
136#define VS_ASYNC_REUSE 1
137struct vs_async *vs_async_free_list;
138
139lck_mtx_t	default_pager_async_lock;	/* Protects globals above */
140
141
142int vs_alloc_async_failed = 0;			/* statistics */
143int vs_alloc_async_count = 0;			/* statistics */
144struct vs_async *vs_alloc_async(void);		/* forward */
145void vs_free_async(struct vs_async *vsa);	/* forward */
146
147
148#define VS_ALLOC_ASYNC()	vs_alloc_async()
149#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)
150
151#define VS_ASYNC_LOCK()		lck_mtx_lock(&default_pager_async_lock)
152#define VS_ASYNC_UNLOCK()	lck_mtx_unlock(&default_pager_async_lock)
153#define VS_ASYNC_LOCK_INIT()	lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
154#define VS_ASYNC_LOCK_DESTROY()	lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
155#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)
156/*
157 *  Paging Space Hysteresis triggers and the target notification port
158 *
159 */
160unsigned int	dp_pages_free_drift_count = 0;
161unsigned int	dp_pages_free_drifted_max = 0;
162unsigned int	minimum_pages_remaining	= 0;
163unsigned int	maximum_pages_free = 0;
164ipc_port_t	min_pages_trigger_port = NULL;
165ipc_port_t	max_pages_trigger_port = NULL;
166
167#if CONFIG_FREEZE
168boolean_t	use_emergency_swap_file_first = TRUE;
169#else
170boolean_t	use_emergency_swap_file_first = FALSE;
171#endif
172boolean_t	bs_low = FALSE;
173int		backing_store_release_trigger_disable = 0;
174boolean_t	backing_store_stop_compaction = FALSE;
175boolean_t	backing_store_abort_compaction = FALSE;
176
177/* Have we decided if swap needs to be encrypted yet ? */
178boolean_t	dp_encryption_inited = FALSE;
179/* Should we encrypt swap ? */
180boolean_t	dp_encryption = FALSE;
181
182boolean_t	dp_isssd = FALSE;
183
184/*
185 * Object sizes are rounded up to the next power of 2,
186 * unless they are bigger than a given maximum size.
187 */
188vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */
189
190/*
191 * List of all backing store and segments.
192 */
193MACH_PORT_FACE		emergency_segment_backing_store;
194struct backing_store_list_head backing_store_list;
195paging_segment_t	paging_segments[MAX_NUM_PAGING_SEGMENTS];
196lck_mtx_t			paging_segments_lock;
197int			paging_segment_max = 0;
198int			paging_segment_count = 0;
199int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };
200
201
202/*
203 * Total pages free in system
204 * This differs from clusters committed/avail which is a measure of the
205 * over commitment of paging segments to backing store.  An idea which is
206 * likely to be deprecated.
207 */
208unsigned  int	dp_pages_free = 0;
209unsigned  int	dp_pages_reserve = 0;
210unsigned  int	cluster_transfer_minimum = 100;
211
212/*
213 * Trim state
214 */
215struct ps_vnode_trim_data {
216	struct vnode *vp;
217	dp_offset_t   offset;
218	dp_size_t     length;
219};
220
221/* forward declarations */
222kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);	/* forward */
223kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);	/* forward */
224default_pager_thread_t *get_read_buffer( void );
225kern_return_t ps_vstruct_transfer_from_segment(
226	vstruct_t	 vs,
227	paging_segment_t segment,
228	upl_t		 upl);
229kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);	/* forward */
230kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);	/* forward */
231kern_return_t vs_cluster_transfer(
232	vstruct_t	vs,
233	dp_offset_t	offset,
234	dp_size_t	cnt,
235	upl_t		upl);
236vs_map_t vs_get_map_entry(
237	vstruct_t	vs,
238	dp_offset_t	offset);
239
240kern_return_t
241default_pager_backing_store_delete_internal( MACH_PORT_FACE );
242
243static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
244static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
245static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);
246
247default_pager_thread_t *
248get_read_buffer( void )
249{
250	int	i;
251
252	DPT_LOCK(dpt_lock);
253	while(TRUE) {
254		for (i=0; i<default_pager_internal_count; i++) {
255			if(dpt_array[i]->checked_out == FALSE) {
256			  dpt_array[i]->checked_out = TRUE;
257			  DPT_UNLOCK(dpt_lock);
258			  return  dpt_array[i];
259			}
260		}
261		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
262	}
263}
264
265void
266bs_initialize(void)
267{
268	int i;
269
270	/*
271	 * List of all backing store.
272	 */
273	BSL_LOCK_INIT();
274	queue_init(&backing_store_list.bsl_queue);
275	PSL_LOCK_INIT();
276
277	VS_ASYNC_LOCK_INIT();
278#if	VS_ASYNC_REUSE
279	vs_async_free_list = NULL;
280#endif	/* VS_ASYNC_REUSE */
281
282	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
283		clustered_writes[i] = 0;
284		clustered_reads[i] = 0;
285	}
286
287}
288
289/*
290 * When things do not quite workout...
291 */
292void bs_no_paging_space(boolean_t);	/* forward */
293
294void
295bs_no_paging_space(
296	boolean_t out_of_memory)
297{
298
299	if (out_of_memory)
300		dprintf(("*** OUT OF MEMORY ***\n"));
301	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
302}
303
304void bs_more_space(int);	/* forward */
305void bs_commit(int);		/* forward */
306
307boolean_t	user_warned = FALSE;
308unsigned int	clusters_committed = 0;
309unsigned int	clusters_available = 0;
310unsigned int	clusters_committed_peak = 0;
311
312void
313bs_more_space(
314	int	nclusters)
315{
316	BSL_LOCK();
317	/*
318	 * Account for new paging space.
319	 */
320	clusters_available += nclusters;
321
322	if (clusters_available >= clusters_committed) {
323		if (verbose && user_warned) {
324			printf("%s%s - %d excess clusters now.\n",
325			       my_name,
326			       "paging space is OK now",
327			       clusters_available - clusters_committed);
328			user_warned = FALSE;
329			clusters_committed_peak = 0;
330		}
331	} else {
332		if (verbose && user_warned) {
333			printf("%s%s - still short of %d clusters.\n",
334			       my_name,
335			       "WARNING: paging space over-committed",
336			       clusters_committed - clusters_available);
337			clusters_committed_peak -= nclusters;
338		}
339	}
340	BSL_UNLOCK();
341
342	return;
343}
344
345void
346bs_commit(
347	int	nclusters)
348{
349	BSL_LOCK();
350	clusters_committed += nclusters;
351	if (clusters_committed > clusters_available) {
352		if (verbose && !user_warned) {
353			user_warned = TRUE;
354			printf("%s%s - short of %d clusters.\n",
355			       my_name,
356			       "WARNING: paging space over-committed",
357			       clusters_committed - clusters_available);
358		}
359		if (clusters_committed > clusters_committed_peak) {
360			clusters_committed_peak = clusters_committed;
361		}
362	} else {
363		if (verbose && user_warned) {
364			printf("%s%s - was short of up to %d clusters.\n",
365			       my_name,
366			       "paging space is OK now",
367			       clusters_committed_peak - clusters_available);
368			user_warned = FALSE;
369			clusters_committed_peak = 0;
370		}
371	}
372	BSL_UNLOCK();
373
374	return;
375}
376
377int default_pager_info_verbose = 1;
378
379void
380bs_global_info(
381	uint64_t	*totalp,
382	uint64_t	*freep)
383{
384	uint64_t		pages_total, pages_free;
385	paging_segment_t	ps;
386	int			i;
387
388	PSL_LOCK();
389	pages_total = pages_free = 0;
390	for (i = 0; i <= paging_segment_max; i++) {
391		ps = paging_segments[i];
392		if (ps == PAGING_SEGMENT_NULL)
393			continue;
394
395		/*
396		 * no need to lock: by the time this data
397		 * gets back to any remote requestor it
398		 * will be obsolete anyways
399		 */
400		pages_total += ps->ps_pgnum;
401		pages_free += ps->ps_clcount << ps->ps_clshift;
402		DP_DEBUG(DEBUG_BS_INTERNAL,
403			 ("segment #%d: %d total, %d free\n",
404			  i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
405	}
406	*totalp = pages_total;
407	*freep = pages_free;
408	if (verbose && user_warned && default_pager_info_verbose) {
409		if (clusters_available < clusters_committed) {
410			printf("%s %d clusters committed, %d available.\n",
411			       my_name,
412			       clusters_committed,
413			       clusters_available);
414		}
415	}
416	PSL_UNLOCK();
417}
418
419backing_store_t backing_store_alloc(void);	/* forward */
420
421backing_store_t
422backing_store_alloc(void)
423{
424	backing_store_t bs;
425
426	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
427	if (bs == BACKING_STORE_NULL)
428		panic("backing_store_alloc: no memory");
429
430	BS_LOCK_INIT(bs);
431	bs->bs_port = MACH_PORT_NULL;
432	bs->bs_priority = 0;
433	bs->bs_clsize = 0;
434	bs->bs_pages_total = 0;
435	bs->bs_pages_in = 0;
436	bs->bs_pages_in_fail = 0;
437	bs->bs_pages_out = 0;
438	bs->bs_pages_out_fail = 0;
439
440	return bs;
441}
442
443backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */
444
445/* Even in both the component space and external versions of this pager, */
446/* backing_store_lookup will be called from tasks in the application space */
447backing_store_t
448backing_store_lookup(
449	MACH_PORT_FACE port)
450{
451	backing_store_t	bs;
452
453/*
454	port is currently backed with a vs structure in the alias field
455	we could create an ISBS alias and a port_is_bs call but frankly
456	I see no reason for the test, the bs->port == port check below
457	will work properly on junk entries.
458
459	if ((port == MACH_PORT_NULL) || port_is_vs(port))
460*/
461	if (port == MACH_PORT_NULL)
462		return BACKING_STORE_NULL;
463
464	BSL_LOCK();
465	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
466		      bs_links) {
467		BS_LOCK(bs);
468		if (bs->bs_port == port) {
469			BSL_UNLOCK();
470			/* Success, return it locked. */
471			return bs;
472		}
473		BS_UNLOCK(bs);
474	}
475	BSL_UNLOCK();
476	return BACKING_STORE_NULL;
477}
478
479void backing_store_add(backing_store_t);	/* forward */
480
481void
482backing_store_add(
483	__unused backing_store_t bs)
484{
485//	MACH_PORT_FACE		port = bs->bs_port;
486//	MACH_PORT_FACE		pset = default_pager_default_set;
487	kern_return_t		kr = KERN_SUCCESS;
488
489	if (kr != KERN_SUCCESS)
490		panic("backing_store_add: add to set");
491
492}
493
494/*
495 * Set up default page shift, but only if not already
496 * set and argument is within range.
497 */
498boolean_t
499bs_set_default_clsize(unsigned int npages)
500{
501	switch(npages){
502	    case 1:
503	    case 2:
504	    case 4:
505	    case 8:
506		if (default_pager_clsize == 0)	/* if not yet set */
507			vstruct_def_clshift = local_log2(npages);
508		return(TRUE);
509	}
510	return(FALSE);
511}
512
513int bs_get_global_clsize(int clsize);	/* forward */
514
515int
516bs_get_global_clsize(
517	int	clsize)
518{
519	int			i;
520	memory_object_default_t	dmm;
521	kern_return_t		kr;
522
523	/*
524	 * Only allow setting of cluster size once. If called
525	 * with no cluster size (default), we use the compiled-in default
526	 * for the duration. The same cluster size is used for all
527	 * paging segments.
528	 */
529	if (default_pager_clsize == 0) {
530		/*
531		 * Keep cluster size in bit shift because it's quicker
532		 * arithmetic, and easier to keep at a power of 2.
533		 */
534		if (clsize != NO_CLSIZE) {
535			for (i = 0; (1 << i) < clsize; i++);
536			if (i > MAX_CLUSTER_SHIFT)
537				i = MAX_CLUSTER_SHIFT;
538			vstruct_def_clshift = i;
539		}
540		default_pager_clsize = (1 << vstruct_def_clshift);
541
542		/*
543		 * Let the user know the new (and definitive) cluster size.
544		 */
545		if (verbose)
546			printf("%scluster size = %d page%s\n",
547		       		my_name, default_pager_clsize,
548		       		(default_pager_clsize == 1) ? "" : "s");
549
550		/*
551		 * Let the kernel know too, in case it hasn't used the
552		 * default value provided in main() yet.
553		 */
554		dmm = default_pager_object;
555		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
556		kr = host_default_memory_manager(host_priv_self(),
557						 &dmm,
558						 clsize);
559		memory_object_default_deallocate(dmm);
560
561		if (kr != KERN_SUCCESS) {
562		   panic("bs_get_global_cl_size:host_default_memory_manager");
563		}
564		if (dmm != default_pager_object) {
565		  panic("bs_get_global_cl_size:there is another default pager");
566		}
567	}
568	ASSERT(default_pager_clsize > 0 &&
569	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);
570
571	return default_pager_clsize;
572}
573
574kern_return_t
575default_pager_backing_store_create(
576	memory_object_default_t	pager,
577	int			priority,
578	int			clsize,		/* in bytes */
579	MACH_PORT_FACE		*backing_store)
580{
581	backing_store_t	bs;
582	MACH_PORT_FACE	port;
583//	kern_return_t	kr;
584	struct vstruct_alias *alias_struct;
585
586	if (pager != default_pager_object)
587		return KERN_INVALID_ARGUMENT;
588
589	bs = backing_store_alloc();
590	port = ipc_port_alloc_kernel();
591	ipc_port_make_send(port);
592	assert (port != IP_NULL);
593
594	DP_DEBUG(DEBUG_BS_EXTERNAL,
595		 ("priority=%d clsize=%d bs_port=0x%x\n",
596		  priority, clsize, (int) backing_store));
597
598	alias_struct = (struct vstruct_alias *)
599				kalloc(sizeof (struct vstruct_alias));
600	if(alias_struct != NULL) {
601		alias_struct->vs = (struct vstruct *)bs;
602		alias_struct->name = &default_pager_ops;
603		port->ip_alias = (uintptr_t) alias_struct;
604	}
605	else {
606		ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));
607
608		BS_LOCK_DESTROY(bs);
609		kfree(bs, sizeof (struct backing_store));
610
611		return KERN_RESOURCE_SHORTAGE;
612	}
613
614	bs->bs_port = port;
615	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
616		priority = BS_MAXPRI;
617	else if (priority == BS_NOPRI)
618		priority = BS_MAXPRI;
619	else
620		priority = BS_MINPRI;
621	bs->bs_priority = priority;
622
623	bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));
624
625	BSL_LOCK();
626	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
627		    bs_links);
628	BSL_UNLOCK();
629
630	backing_store_add(bs);
631
632	*backing_store = port;
633	return KERN_SUCCESS;
634}
635
636kern_return_t
637default_pager_backing_store_info(
638	MACH_PORT_FACE		backing_store,
639	backing_store_flavor_t	flavour,
640	backing_store_info_t	info,
641	mach_msg_type_number_t	*size)
642{
643	backing_store_t			bs;
644	backing_store_basic_info_t	basic;
645	int				i;
646	paging_segment_t		ps;
647
648	if (flavour != BACKING_STORE_BASIC_INFO ||
649	    *size < BACKING_STORE_BASIC_INFO_COUNT)
650		return KERN_INVALID_ARGUMENT;
651
652	basic = (backing_store_basic_info_t)info;
653	*size = BACKING_STORE_BASIC_INFO_COUNT;
654
655	VSTATS_LOCK(&global_stats.gs_lock);
656	basic->pageout_calls	= global_stats.gs_pageout_calls;
657	basic->pagein_calls	= global_stats.gs_pagein_calls;
658	basic->pages_in		= global_stats.gs_pages_in;
659	basic->pages_out	= global_stats.gs_pages_out;
660	basic->pages_unavail	= global_stats.gs_pages_unavail;
661	basic->pages_init	= global_stats.gs_pages_init;
662	basic->pages_init_writes= global_stats.gs_pages_init_writes;
663	VSTATS_UNLOCK(&global_stats.gs_lock);
664
665	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
666		return KERN_INVALID_ARGUMENT;
667
668	basic->bs_pages_total	= bs->bs_pages_total;
669	PSL_LOCK();
670	bs->bs_pages_free = 0;
671	for (i = 0; i <= paging_segment_max; i++) {
672		ps = paging_segments[i];
673		if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
674			PS_LOCK(ps);
675			bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
676			PS_UNLOCK(ps);
677		}
678	}
679	PSL_UNLOCK();
680	basic->bs_pages_free	= bs->bs_pages_free;
681	basic->bs_pages_in	= bs->bs_pages_in;
682	basic->bs_pages_in_fail	= bs->bs_pages_in_fail;
683	basic->bs_pages_out	= bs->bs_pages_out;
684	basic->bs_pages_out_fail= bs->bs_pages_out_fail;
685
686	basic->bs_priority	= bs->bs_priority;
687	basic->bs_clsize	= ptoa_32(bs->bs_clsize);	/* in bytes */
688
689	BS_UNLOCK(bs);
690
691	return KERN_SUCCESS;
692}
693
694int ps_delete(paging_segment_t);	/* forward */
695boolean_t current_thread_aborted(void);
696
697int
698ps_delete(
699	paging_segment_t ps)
700{
701	vstruct_t	vs;
702	kern_return_t	error = KERN_SUCCESS;
703	int		vs_count;
704
705	VSL_LOCK();  		/* get the lock on the list of vs's	 */
706
707	/* The lock relationship and sequence is farily complicated  	 */
708	/* this code looks at a live list, locking and unlocking the list */
709	/* as it traverses it.  It depends on the locking behavior of	 */
710	/* default_pager_no_senders.  no_senders always locks the vstruct */
711	/* targeted for removal before locking the vstruct list.  However */
712	/* it will remove that member of the list without locking its    */
713	/* neighbors.  We can be sure when we hold a lock on a vstruct   */
714	/* it cannot be removed from the list but we must hold the list  */
715	/* lock to be sure that its pointers to its neighbors are valid. */
716	/* Also, we can hold off destruction of a vstruct when the list  */
717	/* lock and the vs locks are not being held by bumping the 	 */
718	/* vs_async_pending count.      */
719
720
721	while(backing_store_release_trigger_disable != 0) {
722		VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
723	}
724
725	/* we will choose instead to hold a send right */
726	vs_count = vstruct_list.vsl_count;
727	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
728	if(vs == (vstruct_t)&vstruct_list)  {
729		VSL_UNLOCK();
730		return KERN_SUCCESS;
731	}
732	VS_LOCK(vs);
733	vs_async_wait(vs);  /* wait for any pending async writes */
734	if ((vs_count != 0) && (vs != NULL))
735		vs->vs_async_pending += 1;  /* hold parties calling  */
736					    /* vs_async_wait */
737
738	if (bs_low == FALSE)
739		backing_store_abort_compaction = FALSE;
740
741	VS_UNLOCK(vs);
742	VSL_UNLOCK();
743	while((vs_count != 0) && (vs != NULL)) {
744		/* We take the count of AMO's before beginning the         */
745		/* transfer of of the target segment.                      */
746		/* We are guaranteed that the target segment cannot get    */
747		/* more users.  We also know that queue entries are        */
748		/* made at the back of the list.  If some of the entries   */
749		/* we would check disappear while we are traversing the    */
750		/* list then we will either check new entries which        */
751		/* do not have any backing store in the target segment     */
752		/* or re-check old entries.  This might not be optimal     */
753		/* but it will always be correct. The alternative is to    */
754		/* take a snapshot of the list.			   	   */
755		vstruct_t	next_vs;
756
757		if(dp_pages_free < cluster_transfer_minimum)
758			error = KERN_FAILURE;
759		else {
760			vm_object_t	transfer_object;
761			unsigned int	count;
762			upl_t		upl;
763			int		upl_flags;
764
765			transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
766			count = 0;
767			upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
768				     UPL_SET_LITE | UPL_SET_INTERNAL);
769			if (dp_encryption) {
770				/* mark the pages as "encrypted" when they come in */
771				upl_flags |= UPL_ENCRYPT;
772			}
773			error = vm_object_upl_request(transfer_object,
774				(vm_object_offset_t)0, VM_SUPER_CLUSTER,
775				&upl, NULL, &count, upl_flags);
776
777			if(error == KERN_SUCCESS) {
778				error = ps_vstruct_transfer_from_segment(
779							vs, ps, upl);
780				upl_commit(upl, NULL, 0);
781				upl_deallocate(upl);
782			} else {
783				error = KERN_FAILURE;
784			}
785			vm_object_deallocate(transfer_object);
786		}
787		if(error || current_thread_aborted()) {
788			VS_LOCK(vs);
789			vs->vs_async_pending -= 1;  /* release vs_async_wait */
790			if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
791				vs->vs_waiting_async = FALSE;
792				VS_UNLOCK(vs);
793				thread_wakeup(&vs->vs_async_pending);
794			} else {
795				VS_UNLOCK(vs);
796			}
797			return KERN_FAILURE;
798		}
799
800		VSL_LOCK();
801
802		while(backing_store_release_trigger_disable != 0) {
803			VSL_SLEEP(&backing_store_release_trigger_disable,
804				  THREAD_UNINT);
805		}
806
807		next_vs = (vstruct_t) queue_next(&(vs->vs_links));
808		if((next_vs != (vstruct_t)&vstruct_list) &&
809				(vs != next_vs) && (vs_count != 1)) {
810			VS_LOCK(next_vs);
811			vs_async_wait(next_vs);  /* wait for any  */
812						 /* pending async writes */
813			next_vs->vs_async_pending += 1; /* hold parties  */
814						/* calling vs_async_wait */
815			VS_UNLOCK(next_vs);
816		}
817		VSL_UNLOCK();
818		VS_LOCK(vs);
819		vs->vs_async_pending -= 1;
820		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
821			vs->vs_waiting_async = FALSE;
822			VS_UNLOCK(vs);
823			thread_wakeup(&vs->vs_async_pending);
824		} else {
825			VS_UNLOCK(vs);
826		}
827		if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
828			vs = NULL;
829		else
830			vs = next_vs;
831		vs_count--;
832	}
833	return KERN_SUCCESS;
834}
835
836
837kern_return_t
838default_pager_backing_store_delete_internal(
839	MACH_PORT_FACE backing_store)
840{
841	backing_store_t		bs;
842	int			i;
843	paging_segment_t	ps;
844	int			error;
845	int			interim_pages_removed = 0;
846	boolean_t		dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );
847
848	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
849		return KERN_INVALID_ARGUMENT;
850
851restart:
852	PSL_LOCK();
853	error = KERN_SUCCESS;
854	for (i = 0; i <= paging_segment_max; i++) {
855		ps = paging_segments[i];
856		if (ps != PAGING_SEGMENT_NULL &&
857		    ps->ps_bs == bs &&
858		    ! IS_PS_GOING_AWAY(ps)) {
859			PS_LOCK(ps);
860
861			if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
862			/*
863			 * Someone is already busy reclamining this paging segment.
864			 * If it's the emergency segment we are looking at then check
865			 * that someone has not already recovered it and set the right
866			 * state i.e. online but not activated.
867			 */
868				PS_UNLOCK(ps);
869				continue;
870			}
871
872			/* disable access to this segment */
873			ps->ps_state &= ~PS_CAN_USE;
874			ps->ps_state |= PS_GOING_AWAY;
875			PS_UNLOCK(ps);
876			/*
877			 * The "ps" segment is "off-line" now,
878			 * we can try and delete it...
879			 */
880			if(dp_pages_free < (cluster_transfer_minimum
881				 			+ ps->ps_pgcount)) {
882				error = KERN_FAILURE;
883				PSL_UNLOCK();
884			}
885			else {
886				/* remove all pages associated with the  */
887				/* segment from the list of free pages   */
888				/* when transfer is through, all target  */
889				/* segment pages will appear to be free  */
890
891				dp_pages_free -=  ps->ps_pgcount;
892				interim_pages_removed += ps->ps_pgcount;
893				PSL_UNLOCK();
894				error = ps_delete(ps);
895			}
896			if (error != KERN_SUCCESS) {
897				/*
898				 * We couldn't delete the segment,
899				 * probably because there's not enough
900				 * virtual memory left.
901				 * Re-enable all the segments.
902				 */
903				PSL_LOCK();
904				break;
905			}
906			goto restart;
907		}
908	}
909
910	if (error != KERN_SUCCESS) {
911		for (i = 0; i <= paging_segment_max; i++) {
912			ps = paging_segments[i];
913			if (ps != PAGING_SEGMENT_NULL &&
914			    ps->ps_bs == bs &&
915			    IS_PS_GOING_AWAY(ps)) {
916				PS_LOCK(ps);
917
918				if( !IS_PS_GOING_AWAY(ps)) {
919					PS_UNLOCK(ps);
920					continue;
921				}
922				/* Handle the special clusters that came in while we let go the lock*/
923				if( ps->ps_special_clusters) {
924					dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
925					ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
926					ps->ps_clcount += ps->ps_special_clusters;
927					if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
928						ps_select_array[ps->ps_bs->bs_priority] = 0;
929					}
930					ps->ps_special_clusters = 0;
931				}
932				/* re-enable access to this segment */
933				ps->ps_state &= ~PS_GOING_AWAY;
934				ps->ps_state |= PS_CAN_USE;
935				PS_UNLOCK(ps);
936			}
937		}
938		dp_pages_free += interim_pages_removed;
939		PSL_UNLOCK();
940		BS_UNLOCK(bs);
941		return error;
942	}
943
944	for (i = 0; i <= paging_segment_max; i++) {
945		ps = paging_segments[i];
946		if (ps != PAGING_SEGMENT_NULL &&
947		    ps->ps_bs == bs) {
948			if(IS_PS_GOING_AWAY(ps)) {
949				if(IS_PS_EMERGENCY_SEGMENT(ps)) {
950					PS_LOCK(ps);
951					ps->ps_state &= ~PS_GOING_AWAY;
952					ps->ps_special_clusters = 0;
953					ps->ps_pgcount = ps->ps_pgnum;
954					ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
955					dp_pages_reserve += ps->ps_pgcount;
956					PS_UNLOCK(ps);
957				} else {
958					paging_segments[i] = PAGING_SEGMENT_NULL;
959					paging_segment_count--;
960					PS_LOCK(ps);
961					kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
962					kfree(ps, sizeof *ps);
963				}
964			}
965		}
966	}
967
968	/* Scan the entire ps array separately to make certain we find the */
969	/* proper paging_segment_max                                       */
970	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
971		if(paging_segments[i] != PAGING_SEGMENT_NULL)
972		   paging_segment_max = i;
973	}
974
975	PSL_UNLOCK();
976
977	if( dealing_with_emergency_segment ) {
978		BS_UNLOCK(bs);
979		return KERN_SUCCESS;
980	}
981
982	/*
983	 * All the segments have been deleted.
984	 * We can remove the backing store.
985	 */
986
987	/*
988	 * Disable lookups of this backing store.
989	 */
990	if((void *)bs->bs_port->ip_alias != NULL)
991		kfree((void *) bs->bs_port->ip_alias,
992		      sizeof (struct vstruct_alias));
993	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
994	bs->bs_port = MACH_PORT_NULL;
995	BS_UNLOCK(bs);
996
997	/*
998	 * Remove backing store from backing_store list.
999	 */
1000	BSL_LOCK();
1001	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
1002		     bs_links);
1003	BSL_UNLOCK();
1004
1005	/*
1006	 * Free the backing store structure.
1007	 */
1008	BS_LOCK_DESTROY(bs);
1009	kfree(bs, sizeof *bs);
1010
1011	return KERN_SUCCESS;
1012}
1013
1014kern_return_t
1015default_pager_backing_store_delete(
1016	MACH_PORT_FACE backing_store)
1017{
1018	if( backing_store != emergency_segment_backing_store ) {
1019		default_pager_backing_store_delete_internal(emergency_segment_backing_store);
1020	}
1021	return(default_pager_backing_store_delete_internal(backing_store));
1022}
1023
1024int	ps_enter(paging_segment_t);	/* forward */
1025
1026int
1027ps_enter(
1028	paging_segment_t ps)
1029{
1030	int i;
1031
1032	PSL_LOCK();
1033
1034	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
1035		if (paging_segments[i] == PAGING_SEGMENT_NULL)
1036			break;
1037	}
1038
1039	if (i < MAX_NUM_PAGING_SEGMENTS) {
1040		paging_segments[i] = ps;
1041		if (i > paging_segment_max)
1042			paging_segment_max = i;
1043		paging_segment_count++;
1044		if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
1045			(ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
1046			ps_select_array[ps->ps_bs->bs_priority] = 0;
1047		i = 0;
1048	} else {
1049		PSL_UNLOCK();
1050		return KERN_RESOURCE_SHORTAGE;
1051	}
1052
1053	PSL_UNLOCK();
1054	return i;
1055}
1056
1057#ifdef DEVICE_PAGING
1058kern_return_t
1059default_pager_add_segment(
1060	MACH_PORT_FACE	backing_store,
1061	MACH_PORT_FACE	device,
1062	recnum_t	offset,
1063	recnum_t	count,
1064	int		record_size)
1065{
1066	backing_store_t		bs;
1067	paging_segment_t	ps;
1068	int			i;
1069	int			error;
1070
1071	if ((bs = backing_store_lookup(backing_store))
1072	    == BACKING_STORE_NULL)
1073		return KERN_INVALID_ARGUMENT;
1074
1075	PSL_LOCK();
1076	for (i = 0; i <= paging_segment_max; i++) {
1077		ps = paging_segments[i];
1078		if (ps == PAGING_SEGMENT_NULL)
1079			continue;
1080
1081		/*
1082		 * Check for overlap on same device.
1083		 */
1084		if (!(ps->ps_device != device
1085		      || offset >= ps->ps_offset + ps->ps_recnum
1086		      || offset + count <= ps->ps_offset)) {
1087			PSL_UNLOCK();
1088			BS_UNLOCK(bs);
1089			return KERN_INVALID_ARGUMENT;
1090		}
1091	}
1092	PSL_UNLOCK();
1093
1094	/*
1095	 * Set up the paging segment
1096	 */
1097	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
1098	if (ps == PAGING_SEGMENT_NULL) {
1099		BS_UNLOCK(bs);
1100		return KERN_RESOURCE_SHORTAGE;
1101	}
1102
1103	ps->ps_segtype = PS_PARTITION;
1104	ps->ps_device = device;
1105	ps->ps_offset = offset;
1106	ps->ps_record_shift = local_log2(vm_page_size / record_size);
1107	ps->ps_recnum = count;
1108	ps->ps_pgnum = count >> ps->ps_record_shift;
1109
1110	ps->ps_pgcount = ps->ps_pgnum;
1111	ps->ps_clshift = local_log2(bs->bs_clsize);
1112	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
1113	ps->ps_hint = 0;
1114
1115	PS_LOCK_INIT(ps);
1116	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
1117	if (!ps->ps_bmap) {
1118		PS_LOCK_DESTROY(ps);
1119		kfree(ps, sizeof *ps);
1120		BS_UNLOCK(bs);
1121		return KERN_RESOURCE_SHORTAGE;
1122	}
1123	for (i = 0; i < ps->ps_ncls; i++) {
1124		clrbit(ps->ps_bmap, i);
1125	}
1126
1127	if(paging_segment_count == 0) {
1128		ps->ps_state = PS_EMERGENCY_SEGMENT;
1129		if(use_emergency_swap_file_first) {
1130			ps->ps_state |= PS_CAN_USE;
1131		}
1132	} else {
1133		ps->ps_state = PS_CAN_USE;
1134	}
1135
1136	ps->ps_bs = bs;
1137
1138	if ((error = ps_enter(ps)) != 0) {
1139		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
1140
1141		PS_LOCK_DESTROY(ps);
1142		kfree(ps, sizeof *ps);
1143		BS_UNLOCK(bs);
1144		return KERN_RESOURCE_SHORTAGE;
1145	}
1146
1147	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
1148	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
1149	BS_UNLOCK(bs);
1150
1151	PSL_LOCK();
1152	if(IS_PS_OK_TO_USE(ps)) {
1153		dp_pages_free += ps->ps_pgcount;
1154	} else {
1155		dp_pages_reserve += ps->ps_pgcount;
1156	}
1157	PSL_UNLOCK();
1158
1159	bs_more_space(ps->ps_clcount);
1160
1161	DP_DEBUG(DEBUG_BS_INTERNAL,
1162		 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
1163		  device, offset, count, record_size,
1164		  ps->ps_record_shift, ps->ps_pgnum));
1165
1166	return KERN_SUCCESS;
1167}
1168
1169boolean_t
1170bs_add_device(
1171	char		*dev_name,
1172	MACH_PORT_FACE	master)
1173{
1174	security_token_t	null_security_token = {
1175		{ 0, 0 }
1176	};
1177	MACH_PORT_FACE	device;
1178	int		info[DEV_GET_SIZE_COUNT];
1179	mach_msg_type_number_t info_count;
1180	MACH_PORT_FACE	bs = MACH_PORT_NULL;
1181	unsigned int	rec_size;
1182	recnum_t	count;
1183	int		clsize;
1184	MACH_PORT_FACE  reply_port;
1185
1186	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
1187			null_security_token, dev_name, &device))
1188		return FALSE;
1189
1190	info_count = DEV_GET_SIZE_COUNT;
1191	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
1192		rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
1193		count = info[DEV_GET_SIZE_DEVICE_SIZE] /  rec_size;
1194		clsize = bs_get_global_clsize(0);
1195		if (!default_pager_backing_store_create(
1196					default_pager_object,
1197					DEFAULT_PAGER_BACKING_STORE_MAXPRI,
1198					(clsize * vm_page_size),
1199					&bs)) {
1200			if (!default_pager_add_segment(bs, device,
1201						       0, count, rec_size)) {
1202				return TRUE;
1203			}
1204			ipc_port_release_receive(bs);
1205		}
1206	}
1207
1208	ipc_port_release_send(device);
1209	return FALSE;
1210}
1211#endif /* DEVICE_PAGING */
1212
1213#if	VS_ASYNC_REUSE
1214
1215struct vs_async *
1216vs_alloc_async(void)
1217{
1218	struct vs_async	*vsa;
1219	MACH_PORT_FACE	reply_port;
1220//	kern_return_t	kr;
1221
1222	VS_ASYNC_LOCK();
1223	if (vs_async_free_list == NULL) {
1224		VS_ASYNC_UNLOCK();
1225		vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1226		if (vsa != NULL) {
1227			/*
1228			 * Try allocating a reply port named after the
1229			 * address of the vs_async structure.
1230			 */
1231			struct vstruct_alias 	*alias_struct;
1232
1233			reply_port = ipc_port_alloc_kernel();
1234			alias_struct = (struct vstruct_alias *)
1235				kalloc(sizeof (struct vstruct_alias));
1236			if(alias_struct != NULL) {
1237				alias_struct->vs = (struct vstruct *)vsa;
1238				alias_struct->name = &default_pager_ops;
1239				reply_port->ip_alias = (uintptr_t) alias_struct;
1240				vsa->reply_port = reply_port;
1241				vs_alloc_async_count++;
1242			}
1243			else {
1244				vs_alloc_async_failed++;
1245				ipc_port_dealloc_kernel((MACH_PORT_FACE)
1246								(reply_port));
1247				kfree(vsa, sizeof (struct vs_async));
1248				vsa = NULL;
1249			}
1250		}
1251	} else {
1252		vsa = vs_async_free_list;
1253		vs_async_free_list = vs_async_free_list->vsa_next;
1254		VS_ASYNC_UNLOCK();
1255	}
1256
1257	return vsa;
1258}
1259
1260void
1261vs_free_async(
1262	struct vs_async *vsa)
1263{
1264	VS_ASYNC_LOCK();
1265	vsa->vsa_next = vs_async_free_list;
1266	vs_async_free_list = vsa;
1267	VS_ASYNC_UNLOCK();
1268}
1269
1270#else	/* VS_ASYNC_REUSE */
1271
1272struct vs_async *
1273vs_alloc_async(void)
1274{
1275	struct vs_async	*vsa;
1276	MACH_PORT_FACE	reply_port;
1277	kern_return_t	kr;
1278
1279	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
1280	if (vsa != NULL) {
1281		/*
1282		 * Try allocating a reply port named after the
1283		 * address of the vs_async structure.
1284		 */
1285			reply_port = ipc_port_alloc_kernel();
1286			alias_struct = (vstruct_alias *)
1287				kalloc(sizeof (struct vstruct_alias));
1288			if(alias_struct != NULL) {
1289				alias_struct->vs = reply_port;
1290				alias_struct->name = &default_pager_ops;
1291				reply_port->defpager_importance.alias = (int) vsa;
1292				vsa->reply_port = reply_port;
1293				vs_alloc_async_count++;
1294			}
1295			else {
1296				vs_alloc_async_failed++;
1297				ipc_port_dealloc_kernel((MACH_PORT_FACE)
1298								(reply_port));
1299				kfree(vsa, sizeof (struct vs_async));
1300				vsa = NULL;
1301			}
1302	}
1303
1304	return vsa;
1305}
1306
1307void
1308vs_free_async(
1309	struct vs_async *vsa)
1310{
1311	MACH_PORT_FACE	reply_port;
1312	kern_return_t	kr;
1313
1314	reply_port = vsa->reply_port;
1315	kfree(reply_port->ip_alias, sizeof (struct vstuct_alias));
1316	kfree(vsa, sizeof (struct vs_async));
1317	ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
1318#if 0
1319	VS_ASYNC_LOCK();
1320	vs_alloc_async_count--;
1321	VS_ASYNC_UNLOCK();
1322#endif
1323}
1324
1325#endif	/* VS_ASYNC_REUSE */
1326
1327zone_t	vstruct_zone;
1328
1329vstruct_t
1330ps_vstruct_create(
1331	dp_size_t size)
1332{
1333	vstruct_t	vs;
1334	unsigned int	i;
1335
1336	vs = (vstruct_t) zalloc(vstruct_zone);
1337	if (vs == VSTRUCT_NULL) {
1338		return VSTRUCT_NULL;
1339	}
1340
1341	VS_LOCK_INIT(vs);
1342
1343	/*
1344	 * The following fields will be provided later.
1345	 */
1346	vs->vs_pager_ops = NULL;
1347	vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
1348	vs->vs_references = 1;
1349	vs->vs_seqno = 0;
1350
1351	vs->vs_waiting_seqno = FALSE;
1352	vs->vs_waiting_read = FALSE;
1353	vs->vs_waiting_write = FALSE;
1354	vs->vs_waiting_async = FALSE;
1355
1356	vs->vs_readers = 0;
1357	vs->vs_writers = 0;
1358
1359	vs->vs_errors = 0;
1360
1361	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
1362	vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
1363	vs->vs_async_pending = 0;
1364
1365	/*
1366	 * Allocate the pmap, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
1367	 * depending on the size of the memory object.
1368	 */
1369	if (INDIRECT_CLMAP(vs->vs_size)) {
1370		vs->vs_imap = (struct vs_map **)
1371			kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
1372		vs->vs_indirect = TRUE;
1373	} else {
1374		vs->vs_dmap = (struct vs_map *)
1375			kalloc(CLMAP_SIZE(vs->vs_size));
1376		vs->vs_indirect = FALSE;
1377	}
1378	vs->vs_xfer_pending = FALSE;
1379	DP_DEBUG(DEBUG_VS_INTERNAL,
1380		 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));
1381
1382	/*
1383	 * Check to see that we got the space.
1384	 */
1385	if (!vs->vs_dmap) {
1386		kfree(vs, sizeof *vs);
1387		return VSTRUCT_NULL;
1388	}
1389
1390	/*
1391	 * Zero the indirect pointers, or clear the direct pointers.
1392	 */
1393	if (vs->vs_indirect)
1394		memset(vs->vs_imap, 0,
1395		       INDIRECT_CLMAP_SIZE(vs->vs_size));
1396	else
1397		for (i = 0; i < vs->vs_size; i++)
1398			VSM_CLR(vs->vs_dmap[i]);
1399
1400	VS_MAP_LOCK_INIT(vs);
1401
1402	bs_commit(vs->vs_size);
1403
1404	return vs;
1405}
1406
1407paging_segment_t ps_select_segment(unsigned int, int *);	/* forward */
1408
1409paging_segment_t
1410ps_select_segment(
1411	unsigned int	shift,
1412	int		*psindex)
1413{
1414	paging_segment_t	ps;
1415	int			i;
1416	int			j;
1417
1418	/*
1419	 * Optimize case where there's only one segment.
1420	 * paging_segment_max will index the one and only segment.
1421	 */
1422
1423	PSL_LOCK();
1424	if (paging_segment_count == 1) {
1425		paging_segment_t lps = PAGING_SEGMENT_NULL;	/* used to avoid extra PS_UNLOCK */
1426		ipc_port_t trigger = IP_NULL;
1427
1428		ps = paging_segments[paging_segment_max];
1429		*psindex = paging_segment_max;
1430		PS_LOCK(ps);
1431		if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
1432			panic("Emergency paging segment missing\n");
1433		}
1434		ASSERT(ps->ps_clshift >= shift);
1435		if(IS_PS_OK_TO_USE(ps)) {
1436			if (ps->ps_clcount) {
1437				ps->ps_clcount--;
1438				dp_pages_free -=  1 << ps->ps_clshift;
1439				ps->ps_pgcount -=  1 << ps->ps_clshift;
1440				if(min_pages_trigger_port &&
1441				  (dp_pages_free < minimum_pages_remaining)) {
1442					trigger = min_pages_trigger_port;
1443					min_pages_trigger_port = NULL;
1444					bs_low = TRUE;
1445					backing_store_abort_compaction = TRUE;
1446				}
1447				lps = ps;
1448			}
1449		}
1450		PS_UNLOCK(ps);
1451
1452		if( lps == PAGING_SEGMENT_NULL ) {
1453			if(dp_pages_free) {
1454				dp_pages_free_drift_count++;
1455				if(dp_pages_free > dp_pages_free_drifted_max) {
1456					dp_pages_free_drifted_max = dp_pages_free;
1457				}
1458				dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1459			}
1460	        	dp_pages_free = 0;
1461		}
1462
1463		PSL_UNLOCK();
1464
1465		if (trigger != IP_NULL) {
1466			dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1467
1468			default_pager_space_alert(trigger, HI_WAT_ALERT);
1469			ipc_port_release_send(trigger);
1470		}
1471		return lps;
1472	}
1473
1474	if (paging_segment_count == 0) {
1475		if(dp_pages_free) {
1476			dp_pages_free_drift_count++;
1477			if(dp_pages_free > dp_pages_free_drifted_max) {
1478				dp_pages_free_drifted_max = dp_pages_free;
1479			}
1480			dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
1481		}
1482	        dp_pages_free = 0;
1483		PSL_UNLOCK();
1484		return PAGING_SEGMENT_NULL;
1485	}
1486
1487	for (i = BS_MAXPRI;
1488	     i >= BS_MINPRI; i--) {
1489		int start_index;
1490
1491		if ((ps_select_array[i] == BS_NOPRI) ||
1492				(ps_select_array[i] == BS_FULLPRI))
1493			continue;
1494		start_index = ps_select_array[i];
1495
1496		if(!(paging_segments[start_index])) {
1497			j = start_index+1;
1498			physical_transfer_cluster_count = 0;
1499		}
1500		else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
1501				(((paging_segments[start_index])->ps_clshift)
1502				+ vm_page_shift))) {
1503			physical_transfer_cluster_count = 0;
1504			j = start_index + 1;
1505		} else {
1506			physical_transfer_cluster_count+=1;
1507			j = start_index;
1508			if(start_index == 0)
1509				start_index = paging_segment_max;
1510			else
1511				start_index = start_index - 1;
1512		}
1513
1514		while (1) {
1515			if (j > paging_segment_max)
1516				j = 0;
1517			if ((ps = paging_segments[j]) &&
1518			    (ps->ps_bs->bs_priority == i)) {
1519				/*
1520				 * Force the ps cluster size to be
1521				 * >= that of the vstruct.
1522				 */
1523				PS_LOCK(ps);
1524				if (IS_PS_OK_TO_USE(ps)) {
1525					if ((ps->ps_clcount) &&
1526						   (ps->ps_clshift >= shift)) {
1527						ipc_port_t trigger = IP_NULL;
1528
1529						ps->ps_clcount--;
1530						dp_pages_free -=  1 << ps->ps_clshift;
1531						ps->ps_pgcount -=  1 << ps->ps_clshift;
1532						if(min_pages_trigger_port &&
1533							(dp_pages_free <
1534							minimum_pages_remaining)) {
1535							trigger = min_pages_trigger_port;
1536							min_pages_trigger_port = NULL;
1537							bs_low = TRUE;
1538							backing_store_abort_compaction = TRUE;
1539						}
1540						PS_UNLOCK(ps);
1541						/*
1542						 * found one, quit looking.
1543						 */
1544						ps_select_array[i] = j;
1545						PSL_UNLOCK();
1546
1547						if (trigger != IP_NULL) {
1548							dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));
1549
1550							default_pager_space_alert(
1551								trigger,
1552								HI_WAT_ALERT);
1553							ipc_port_release_send(trigger);
1554						}
1555						*psindex = j;
1556						return ps;
1557					}
1558				}
1559				PS_UNLOCK(ps);
1560			}
1561			if (j == start_index) {
1562				/*
1563				 * none at this priority -- mark it full
1564				 */
1565				ps_select_array[i] = BS_FULLPRI;
1566				break;
1567			}
1568			j++;
1569		}
1570	}
1571
1572	if(dp_pages_free) {
1573		dp_pages_free_drift_count++;
1574		if(dp_pages_free > dp_pages_free_drifted_max) {
1575			dp_pages_free_drifted_max = dp_pages_free;
1576		}
1577		dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
1578	}
1579	dp_pages_free = 0;
1580	PSL_UNLOCK();
1581	return PAGING_SEGMENT_NULL;
1582}
1583
1584dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/
1585
1586dp_offset_t
1587ps_allocate_cluster(
1588	vstruct_t		vs,
1589	int			*psindex,
1590	paging_segment_t	use_ps)
1591{
1592	unsigned int		byte_num;
1593	int			bit_num = 0;
1594	paging_segment_t	ps;
1595	dp_offset_t		cluster;
1596	ipc_port_t		trigger = IP_NULL;
1597
1598	/*
1599	 * Find best paging segment.
1600	 * ps_select_segment will decrement cluster count on ps.
1601	 * Must pass cluster shift to find the most appropriate segment.
1602	 */
1603	/* NOTE:  The addition of paging segment delete capability threatened
1604	 * to seriously complicate the treatment of paging segments in this
1605	 * module and the ones that call it (notably ps_clmap), because of the
1606	 * difficulty in assuring that the paging segment would continue to
1607	 * exist between being unlocked and locked.   This was
1608	 * avoided because all calls to this module are based in either
1609	 * dp_memory_object calls which rely on the vs lock, or by
1610	 * the transfer function which is part of the segment delete path.
1611	 * The transfer function which is part of paging segment delete is
1612	 * protected from multiple callers by the backing store lock.
1613	 * The paging segment delete function treats mappings to a paging
1614	 * segment on a vstruct by vstruct basis, locking the vstruct targeted
1615	 * while data is transferred to the remaining segments.  This is in
1616	 * line with the view that incomplete or in-transition mappings between
1617	 * data, a vstruct, and backing store are protected by the vs lock.
1618	 * This and the ordering of the paging segment "going_away" bit setting
1619	 * protects us.
1620	 */
1621retry:
1622	if (use_ps != PAGING_SEGMENT_NULL) {
1623		ps = use_ps;
1624		PSL_LOCK();
1625		PS_LOCK(ps);
1626
1627		ASSERT(ps->ps_clcount != 0);
1628
1629		ps->ps_clcount--;
1630		dp_pages_free -=  1 << ps->ps_clshift;
1631		ps->ps_pgcount -=  1 << ps->ps_clshift;
1632		if(min_pages_trigger_port &&
1633				(dp_pages_free < minimum_pages_remaining)) {
1634			trigger = min_pages_trigger_port;
1635			min_pages_trigger_port = NULL;
1636			bs_low = TRUE;
1637			backing_store_abort_compaction = TRUE;
1638		}
1639		PSL_UNLOCK();
1640		PS_UNLOCK(ps);
1641		if (trigger != IP_NULL) {
1642			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1643
1644			default_pager_space_alert(trigger, HI_WAT_ALERT);
1645			ipc_port_release_send(trigger);
1646		}
1647
1648	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
1649		   PAGING_SEGMENT_NULL) {
1650		static clock_sec_t lastnotify = 0;
1651		clock_sec_t now;
1652		clock_nsec_t nanoseconds_dummy;
1653
1654		/*
1655		 * Don't immediately jump to the emergency segment. Give the
1656		 * dynamic pager a chance to create it's first normal swap file.
1657		 * Unless, of course the very first normal swap file can't be
1658		 * created due to some problem and we didn't expect that problem
1659		 * i.e. use_emergency_swap_file_first was never set to true initially.
1660		 * It then gets set in the swap file creation error handling.
1661		 */
1662		if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {
1663
1664			ps = paging_segments[EMERGENCY_PSEG_INDEX];
1665			if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
1666				PSL_LOCK();
1667				PS_LOCK(ps);
1668
1669				if(IS_PS_GOING_AWAY(ps)) {
1670					/* Someone de-activated the emergency paging segment*/
1671					PS_UNLOCK(ps);
1672					PSL_UNLOCK();
1673
1674				} else if(dp_pages_free) {
1675					/*
1676					 * Someone has already activated the emergency paging segment
1677					 * OR
1678					 * Between us having rec'd a NULL segment from ps_select_segment
1679					 * and reaching here a new normal segment could have been added.
1680					 * E.g. we get NULL segment and another thread just added the
1681					 * new swap file. Hence check to see if we have more dp_pages_free
1682					 * before activating the emergency segment.
1683					 */
1684					PS_UNLOCK(ps);
1685					PSL_UNLOCK();
1686					goto retry;
1687
1688				} else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
1689					/*
1690					 * PS_CAN_USE is only reset from the emergency segment when it's
1691					 * been successfully recovered. So it's legal to have an emergency
1692					 * segment that has PS_CAN_USE but no clusters because it's recovery
1693					 * failed.
1694					 */
1695					backing_store_t bs = ps->ps_bs;
1696					ps->ps_state |= PS_CAN_USE;
1697					if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
1698						ps_select_array[bs->bs_priority] == BS_NOPRI) {
1699						ps_select_array[bs->bs_priority] = 0;
1700					}
1701					dp_pages_free += ps->ps_pgcount;
1702					dp_pages_reserve -= ps->ps_pgcount;
1703					PS_UNLOCK(ps);
1704					PSL_UNLOCK();
1705					dprintf(("Switching ON Emergency paging segment\n"));
1706					goto retry;
1707				}
1708
1709				PS_UNLOCK(ps);
1710				PSL_UNLOCK();
1711			}
1712		}
1713
1714		/*
1715		 * Emit a notification of the low-paging resource condition
1716		 * but don't issue it more than once every five seconds.  This
1717		 * prevents us from overflowing logs with thousands of
1718		 * repetitions of the message.
1719		 */
1720		clock_get_system_nanotime(&now, &nanoseconds_dummy);
1721		if (paging_segment_count > 1 && (now > lastnotify + 5)) {
1722			/* With an activated emergency paging segment we still
1723			 * didn't get any clusters. This could mean that the
1724			 * emergency paging segment is exhausted.
1725 			 */
1726			dprintf(("System is out of paging space.\n"));
1727			lastnotify = now;
1728		}
1729
1730		PSL_LOCK();
1731
1732		if(min_pages_trigger_port) {
1733			trigger = min_pages_trigger_port;
1734			min_pages_trigger_port = NULL;
1735			bs_low = TRUE;
1736			backing_store_abort_compaction = TRUE;
1737		}
1738		PSL_UNLOCK();
1739		if (trigger != IP_NULL) {
1740			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));
1741
1742			default_pager_space_alert(trigger, HI_WAT_ALERT);
1743			ipc_port_release_send(trigger);
1744		}
1745		return (dp_offset_t) -1;
1746	}
1747
1748	/*
1749	 * Look for an available cluster.  At the end of the loop,
1750	 * byte_num is the byte offset and bit_num is the bit offset of the
1751	 * first zero bit in the paging segment bitmap.
1752	 */
1753	PS_LOCK(ps);
1754	byte_num = ps->ps_hint;
1755	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
1756		if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
1757			for (bit_num = 0; bit_num < NBBY; bit_num++) {
1758				if (isclr((ps->ps_bmap + byte_num), bit_num))
1759					break;
1760			}
1761			ASSERT(bit_num != NBBY);
1762			break;
1763		}
1764	}
1765	ps->ps_hint = byte_num;
1766	cluster = (byte_num*NBBY) + bit_num;
1767
1768	/* Space was reserved, so this must be true */
1769	ASSERT(cluster < ps->ps_ncls);
1770
1771	setbit(ps->ps_bmap, cluster);
1772	PS_UNLOCK(ps);
1773
1774	return cluster;
1775}
1776
1777void ps_deallocate_cluster(paging_segment_t, dp_offset_t);	/* forward */
1778
1779void
1780ps_deallocate_cluster(
1781	paging_segment_t	ps,
1782	dp_offset_t		cluster)
1783{
1784
1785	if (cluster >= ps->ps_ncls)
1786		panic("ps_deallocate_cluster: Invalid cluster number");
1787
1788	/*
1789	 * Lock the paging segment, clear the cluster's bitmap and increment the
1790	 * number of free cluster.
1791	 */
1792	PSL_LOCK();
1793	PS_LOCK(ps);
1794	clrbit(ps->ps_bmap, cluster);
1795	if( IS_PS_OK_TO_USE(ps)) {
1796		++ps->ps_clcount;
1797		ps->ps_pgcount +=  1 << ps->ps_clshift;
1798		dp_pages_free +=  1 << ps->ps_clshift;
1799	} else {
1800		ps->ps_special_clusters += 1;
1801	}
1802
1803	/*
1804	 * Move the hint down to the freed cluster if it is
1805	 * less than the current hint.
1806	 */
1807	if ((cluster/NBBY) < ps->ps_hint) {
1808		ps->ps_hint = (cluster/NBBY);
1809	}
1810
1811
1812	/*
1813	 * If we're freeing space on a full priority, reset the array.
1814	 */
1815	if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
1816		ps_select_array[ps->ps_bs->bs_priority] = 0;
1817	PS_UNLOCK(ps);
1818	PSL_UNLOCK();
1819
1820	return;
1821}
1822
1823void ps_dealloc_vsmap(struct vs_map *, dp_size_t);	/* forward */
1824
1825void
1826ps_dealloc_vsmap(
1827	struct vs_map	*vsmap,
1828	dp_size_t	size)
1829{
1830	unsigned int i;
1831	struct ps_vnode_trim_data trim_data;
1832
1833	ps_vnode_trim_init(&trim_data);
1834
1835	for (i = 0; i < size; i++) {
1836		if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
1837			ps_vnode_trim_more(&trim_data,
1838					      &vsmap[i],
1839					      VSM_PS(vsmap[i])->ps_clshift,
1840					      vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
1841			ps_deallocate_cluster(VSM_PS(vsmap[i]),
1842					      VSM_CLOFF(vsmap[i]));
1843		} else {
1844			ps_vnode_trim_now(&trim_data);
1845		}
1846	}
1847	ps_vnode_trim_now(&trim_data);
1848}
1849
1850void
1851ps_vstruct_dealloc(
1852	vstruct_t vs)
1853{
1854	unsigned int	i;
1855//	spl_t	s;
1856
1857	VS_MAP_LOCK(vs);
1858
1859	/*
1860	 * If this is an indirect structure, then we walk through the valid
1861	 * (non-zero) indirect pointers and deallocate the clusters
1862	 * associated with each used map entry (via ps_dealloc_vsmap).
1863	 * When all of the clusters in an indirect block have been
1864	 * freed, we deallocate the block.  When all of the indirect
1865	 * blocks have been deallocated we deallocate the memory
1866	 * holding the indirect pointers.
1867	 */
1868	if (vs->vs_indirect) {
1869		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1870			if (vs->vs_imap[i] != NULL) {
1871				ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
1872				kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
1873			}
1874		}
1875		kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1876	} else {
1877		/*
1878		 * Direct map.  Free used clusters, then memory.
1879		 */
1880		ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
1881		kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
1882	}
1883	VS_MAP_UNLOCK(vs);
1884
1885	bs_commit(- vs->vs_size);
1886
1887	VS_MAP_LOCK_DESTROY(vs);
1888
1889	zfree(vstruct_zone, vs);
1890}
1891
1892kern_return_t
1893ps_vstruct_reclaim(
1894	vstruct_t vs,
1895	boolean_t return_to_vm,
1896	boolean_t reclaim_backing_store)
1897{
1898	unsigned int	i, j;
1899	struct vs_map	*vsmap;
1900	boolean_t	vsmap_all_clear, vsimap_all_clear;
1901	struct vm_object_fault_info fault_info;
1902	int		clmap_off;
1903	unsigned int	vsmap_size;
1904	kern_return_t	kr = KERN_SUCCESS;
1905
1906	VS_MAP_LOCK(vs);
1907
1908	fault_info.cluster_size = VM_SUPER_CLUSTER;
1909	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
1910	fault_info.user_tag = 0;
1911	fault_info.pmap_options = 0;
1912	fault_info.lo_offset = 0;
1913	fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
1914	fault_info.io_sync = reclaim_backing_store;
1915	fault_info.batch_pmap_op = FALSE;
1916
1917	/*
1918	 * If this is an indirect structure, then we walk through the valid
1919	 * (non-zero) indirect pointers, read the pages of each used cluster
1920	 * back into VM if requested (via pvs_cluster_read), and deallocate
1921	 * the clusters (via ps_dealloc_vsmap).  When all of the clusters in
1922	 * an indirect block have been reclaimed, we free the block.  Unlike
1923	 * ps_vstruct_dealloc(), the memory holding the indirect pointers is
1924	 * left in place here.
1925	 */
1926	if (vs->vs_indirect) {
1927		vsimap_all_clear = TRUE;
1928		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
1929			vsmap = vs->vs_imap[i];
1930			if (vsmap == NULL)
1931				continue;
1932			/* loop on clusters in this indirect map */
1933			clmap_off = (vm_page_size * CLMAP_ENTRIES *
1934				     VSCLSIZE(vs) * i);
1935			if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
1936				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
1937			else
1938				vsmap_size = CLMAP_ENTRIES;
1939			vsmap_all_clear = TRUE;
1940			if (return_to_vm) {
1941				for (j = 0; j < vsmap_size;) {
1942					if (VSM_ISCLR(vsmap[j]) ||
1943					    VSM_ISERR(vsmap[j])) {
1944						j++;
1945						clmap_off += vm_page_size * VSCLSIZE(vs);
1946						continue;
1947					}
1948					VS_MAP_UNLOCK(vs);
1949					kr = pvs_cluster_read(
1950						vs,
1951						clmap_off,
1952						(dp_size_t) -1, /* read whole cluster */
1953						&fault_info);
1954
1955					VS_MAP_LOCK(vs); /* XXX what if it changed ? */
1956					if (kr != KERN_SUCCESS) {
1957						vsmap_all_clear = FALSE;
1958						vsimap_all_clear = FALSE;
1959
1960						kr = KERN_MEMORY_ERROR;
1961						goto out;
1962					}
1963				}
1964			}
1965			if (vsmap_all_clear) {
1966				ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
1967				kfree(vsmap, CLMAP_THRESHOLD);
1968				vs->vs_imap[i] = NULL;
1969			}
1970		}
1971		if (vsimap_all_clear) {
1972//			kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
1973		}
1974	} else {
1975		/*
1976		 * Direct map.  Free used clusters, then memory.
1977		 */
1978		vsmap = vs->vs_dmap;
1979		if (vsmap == NULL) {
1980			goto out;
1981		}
1982		vsmap_all_clear = TRUE;
1983		/* loop on clusters in the direct map */
1984		if (return_to_vm) {
1985			for (j = 0; j < vs->vs_size;) {
1986				if (VSM_ISCLR(vsmap[j]) ||
1987				    VSM_ISERR(vsmap[j])) {
1988					j++;
1989					continue;
1990				}
1991				clmap_off = vm_page_size * (j << vs->vs_clshift);
1992				VS_MAP_UNLOCK(vs);
1993				kr = pvs_cluster_read(
1994					vs,
1995					clmap_off,
1996					(dp_size_t) -1, /* read whole cluster */
1997					&fault_info);
1998
1999				VS_MAP_LOCK(vs); /* XXX what if it changed ? */
2000				if (kr != KERN_SUCCESS) {
2001					vsmap_all_clear = FALSE;
2002
2003					kr = KERN_MEMORY_ERROR;
2004					goto out;
2005				} else {
2006//					VSM_CLR(vsmap[j]);
2007				}
2008			}
2009		}
2010		if (vsmap_all_clear) {
2011			ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
2012//			kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
2013		}
2014	}
2015out:
2016	VS_MAP_UNLOCK(vs);
2017
2018	return kr;
2019}
2020
2021int ps_map_extend(vstruct_t, unsigned int);	/* forward */
2022
2023int ps_map_extend(
2024	vstruct_t	vs,
2025	unsigned int	new_size)
2026{
2027	struct vs_map	**new_imap;
2028	struct vs_map	*new_dmap = NULL;
2029	int		newdsize;
2030	int		i;
2031	void		*old_map = NULL;
2032	int		old_map_size = 0;
2033
2034	if (vs->vs_size >= new_size) {
2035		/*
2036		 * Someone has already done the work.
2037		 */
2038		return 0;
2039	}
2040
2041	/*
2042	 * If the new size extends into the indirect range, then we have one
2043	 * of two cases: we are going from indirect to indirect, or we are
2044	 * going from direct to indirect.  If we are going from indirect to
2045	 * indirect, then it is possible that the new size will fit in the old
2046	 * indirect map.  If this is the case, then just reset the size of the
2047	 * vstruct map and we are done.  If the new size will not
2048	 * fit into the old indirect map, then we have to allocate a new
2049	 * indirect map and copy the old map pointers into this new map.
2050	 *
2051	 * If we are going from direct to indirect, then we have to allocate a
2052	 * new indirect map and copy the old direct pages into the first
2053	 * indirect page of the new map.
2054	 * NOTE: allocating memory here is dangerous, as we're in the
2055	 * pageout path.
2056	 */
2057	if (INDIRECT_CLMAP(new_size)) {
2058		int new_map_size = INDIRECT_CLMAP_SIZE(new_size);
2059
2060		/*
2061		 * Get a new indirect map and zero it.
2062		 */
2063		old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size);
2064		if (vs->vs_indirect &&
2065		    (new_map_size == old_map_size)) {
2066			bs_commit(new_size - vs->vs_size);
2067			vs->vs_size = new_size;
2068			return 0;
2069		}
2070
2071		new_imap = (struct vs_map **)kalloc(new_map_size);
2072		if (new_imap == NULL) {
2073			return -1;
2074		}
2075		memset(new_imap, 0, new_map_size);
2076
2077		if (vs->vs_indirect) {
2078			/* Copy old entries into new map */
2079			memcpy(new_imap, vs->vs_imap, old_map_size);
2080			/* Arrange to free the old map */
2081			old_map = (void *) vs->vs_imap;
2082			newdsize = 0;
2083		} else {	/* Old map was a direct map */
2084			/* Allocate an indirect page */
2085			if ((new_imap[0] = (struct vs_map *)
2086			     kalloc(CLMAP_THRESHOLD)) == NULL) {
2087				kfree(new_imap, new_map_size);
2088				return -1;
2089			}
2090			new_dmap = new_imap[0];
2091			newdsize = CLMAP_ENTRIES;
2092		}
2093	} else {
2094		new_imap = NULL;
2095		newdsize = new_size;
2096		/*
2097		 * If the new map is a direct map, then the old map must
2098		 * also have been a direct map.  All we have to do is
2099		 * allocate a new direct map, copy the old entries
2100		 * into it, and free the old map.
2101		 */
2102		if ((new_dmap = (struct vs_map *)
2103		     kalloc(CLMAP_SIZE(new_size))) == NULL) {
2104			return -1;
2105		}
2106	}
2107	if (newdsize) {
2108
2109		/* Free the old map */
2110		old_map = (void *) vs->vs_dmap;
2111		old_map_size = CLMAP_SIZE(vs->vs_size);
2112
2113		/* Copy info from the old map into the new map */
2114		memcpy(new_dmap, vs->vs_dmap, old_map_size);
2115
2116		/* Initialize the rest of the new map */
2117		for (i = vs->vs_size; i < newdsize; i++)
2118			VSM_CLR(new_dmap[i]);
2119	}
2120	if (new_imap) {
2121		vs->vs_imap = new_imap;
2122		vs->vs_indirect = TRUE;
2123	} else
2124		vs->vs_dmap = new_dmap;
2125	bs_commit(new_size - vs->vs_size);
2126	vs->vs_size = new_size;
2127	if (old_map)
2128		kfree(old_map, old_map_size);
2129	return 0;
2130}
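/*
 * Illustrative sketch (not kernel code, not compiled): the three growth
 * cases handled by ps_map_extend() above.  Roughly, a map stays direct
 * while it fits under the direct-map threshold (INDIRECT_CLMAP() in the
 * real code); otherwise it becomes (or remains) indirect, and an existing
 * indirect map only needs a new top level when the top-level size actually
 * grows.  The enum, helper and entries_per_block parameter are hypothetical.
 */
#if 0
enum grow_case {
	GROW_DIRECT,			/* direct -> larger direct map */
	GROW_DIRECT_TO_INDIRECT,	/* direct -> new indirect map */
	GROW_INDIRECT			/* indirect -> (possibly larger) indirect map */
};

static enum grow_case
classify_growth(unsigned int old_size, unsigned int new_size,
		unsigned int entries_per_block)
{
	int was_indirect = (old_size > entries_per_block);
	int is_indirect  = (new_size > entries_per_block);

	if (!is_indirect)
		return GROW_DIRECT;
	if (!was_indirect)
		return GROW_DIRECT_TO_INDIRECT;
	return GROW_INDIRECT;
}
#endif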
2131
2132dp_offset_t
2133ps_clmap(
2134	vstruct_t	vs,
2135	dp_offset_t	offset,
2136	struct clmap	*clmap,
2137	int		flag,
2138	dp_size_t	size,
2139	int		error)
2140{
2141	dp_offset_t	cluster;	/* The cluster of offset.	*/
2142	dp_offset_t	newcl;		/* The new cluster allocated.	*/
2143	dp_offset_t	newoff;
2144	unsigned int	i;
2145	struct vs_map	*vsmap;
2146
2147	VS_MAP_LOCK(vs);
2148
2149	ASSERT(vs->vs_dmap);
2150	cluster = atop_32(offset) >> vs->vs_clshift;
2151
2152	/*
2153	 * Initialize cluster error value
2154	 */
2155	clmap->cl_error = 0;
2156
2157	/*
2158	 * If the object has grown, extend the page map.
2159	 */
2160	if (cluster >= vs->vs_size) {
2161		if (flag == CL_FIND) {
2162			/* Do not allocate if just doing a lookup */
2163			VS_MAP_UNLOCK(vs);
2164			return (dp_offset_t) -1;
2165		}
2166		if (ps_map_extend(vs, cluster + 1)) {
2167			VS_MAP_UNLOCK(vs);
2168			return (dp_offset_t) -1;
2169		}
2170	}
2171
2172	/*
2173	 * Look for the desired cluster.  If the map is indirect, then we
2174	 * have a two level lookup.  First find the indirect block, then
2175	 * find the actual cluster.  If the indirect block has not yet
2176	 * been allocated, then do so.  If the cluster has not yet been
2177	 * allocated, then do so.
2178	 *
2179	 * If any of the allocations fail, then return an error.
2180	 * Don't allocate if just doing a lookup.
2181	 */
2182	if (vs->vs_indirect) {
2183		long	ind_block = cluster/CLMAP_ENTRIES;
2184
2185		/* Is the indirect block allocated? */
2186		vsmap = vs->vs_imap[ind_block];
2187		if (vsmap == NULL) {
2188			if (flag == CL_FIND) {
2189				VS_MAP_UNLOCK(vs);
2190				return (dp_offset_t) -1;
2191			}
2192
2193			/* Allocate the indirect block */
2194			vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD);
2195			if (vsmap == NULL) {
2196				VS_MAP_UNLOCK(vs);
2197				return (dp_offset_t) -1;
2198			}
2199			/* Initialize the cluster offsets */
2200			for (i = 0; i < CLMAP_ENTRIES; i++)
2201				VSM_CLR(vsmap[i]);
2202			vs->vs_imap[ind_block] = vsmap;
2203		}
2204	} else
2205		vsmap = vs->vs_dmap;
2206
2207	ASSERT(vsmap);
2208	vsmap += cluster%CLMAP_ENTRIES;
2209
2210	/*
2211	 * At this point, vsmap points to the struct vs_map desired.
2212	 *
2213	 * Look in the map for the cluster, if there was an error on a
2214	 * previous write, flag it and return.  If it is not yet
2215	 * allocated, then allocate it, if we're writing; if we're
2216	 * doing a lookup and the cluster's not allocated, return error.
2217	 */
2218	if (VSM_ISERR(*vsmap)) {
2219		clmap->cl_error = VSM_GETERR(*vsmap);
2220		VS_MAP_UNLOCK(vs);
2221		return (dp_offset_t) -1;
2222	} else if (VSM_ISCLR(*vsmap)) {
2223		int psindex;
2224
2225		if (flag == CL_FIND) {
2226			/*
2227			 * If there's an error and the entry is clear, then
2228			 * we've run out of swap space.  Record the error
2229			 * here and return.
2230			 */
2231			if (error) {
2232				VSM_SETERR(*vsmap, error);
2233			}
2234			VS_MAP_UNLOCK(vs);
2235			return (dp_offset_t) -1;
2236		} else {
2237			/*
2238			 * Attempt to allocate a cluster from the paging segment
2239			 */
2240			newcl = ps_allocate_cluster(vs, &psindex,
2241						    PAGING_SEGMENT_NULL);
2242			if (newcl == (dp_offset_t) -1) {
2243				VS_MAP_UNLOCK(vs);
2244				return (dp_offset_t) -1;
2245			}
2246			VSM_CLR(*vsmap);
2247			VSM_SETCLOFF(*vsmap, newcl);
2248			VSM_SETPS(*vsmap, psindex);
2249		}
2250	} else
2251		newcl = VSM_CLOFF(*vsmap);
2252
2253	/*
2254	 * Fill in pertinent fields of the clmap
2255	 */
2256	clmap->cl_ps = VSM_PS(*vsmap);
2257	clmap->cl_numpages = VSCLSIZE(vs);
2258	clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap);
2259
2260	/*
2261	 * Byte offset in paging segment is byte offset to cluster plus
2262	 * byte offset within cluster.  It looks ugly, but should be
2263	 * relatively quick.
2264	 */
2265	ASSERT(trunc_page(offset) == offset);
2266	newcl = ptoa_32(newcl) << vs->vs_clshift;
2267	newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1);
2268	if (flag == CL_ALLOC) {
2269		/*
2270		 * set bits in the allocation bitmap according to which
2271		 * pages were requested.  size is in bytes.
2272		 */
2273		i = atop_32(newoff);
2274		while ((size > 0) && (i < VSCLSIZE(vs))) {
2275			VSM_SETALLOC(*vsmap, i);
2276			i++;
2277			size -= vm_page_size;
2278		}
2279	}
2280	clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap);
2281	if (newoff) {
2282		/*
2283		 * Offset is not cluster aligned, so number of pages
2284		 * and bitmaps must be adjusted
2285		 */
2286		clmap->cl_numpages -= atop_32(newoff);
2287		CLMAP_SHIFT(clmap, vs);
2288		CLMAP_SHIFTALLOC(clmap, vs);
2289	}
2290
2291	/*
2292	 *
2293	 * The setting of valid bits and handling of write errors
2294	 * must be done here, while we hold the lock on the map.
2295	 * It logically should be done in ps_vs_write_complete().
2296	 * The size and error information has been passed from
2297	 * ps_vs_write_complete().  If the size parameter is non-zero,
2298	 * then there is work to be done.  If error is also non-zero,
2299	 * then the error number is recorded in the cluster and the
2300	 * entire cluster is in error.
2301	 */
2302	if (size && flag == CL_FIND) {
2303		dp_offset_t off = (dp_offset_t) 0;
2304
2305		if (!error) {
2306			for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0;
2307			     i++) {
2308				VSM_SETPG(*vsmap, i);
2309				size -= vm_page_size;
2310			}
2311			ASSERT(i <= VSCLSIZE(vs));
2312		} else {
2313			BS_STAT(clmap->cl_ps->ps_bs,
2314				clmap->cl_ps->ps_bs->bs_pages_out_fail +=
2315					atop_32(size));
2316			off = VSM_CLOFF(*vsmap);
2317			VSM_SETERR(*vsmap, error);
2318		}
2319		/*
2320		 * Deallocate cluster if error, and no valid pages
2321		 * already present.
2322		 */
2323		if (off != (dp_offset_t) 0)
2324			ps_deallocate_cluster(clmap->cl_ps, off);
2325		VS_MAP_UNLOCK(vs);
2326		return (dp_offset_t) 0;
2327	} else
2328		VS_MAP_UNLOCK(vs);
2329
2330	DP_DEBUG(DEBUG_VS_INTERNAL,
2331		 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n",
2332		  newcl+newoff, (int) vs, (int) vsmap, flag));
2333	DP_DEBUG(DEBUG_VS_INTERNAL,
2334		 ("	clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n",
2335		  (int) clmap->cl_ps, clmap->cl_numpages,
2336		  (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map));
2337
2338	return (newcl + newoff);
2339}
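/*
 * Worked example (illustrative assumption: 4K pages, vm_page_shift == 12,
 * vs_clshift == 2, i.e. 4 pages == 16KB per cluster) of the offset
 * arithmetic in ps_clmap() above, for an object offset of 0x9000:
 *
 *	cluster = atop_32(0x9000) >> 2           = 9 >> 2 = 2
 *	newcl   = ptoa_32(VSM_CLOFF(*vsmap)) << 2  (the cluster's byte offset
 *	                                            within the paging segment)
 *	newoff  = 0x9000 & ((1 << (12 + 2)) - 1) = 0x1000
 *
 * so the request lands one page into the object's third cluster, and the
 * value returned is that cluster's segment offset plus 0x1000.
 */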
2340
2341void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t);	/* forward */
2342
2343void
2344ps_clunmap(
2345	vstruct_t	vs,
2346	dp_offset_t	offset,
2347	dp_size_t	length)
2348{
2349	dp_offset_t		cluster; /* The cluster number of offset */
2350	struct vs_map		*vsmap;
2351	struct ps_vnode_trim_data trim_data;
2352
2353	ps_vnode_trim_init(&trim_data);
2354
2355	VS_MAP_LOCK(vs);
2356
2357	/*
2358	 * Loop through all clusters in this range, freeing paging segment
2359	 * clusters and map entries as encountered.
2360	 */
2361	while (length > 0) {
2362		dp_offset_t 	newoff;
2363		unsigned int	i;
2364
2365		cluster = atop_32(offset) >> vs->vs_clshift;
2366		if (vs->vs_indirect)	/* indirect map */
2367			vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES];
2368		else
2369			vsmap = vs->vs_dmap;
2370		if (vsmap == NULL) {
2371			ps_vnode_trim_now(&trim_data);
2372			VS_MAP_UNLOCK(vs);
2373			return;
2374		}
2375		vsmap += cluster%CLMAP_ENTRIES;
2376		if (VSM_ISCLR(*vsmap)) {
2377			ps_vnode_trim_now(&trim_data);
2378			length -= vm_page_size;
2379			offset += vm_page_size;
2380			continue;
2381		}
2382		/*
2383		 * We've got a valid mapping.  Clear it and deallocate
2384		 * paging segment cluster pages.
2385		 * Optimize for clearing an entire cluster.
2386		 */
2387		if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) {
2388			/*
2389			 * Not cluster aligned.
2390			 */
2391			ASSERT(trunc_page(newoff) == newoff);
2392			i = atop_32(newoff);
2393		} else
2394			i = 0;
2395		while ((i < VSCLSIZE(vs)) && (length > 0)) {
2396			VSM_CLRPG(*vsmap, i);
2397			VSM_CLRALLOC(*vsmap, i);
2398			length -= vm_page_size;
2399			offset += vm_page_size;
2400			i++;
2401		}
2402
2403		/*
2404		 * If map entry is empty, clear and deallocate cluster.
2405		 */
2406		if (!VSM_BMAP(*vsmap)) {
2407			ps_vnode_trim_more(&trim_data,
2408					      vsmap,
2409					      vs->vs_clshift,
2410					      VSCLSIZE(vs) * vm_page_size);
2411			ps_deallocate_cluster(VSM_PS(*vsmap),
2412					      VSM_CLOFF(*vsmap));
2413			VSM_CLR(*vsmap);
2414		} else {
2415			ps_vnode_trim_now(&trim_data);
2416		}
2417	}
2418	ps_vnode_trim_now(&trim_data);
2419
2420	VS_MAP_UNLOCK(vs);
2421}
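/*
 * Illustrative sketch (not kernel code, not compiled): the per-cluster page
 * bookkeeping performed by the loop above -- clear the page's bit for each
 * page in the freed range, and release the whole cluster only once its
 * bitmap goes empty.  The bitmap width and helper name are hypothetical.
 */
#if 0
static int
clear_page_and_check_empty(unsigned int *page_bmap, unsigned int page_index)
{
	*page_bmap &= ~(1u << page_index);	/* VSM_CLRPG / VSM_CLRALLOC */
	return (*page_bmap == 0);		/* !VSM_BMAP(): cluster is now free */
}
#endif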
2422
2423void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */
2424
2425void
2426ps_vs_write_complete(
2427	vstruct_t	vs,
2428	dp_offset_t	offset,
2429	dp_size_t	size,
2430	int		error)
2431{
2432	struct clmap	clmap;
2433
2434	/*
2435	 * Get the struct vsmap for this cluster.
2436	 * Use CL_FIND, even though it was written, because the
2437	 * cluster MUST be present, unless there was an error
2438	 * in the original ps_clmap (e.g. no space), in which
2439	 * case, nothing happens.
2440	 *
2441	 * Must pass enough information to ps_clmap to allow it
2442	 * to set the vs_map structure bitmap under lock.
2443	 */
2444	(void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error);
2445}
2446
2447void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int);	/* forward */
2448
2449void
2450vs_cl_write_complete(
2451	vstruct_t			vs,
2452	__unused paging_segment_t	ps,
2453	dp_offset_t			offset,
2454	__unused vm_offset_t		addr,
2455	dp_size_t			size,
2456	boolean_t			async,
2457	int				error)
2458{
2459//	kern_return_t	kr;
2460
2461	if (error) {
2462		/*
2463		 * For internal objects, the error is recorded on a
2464		 * per-cluster basis by ps_clmap() which is called
2465		 * by ps_vs_write_complete() below.
2466		 */
2467		dprintf(("write failed error = 0x%x\n", error));
2468		/* add upl_abort code here */
2469	} else
2470		GSTAT(global_stats.gs_pages_out += atop_32(size));
2471	/*
2472	 * Notify the vstruct mapping code, so it can do its accounting.
2473	 */
2474	ps_vs_write_complete(vs, offset, size, error);
2475
2476	if (async) {
2477		VS_LOCK(vs);
2478		ASSERT(vs->vs_async_pending > 0);
2479		vs->vs_async_pending -= size;
2480		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
2481			vs->vs_waiting_async = FALSE;
2482			VS_UNLOCK(vs);
2483			thread_wakeup(&vs->vs_async_pending);
2484		} else {
2485			VS_UNLOCK(vs);
2486		}
2487	}
2488}
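/*
 * Illustrative sketch (not kernel code, not compiled): the completion
 * pattern used above -- decrement a pending count under the object lock and
 * wake any thread that flagged itself as waiting for the count to drain.
 * POSIX primitives are used here purely as an analogy for VS_LOCK()/
 * thread_wakeup(); the type and function names are hypothetical.
 */
#if 0
#include <pthread.h>

struct pending_model {
	pthread_mutex_t	lock;
	pthread_cond_t	drained;
	long		count;
	int		waiting;
};

static void
pending_complete(struct pending_model *p, long n)
{
	pthread_mutex_lock(&p->lock);
	p->count -= n;
	if (p->count == 0 && p->waiting) {
		p->waiting = 0;
		pthread_cond_broadcast(&p->drained);
	}
	pthread_mutex_unlock(&p->lock);
}
#endif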
2489
2490#ifdef DEVICE_PAGING
2491kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2492
2493kern_return_t
2494device_write_reply(
2495	MACH_PORT_FACE	reply_port,
2496	kern_return_t	device_code,
2497	io_buf_len_t	bytes_written)
2498{
2499	struct vs_async	*vsa;
2500
2501	vsa = (struct vs_async *)
2502		((struct vstruct_alias *)(reply_port->ip_alias))->vs;
2503
2504	if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) {
2505		device_code = KERN_FAILURE;
2506	}
2507
2508	vsa->vsa_error = device_code;
2509
2510
2511	ASSERT(vsa->vsa_vs != VSTRUCT_NULL);
2512	if(vsa->vsa_flags & VSA_TRANSFER) {
2513		/* revisit when async disk segments redone */
2514		if(vsa->vsa_error) {
2515		   /* need to consider error condition.  re-write data or */
2516		   /* throw it away here. */
2517		   vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr);
2518		}
2519		ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset,
2520						vsa->vsa_size, vsa->vsa_error);
2521	} else {
2522		vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset,
2523			     vsa->vsa_addr, vsa->vsa_size, TRUE,
2524			     vsa->vsa_error);
2525	}
2526	VS_FREE_ASYNC(vsa);
2527
2528	return KERN_SUCCESS;
2529}
2530
2531kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2532kern_return_t
2533device_write_reply_inband(
2534	MACH_PORT_FACE		reply_port,
2535	kern_return_t		return_code,
2536	io_buf_len_t		bytes_written)
2537{
2538	panic("device_write_reply_inband: illegal");
2539	return KERN_SUCCESS;
2540}
2541
2542kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t);
2543kern_return_t
2544device_read_reply(
2545	MACH_PORT_FACE		reply_port,
2546	kern_return_t		return_code,
2547	io_buf_ptr_t		data,
2548	mach_msg_type_number_t	dataCnt)
2549{
2550	struct vs_async	*vsa;
2551	vsa = (struct vs_async *)
2552		((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs;
2553	vsa->vsa_addr = (vm_offset_t)data;
2554	vsa->vsa_size = (vm_size_t)dataCnt;
2555	vsa->vsa_error = return_code;
2556	thread_wakeup(&vsa);
2557	return KERN_SUCCESS;
2558}
2559
2560kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t);
2561kern_return_t
2562device_read_reply_inband(
2563	MACH_PORT_FACE		reply_port,
2564	kern_return_t		return_code,
2565	io_buf_ptr_inband_t	data,
2566	mach_msg_type_number_t	dataCnt)
2567{
2568	panic("device_read_reply_inband: illegal");
2569	return KERN_SUCCESS;
2570}
2571
2572kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t);
2573kern_return_t
2574device_read_reply_overwrite(
2575	MACH_PORT_FACE		reply_port,
2576	kern_return_t		return_code,
2577	io_buf_len_t		bytes_read)
2578{
2579	panic("device_read_reply_overwrite: illegal\n");
2580	return KERN_SUCCESS;
2581}
2582
2583kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE);
2584kern_return_t
2585device_open_reply(
2586	MACH_PORT_FACE		reply_port,
2587	kern_return_t		return_code,
2588	MACH_PORT_FACE		device_port)
2589{
2590	panic("device_open_reply: illegal\n");
2591	return KERN_SUCCESS;
2592}
2593
2594kern_return_t
2595ps_read_device(
2596	paging_segment_t	ps,
2597	dp_offset_t		offset,
2598	vm_offset_t		*bufferp,
2599	unsigned int		size,
2600	unsigned int		*residualp,
2601	int 			flags)
2602{
2603	kern_return_t	kr;
2604	recnum_t	dev_offset;
2605	unsigned int	bytes_wanted;
2606	unsigned int	bytes_read;
2607	unsigned int	total_read;
2608	vm_offset_t	dev_buffer;
2609	vm_offset_t	buf_ptr;
2610	unsigned int	records_read;
2611	struct vs_async *vsa;
2612
2613	device_t	device;
2614	vm_map_copy_t	device_data = NULL;
2615	default_pager_thread_t *dpt = NULL;
2616
2617	device = dev_port_lookup(ps->ps_device);
2618	clustered_reads[atop_32(size)]++;
2619
2620	dev_offset = (ps->ps_offset +
2621		      (offset >> (vm_page_shift - ps->ps_record_shift)));
2622	bytes_wanted = size;
2623	total_read = 0;
2624	*bufferp = (vm_offset_t)NULL;
2625
2626	do {
2627		vsa = VS_ALLOC_ASYNC();
2628		if (vsa) {
2629			vsa->vsa_vs = NULL;
2630			vsa->vsa_addr = 0;
2631			vsa->vsa_offset = 0;
2632			vsa->vsa_size = 0;
2633			vsa->vsa_ps = NULL;
2634		}
2635		ip_lock(vsa->reply_port);
2636		vsa->reply_port->ip_sorights++;
2637		ip_reference(vsa->reply_port);
2638		ip_unlock(vsa->reply_port);
2639		kr = ds_device_read_common(device,
2640				 vsa->reply_port,
2641			         (mach_msg_type_name_t)
2642					MACH_MSG_TYPE_MOVE_SEND_ONCE,
2643				 (dev_mode_t) 0,
2644				 dev_offset,
2645				 bytes_wanted,
2646				 (IO_READ | IO_CALL),
2647				 (io_buf_ptr_t *) &dev_buffer,
2648				 (mach_msg_type_number_t *) &bytes_read);
2649		if(kr == MIG_NO_REPLY) {
2650			assert_wait(&vsa, THREAD_UNINT);
2651			thread_block(THREAD_CONTINUE_NULL);
2652
2653			dev_buffer = vsa->vsa_addr;
2654			bytes_read = (unsigned int)vsa->vsa_size;
2655			kr = vsa->vsa_error;
2656		}
2657		VS_FREE_ASYNC(vsa);
2658		if (kr != KERN_SUCCESS || bytes_read == 0) {
2659			break;
2660		}
2661		total_read += bytes_read;
2662
2663		/*
2664		 * If we got the entire range, use the returned dev_buffer.
2665		 */
2666		if (bytes_read == size) {
2667			*bufferp = (vm_offset_t)dev_buffer;
2668			break;
2669		}
2670
2671#if 1
2672		dprintf(("read only %d bytes out of %d\n",
2673			 bytes_read, bytes_wanted));
2674#endif
2675		if(dpt == NULL) {
2676			dpt = get_read_buffer();
2677			buf_ptr = dpt->dpt_buffer;
2678			*bufferp = (vm_offset_t)buf_ptr;
2679		}
2680		/*
2681		 * Otherwise, copy the data into the provided buffer (*bufferp)
2682		 * and append the rest of the range as it comes in.
2683		 */
2684		memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read);
2685		buf_ptr += bytes_read;
2686		bytes_wanted -= bytes_read;
2687		records_read = (bytes_read >>
2688				(vm_page_shift - ps->ps_record_shift));
2689		dev_offset += records_read;
2690		DP_DEBUG(DEBUG_VS_INTERNAL,
2691			 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n",
2692			  dev_buffer, bytes_read));
2693		if (vm_deallocate(kernel_map, dev_buffer, bytes_read)
2694		    != KERN_SUCCESS)
2695			Panic("dealloc buf");
2696	} while (bytes_wanted);
2697
2698	*residualp = size - total_read;
2699	if((dev_buffer != *bufferp) && (total_read != 0)) {
2700		vm_offset_t temp_buffer;
2701		vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE);
2702		memcpy((void *) temp_buffer, (void *) *bufferp, total_read);
2703		if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read,
2704			VM_MAP_COPYIN_OPT_SRC_DESTROY |
2705			VM_MAP_COPYIN_OPT_STEAL_PAGES |
2706			VM_MAP_COPYIN_OPT_PMAP_ENTER,
2707			(vm_map_copy_t *)&device_data, FALSE))
2708				panic("ps_read_device: cannot copyin locally provided buffer\n");
2709	}
2710	else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){
2711		if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read,
2712			VM_MAP_COPYIN_OPT_SRC_DESTROY |
2713			VM_MAP_COPYIN_OPT_STEAL_PAGES |
2714			VM_MAP_COPYIN_OPT_PMAP_ENTER,
2715			(vm_map_copy_t *)&device_data, FALSE))
2716				panic("ps_read_device: cannot copyin backing store provided buffer\n");
2717	}
2718	else {
2719		device_data = NULL;
2720	}
2721	*bufferp = (vm_offset_t)device_data;
2722
2723	if(dpt != NULL) {
2724		/* Free the receive buffer */
2725		dpt->checked_out = 0;
2726		thread_wakeup(&dpt_array);
2727	}
2728	return KERN_SUCCESS;
2729}
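/*
 * Illustrative sketch (not kernel code, not compiled): the short-read loop
 * above in miniature -- keep issuing reads, appending whatever arrives to a
 * staging buffer and advancing the device offset by the records actually
 * read, until the request is satisfied or the device returns nothing.  The
 * read_fn callback, rec_shift parameter and names are hypothetical.
 */
#if 0
typedef unsigned int (*read_fn_t)(unsigned long long rec_off,
				  void *buf, unsigned int want);

static unsigned int
read_all(read_fn_t dev_read, unsigned long long rec_off,
	 unsigned int rec_shift, void *dst, unsigned int size)
{
	unsigned int total = 0;

	while (total < size) {
		unsigned int got = dev_read(rec_off, (char *)dst + total,
					    size - total);
		if (got == 0)
			break;			/* error or end of device */
		total   += got;
		rec_off += got >> rec_shift;	/* advance in records, not bytes */
	}
	return total;				/* residual == size - total */
}
#endif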
2730
2731kern_return_t
2732ps_write_device(
2733	paging_segment_t	ps,
2734	dp_offset_t		offset,
2735	vm_offset_t		addr,
2736	unsigned int		size,
2737	struct vs_async		*vsa)
2738{
2739	recnum_t	dev_offset;
2740	io_buf_len_t	bytes_to_write, bytes_written;
2741	recnum_t	records_written;
2742	kern_return_t	kr;
2743	MACH_PORT_FACE	reply_port;
2744
2745
2746
2747	clustered_writes[atop_32(size)]++;
2748
2749	dev_offset = (ps->ps_offset +
2750		      (offset >> (vm_page_shift - ps->ps_record_shift)));
2751	bytes_to_write = size;
2752
2753	if (vsa) {
2754		/*
2755		 * Asynchronous write.
2756		 */
2757		reply_port = vsa->reply_port;
2758		ip_lock(reply_port);
2759		reply_port->ip_sorights++;
2760		ip_reference(reply_port);
2761		ip_unlock(reply_port);
2762		{
2763		device_t	device;
2764		device = dev_port_lookup(ps->ps_device);
2765
2766		vsa->vsa_addr = addr;
2767		kr=ds_device_write_common(device,
2768			reply_port,
2769			(mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE,
2770			(dev_mode_t) 0,
2771			dev_offset,
2772			(io_buf_ptr_t)	addr,
2773			size,
2774			(IO_WRITE | IO_CALL),
2775			&bytes_written);
2776		}
2777		if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) {
2778			if (verbose)
2779				dprintf(("%s0x%x, addr=0x%x,"
2780					 "size=0x%x,offset=0x%x\n",
2781					 "device_write_request returned ",
2782					 kr, addr, size, offset));
2783			BS_STAT(ps->ps_bs,
2784				ps->ps_bs->bs_pages_out_fail += atop_32(size));
2785			/* do the completion notification to free resources */
2786			device_write_reply(reply_port, kr, 0);
2787			return PAGER_ERROR;
2788		}
2789	} else do {
2790		/*
2791		 * Synchronous write.
2792		 */
2793		{
2794		device_t	device;
2795		device = dev_port_lookup(ps->ps_device);
2796		kr=ds_device_write_common(device,
2797			IP_NULL, 0,
2798			(dev_mode_t) 0,
2799			dev_offset,
2800			(io_buf_ptr_t)	addr,
2801			size,
2802			(IO_WRITE | IO_SYNC | IO_KERNEL_BUF),
2803			&bytes_written);
2804		}
2805		if (kr != KERN_SUCCESS) {
2806			dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n",
2807				 "device_write returned ",
2808				 kr, addr, size, offset));
2809			BS_STAT(ps->ps_bs,
2810				ps->ps_bs->bs_pages_out_fail += atop_32(size));
2811			return PAGER_ERROR;
2812		}
2813		if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1))
2814			Panic("fragmented write");
2815		records_written = (bytes_written >>
2816				   (vm_page_shift - ps->ps_record_shift));
2817		dev_offset += records_written;
2818#if 1
2819		if (bytes_written != bytes_to_write) {
2820			dprintf(("wrote only %d bytes out of %d\n",
2821				 bytes_written, bytes_to_write));
2822		}
2823#endif
2824		bytes_to_write -= bytes_written;
2825		addr += bytes_written;
2826	} while (bytes_to_write > 0);
2827
2828	return PAGER_SUCCESS;
2829}
2830
2831
2832#else /* !DEVICE_PAGING */
2833
2834kern_return_t
2835ps_read_device(
2836	__unused paging_segment_t	ps,
2837	__unused dp_offset_t		offset,
2838	__unused vm_offset_t		*bufferp,
2839	__unused unsigned int		size,
2840	__unused unsigned int		*residualp,
2841	__unused int 				flags)
2842{
2843  panic("ps_read_device not supported");
2844  return KERN_FAILURE;
2845}
2846
2847kern_return_t
2848ps_write_device(
2849	__unused paging_segment_t	ps,
2850	__unused dp_offset_t		offset,
2851	__unused vm_offset_t		addr,
2852	__unused unsigned int		size,
2853	__unused struct vs_async	*vsa)
2854{
2855  panic("ps_write_device not supported");
2856  return KERN_FAILURE;
2857}
2858
2859#endif /* DEVICE_PAGING */
2860void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t);	/* forward */
2861
2862void
2863pvs_object_data_provided(
2864	__unused vstruct_t		vs,
2865	__unused upl_t			upl,
2866	__unused upl_offset_t	offset,
2867	upl_size_t				size)
2868{
2869#if	RECLAIM_SWAP
2870	boolean_t	empty;
2871#endif
2872
2873	DP_DEBUG(DEBUG_VS_INTERNAL,
2874		 ("buffer=0x%x,offset=0x%x,size=0x%x\n",
2875		  upl, offset, size));
2876
2877	ASSERT(size > 0);
2878	GSTAT(global_stats.gs_pages_in += atop_32(size));
2879
2880/* check upl iosync flag instead of using RECLAIM_SWAP */
2881#if	RECLAIM_SWAP
2882	if (size != upl->size) {
2883		if (size) {
2884			ps_clunmap(vs, offset, size);
2885			upl_commit_range(upl, 0, size, 0, NULL, 0, &empty);
2886		}
2887		upl_abort(upl, UPL_ABORT_ERROR);
2888		upl_deallocate(upl);
2889	} else {
2890		ps_clunmap(vs, offset, size);
2891		upl_commit(upl, NULL, 0);
2892		upl_deallocate(upl);
2893	}
2894#endif	/* RECLAIM_SWAP */
2895
2896}
2897
2898static memory_object_offset_t   last_start;
2899static vm_size_t		last_length;
2900
2901/*
2902 * A "cnt" of 0 means that the caller just wants to check if the page at
2903 * offset "vs_offset" exists in the backing store.  That page hasn't been
2904 * prepared, so no need to release it.
2905 *
2906 * A "cnt" of -1 means that the caller wants to bring back from the backing
2907 * store all existing pages in the cluster containing "vs_offset".
2908 */
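/*
 * Usage sketch (illustrative only; the variables below are assumed to be in
 * scope at a hypothetical call site) of the two special "cnt" values
 * described above.
 */
#if 0
	/* probe: is the page at vs_offset present in the backing store? */
	kr = pvs_cluster_read(vs, vs_offset, 0, (void *) fault_info);

	/* reclaim: bring back every existing page of the enclosing cluster */
	kr = pvs_cluster_read(vs, vs_offset, (dp_size_t) -1, (void *) fault_info);
#endif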
2909kern_return_t
2910pvs_cluster_read(
2911	vstruct_t	vs,
2912	dp_offset_t	vs_offset,
2913	dp_size_t	cnt,
2914        void		*fault_info)
2915{
2916	kern_return_t		error = KERN_SUCCESS;
2917	unsigned int		size;
2918	unsigned int		residual;
2919	unsigned int		request_flags;
2920	int			io_flags = 0;
2921	int			seg_index;
2922	int			pages_in_cl;
2923	int	                cl_size;
2924	int	                cl_mask;
2925	int			cl_index;
2926	unsigned int		xfer_size;
2927	dp_offset_t		orig_vs_offset;
2928	dp_offset_t		ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2929	paging_segment_t        psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT];
2930	struct clmap		clmap;
2931	upl_t			upl;
2932	unsigned int		page_list_count;
2933	memory_object_offset_t	cluster_start;
2934	vm_size_t		cluster_length;
2935	uint32_t		io_streaming;
2936	int			i;
2937	boolean_t		io_sync = FALSE;
2938	boolean_t		reclaim_all = FALSE;
2939
2940	pages_in_cl = 1 << vs->vs_clshift;
2941	cl_size = pages_in_cl * vm_page_size;
2942	cl_mask = cl_size - 1;
2943
2944	request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE;
2945
2946	if (cnt == (dp_size_t) -1)
2947		reclaim_all = TRUE;
2948
2949	if (reclaim_all == TRUE) {
2950		/*
2951		 * We've been called from ps_vstruct_reclaim() to move all
2952		 * the object's swapped pages back to VM pages.
2953		 * This can put memory pressure on the system, so we do want
2954		 * to wait for free pages, to avoid getting in the way of the
2955		 * vm_pageout_scan() thread.
2956		 * Let's not use UPL_NOBLOCK in this case.
2957		 */
2958		vs_offset &= ~cl_mask;
2959		i = pages_in_cl;
2960	} else {
2961		i = 1;
2962
2963		/*
2964		 * if the I/O cluster size == PAGE_SIZE, we don't want to set
2965		 * the UPL_NOBLOCK since we may be trying to recover from a
2966		 * previous partial pagein I/O that occurred because we were low
2967		 * on memory and bailed early in order to honor the UPL_NOBLOCK...
2968		 * since we're only asking for a single page, we can block w/o fear
2969		 * of tying up pages while waiting for more to become available
2970		 */
2971		if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE)
2972			request_flags |= UPL_NOBLOCK;
2973	}
2974
2975again:
2976	cl_index = (vs_offset & cl_mask) / vm_page_size;
2977
2978        if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) ||
2979	    !CLMAP_ISSET(clmap, cl_index)) {
2980	        /*
2981		 * the needed page doesn't exist in the backing store...
2982		 * we don't want to try to do any I/O, just abort the
2983		 * page and let the fault handler provide a zero-fill
2984		 */
2985		if (cnt == 0) {
2986			/*
2987			 * The caller was just poking at us to see if
2988			 * the page has been paged out.  No need to
2989			 * mess with the page at all.
2990			 * Just let the caller know we don't have that page.
2991			 */
2992			return KERN_FAILURE;
2993		}
2994		if (reclaim_all == TRUE) {
2995			i--;
2996			if (i == 0) {
2997				/* no more pages in this cluster */
2998				return KERN_FAILURE;
2999			}
3000			/* try the next page in this cluster */
3001			vs_offset += vm_page_size;
3002			goto again;
3003		}
3004
3005		page_list_count = 0;
3006
3007		memory_object_super_upl_request(vs->vs_control,	(memory_object_offset_t)vs_offset,
3008						PAGE_SIZE, PAGE_SIZE,
3009						&upl, NULL, &page_list_count,
3010						request_flags  | UPL_SET_INTERNAL);
3011		upl_range_needed(upl, 0, 1);
3012
3013		if (clmap.cl_error)
3014		        upl_abort(upl, UPL_ABORT_ERROR);
3015		else
3016		        upl_abort(upl, UPL_ABORT_UNAVAILABLE);
3017		upl_deallocate(upl);
3018
3019		return KERN_SUCCESS;
3020	}
3021
3022	if (cnt == 0) {
3023		/*
3024		 * The caller was just poking at us to see if
3025		 * the page has been paged out.  No need to
3026		 * mess with the page at all.
3027		 * Just let the caller know we do have that page.
3028		 */
3029		return KERN_SUCCESS;
3030	}
3031
3032	if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) {
3033		io_sync = TRUE;
3034	} else {
3035#if RECLAIM_SWAP
3036		io_sync = TRUE;
3037#endif	/* RECLAIM_SWAP */
3038	}
3039
3040	if( io_sync == TRUE ) {
3041
3042		io_flags |= UPL_IOSYNC | UPL_NOCOMMIT;
3043#if USE_PRECIOUS
3044		request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE;
3045#else	/* USE_PRECIOUS */
3046		request_flags |= UPL_REQUEST_SET_DIRTY;
3047#endif	/* USE_PRECIOUS */
3048	}
3049
3050	assert(dp_encryption_inited);
3051	if (dp_encryption) {
3052		/*
3053		 * ENCRYPTED SWAP:
3054		 * request that the UPL be prepared for
3055		 * decryption.
3056		 */
3057		request_flags |= UPL_ENCRYPT;
3058		io_flags |= UPL_PAGING_ENCRYPTED;
3059	}
3060	orig_vs_offset = vs_offset;
3061
3062	assert(cnt != 0);
3063	cnt = VM_SUPER_CLUSTER;
3064	cluster_start = (memory_object_offset_t) vs_offset;
3065	cluster_length = (vm_size_t) cnt;
3066	io_streaming = 0;
3067
3068	/*
3069	 * determine how big a speculative I/O we should try for...
3070	 */
3071	if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) {
3072		assert(vs_offset >= (dp_offset_t) cluster_start &&
3073		       vs_offset < (dp_offset_t) (cluster_start + cluster_length));
3074	        vs_offset = (dp_offset_t) cluster_start;
3075		cnt = (dp_size_t) cluster_length;
3076	} else {
3077		cluster_length = PAGE_SIZE;
3078	        cnt = PAGE_SIZE;
3079	}
3080
3081	if (io_streaming)
3082                io_flags |= UPL_IOSTREAMING;
3083
3084	last_start = cluster_start;
3085	last_length = cluster_length;
3086
3087	/*
3088	 * This loop will be executed multiple times until the entire
3089	 * range has been looked at or we issue an I/O... if the request spans cluster
3090	 * boundaries, the clusters will be checked for logical continuity,
3091	 * if contiguous the I/O request will span multiple clusters...
3092	 * at most only 1 I/O will be issued... it will encompass the original offset
3093	 */
3094	while (cnt && error == KERN_SUCCESS) {
3095	        int     ps_info_valid;
3096
3097		if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) {
3098			size = VM_SUPER_CLUSTER;
3099			size -= vs_offset & cl_mask;
3100	        } else if (cnt > VM_SUPER_CLUSTER)
3101		        size = VM_SUPER_CLUSTER;
3102		else
3103		        size = cnt;
3104
3105		cnt -= size;
3106
3107		ps_info_valid = 0;
3108		seg_index     = 0;
3109
3110		while (size > 0 && error == KERN_SUCCESS) {
3111		        unsigned int  abort_size;
3112			unsigned int  lsize;
3113			int           failed_size;
3114			int           beg_pseg;
3115			int           beg_indx;
3116			dp_offset_t   cur_offset;
3117
3118			if ( !ps_info_valid) {
3119			        ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3120				psp[seg_index]       = CLMAP_PS(clmap);
3121				ps_info_valid = 1;
3122			}
3123		        /*
3124			 * skip over unallocated physical segments
3125			 */
3126			if (ps_offset[seg_index] == (dp_offset_t) -1) {
3127				abort_size = cl_size - (vs_offset & cl_mask);
3128				abort_size = MIN(abort_size, size);
3129
3130				size      -= abort_size;
3131				vs_offset += abort_size;
3132
3133				seg_index++;
3134				ps_info_valid = 0;
3135
3136				continue;
3137			}
3138			cl_index = (vs_offset & cl_mask) / vm_page_size;
3139
3140			for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) {
3141			        /*
3142				 * skip over unallocated pages
3143				 */
3144			        if (CLMAP_ISSET(clmap, cl_index))
3145				        break;
3146				abort_size += vm_page_size;
3147			}
3148			if (abort_size) {
3149				size      -= abort_size;
3150				vs_offset += abort_size;
3151
3152				if (cl_index == pages_in_cl) {
3153				        /*
3154					 * if we're at the end of this physical cluster
3155					 * then bump to the next one and continue looking
3156					 */
3157				        seg_index++;
3158					ps_info_valid = 0;
3159
3160					continue;
3161				}
3162				if (size == 0)
3163				        break;
3164			}
3165			/*
3166			 * remember the starting point of the first allocated page
3167			 * for the I/O we're about to issue
3168			 */
3169			beg_pseg   = seg_index;
3170			beg_indx   = cl_index;
3171			cur_offset = vs_offset;
3172
3173			/*
3174			 * calculate the size of the I/O that we can do...
3175			 * this may span multiple physical segments if
3176			 * they are contiguous
3177			 */
3178			for (xfer_size = 0; xfer_size < size; ) {
3179
3180			        while (cl_index < pages_in_cl && xfer_size < size) {
3181				        /*
3182					 * accumulate allocated pages within
3183					 * a physical segment
3184					 */
3185				        if (CLMAP_ISSET(clmap, cl_index)) {
3186					        xfer_size  += vm_page_size;
3187						cur_offset += vm_page_size;
3188						cl_index++;
3189
3190						BS_STAT(psp[seg_index]->ps_bs,
3191							psp[seg_index]->ps_bs->bs_pages_in++);
3192					} else
3193					        break;
3194				}
3195				if (cl_index < pages_in_cl || xfer_size >= size) {
3196				        /*
3197					 * we've hit an unallocated page or
3198					 * the end of this request... see if
3199					 * it's time to fire the I/O
3200					 */
3201				        break;
3202				}
3203				/*
3204				 * we've hit the end of the current physical
3205				 * segment and there's more to do, so try
3206				 * moving to the next one
3207				 */
3208				seg_index++;
3209
3210				ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0);
3211				psp[seg_index] = CLMAP_PS(clmap);
3212				ps_info_valid = 1;
3213
3214				if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) {
3215				        /*
3216					 * if the physical segment we're about
3217					 * to step into is not contiguous to
3218					 * the one we're currently in, or it's
3219					 * in a different paging file, or
3220					 * it hasn't been allocated....
3221					 * we stop this run and go check
3222					 * to see if it's time to fire the I/O
3223					 */
3224				        break;
3225				}
3226				/*
3227				 * start with first page of the next physical
3228				 * segment
3229				 */
3230				cl_index = 0;
3231			}
3232			if (xfer_size == 0) {
3233			        /*
3234				 * no I/O to generate for this segment
3235				 */
3236			        continue;
3237			}
3238			if (cur_offset <= orig_vs_offset) {
3239			        /*
3240				 * we've hit a hole in our speculative cluster
3241				 * before the offset that we're really after...
3242				 * don't issue the I/O since it doesn't encompass
3243				 * the original offset and we're looking to only
3244				 * pull in the speculative pages if they can be
3245				 * made part of a single I/O
3246				 */
3247			        size      -= xfer_size;
3248				vs_offset += xfer_size;
3249
3250				continue;
3251			}
3252			/*
3253			 * we have a contiguous range of allocated pages
3254			 * to read from that encompasses the original offset
3255			 */
3256			page_list_count = 0;
3257			memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset,
3258							xfer_size, xfer_size,
3259							&upl, NULL, &page_list_count,
3260							request_flags | UPL_SET_INTERNAL);
3261
3262			error = ps_read_file(psp[beg_pseg],
3263					     upl, (upl_offset_t) 0,
3264					     ps_offset[beg_pseg] + (beg_indx * vm_page_size),
3265					     xfer_size, &residual, io_flags);
3266
3267
3268			/*
3269			 * Adjust counts and send response to VM.  Optimize
3270			 * for the common case, i.e. no error and/or partial
3271			 * data. If there was an error, then we need to error
3272			 * the entire range, even if some data was successfully
3273			 * read. If there was a partial read we may supply some
3274			 * data and may error some as well.  In all cases the
3275			 * VM must receive some notification for every page
3276			 * in the range.
3277			 */
3278			if ((error == KERN_SUCCESS) && (residual == 0)) {
3279			        /*
3280				 * Got everything we asked for, supply the data
3281				 * to the VM.  Note that as a side effect of
3282				 * supplying the data, the buffer holding the
3283				 * supplied data is deallocated from the pager's
3284				 *  address space.
3285				 */
3286				lsize = xfer_size;
3287				failed_size = 0;
3288			} else {
3289				lsize = 0;
3290			        failed_size = xfer_size;
3291
3292				if (error == KERN_SUCCESS) {
3293				        if (residual == xfer_size) {
3294					        /*
3295						 * If a read operation returns no error
3296						 * and no data moved, we turn it into
3297						 * an error, assuming we're reading at
3298						 * or beyond EOF.
3299						 * Fall through and error the entire range.
3300						 */
3301					        error = KERN_FAILURE;
3302					} else {
3303					        /*
3304						 * Otherwise, we have a partial read.  If
3305						 * the part read is an integral number
3306						 * of pages, supply it.  Otherwise round
3307						 * it up to a page boundary, zero-fill
3308						 * the unread part, and supply it.
3309						 * Fall through and error the remainder
3310						 * of the range, if any.
3311						 */
3312					        int fill;
3313
3314						fill = residual & (vm_page_size - 1);
3315						lsize = (xfer_size - residual) + fill;
3316
3317						if (lsize < xfer_size)
3318						        failed_size = xfer_size - lsize;
3319
3320						if (reclaim_all == FALSE)
3321							error = KERN_FAILURE;
3322					}
3323				}
3324			}
3325			pvs_object_data_provided(vs, upl, vs_offset, lsize);
3326
3327			if (failed_size) {
3328			        /*
3329				 * There was an error in some part of the range, tell
3330				 * the VM. Note that error is explicitly checked again
3331				 * since it can be modified above.
3332				 */
3333				BS_STAT(psp[beg_pseg]->ps_bs,
3334					psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size));
3335			}
3336			/*
3337			 * we've issued a single I/O that encompassed the original offset
3338			 * at this point we either met our speculative request length or
3339			 * we ran into a 'hole' (i.e. page not present in the cluster, cluster
3340			 * not present or not physically contiguous to the previous one), so
3341			 * we're done issuing I/O at this point
3342			 */
3343			return (error);
3344		}
3345	}
3346	return error;
3347}
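/*
 * Worked example (illustrative assumption: 4K pages) of the partial-read
 * fix-up above.  Suppose xfer_size is 0x8000 (8 pages) and the read comes
 * back with residual == 0x2800 (2.5 pages unread):
 *
 *	fill        = 0x2800 & (0x1000 - 1)      = 0x800
 *	lsize       = (0x8000 - 0x2800) + 0x800  = 0x6000  (6 pages supplied,
 *	                                                    the tail of the 6th
 *	                                                    page zero-filled)
 *	failed_size = 0x8000 - 0x6000            = 0x2000  (2 pages errored)
 */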
3348
3349int vs_do_async_write = 1;
3350
3351kern_return_t
3352vs_cluster_write(
3353	vstruct_t	vs,
3354	upl_t		internal_upl,
3355	upl_offset_t	offset,
3356	upl_size_t	cnt,
3357	boolean_t	dp_internal,
3358	int 		flags)
3359{
3360	upl_size_t	transfer_size;
3361	int		error = 0;
3362	struct clmap	clmap;
3363
3364	dp_offset_t	actual_offset;	/* Offset within paging segment */
3365	paging_segment_t ps;
3366	dp_offset_t	mobj_base_addr;
3367	dp_offset_t	mobj_target_addr;
3368
3369	upl_t		upl;
3370	upl_page_info_t *pl;
3371	int		page_index;
3372	unsigned int	page_max_index;
3373	int		list_size;
3374	int		pages_in_cl;
3375	unsigned int	cl_size;
3376	int             base_index;
3377	unsigned int	seg_size;
3378	unsigned int	upl_offset_in_object;
3379	boolean_t	minimal_clustering = FALSE;
3380	boolean_t	found_dirty;
3381
3382	if (!dp_encryption_inited) {
3383		/*
3384		 * ENCRYPTED SWAP:
3385		 * Once we've started using swap, we
3386		 * can't change our mind on whether
3387		 * it needs to be encrypted or
3388		 * not.
3389		 */
3390		dp_encryption_inited = TRUE;
3391	}
3392	if (dp_encryption) {
3393		/*
3394		 * ENCRYPTED SWAP:
3395		 * the UPL will need to be encrypted...
3396		 */
3397		flags |= UPL_PAGING_ENCRYPTED;
3398	}
3399
3400	pages_in_cl = 1 << vs->vs_clshift;
3401	cl_size = pages_in_cl * vm_page_size;
3402
3403#if CONFIG_FREEZE
3404	minimal_clustering = TRUE;
3405#else
3406	if (dp_isssd == TRUE)
3407		minimal_clustering = TRUE;
3408#endif
3409	if (!dp_internal) {
3410		unsigned int page_list_count;
3411		int	     request_flags;
3412		unsigned int super_size;
3413		int          first_dirty;
3414		int          num_dirty;
3415		int          num_of_pages;
3416		int          seg_index;
3417		upl_offset_t  upl_offset;
3418		upl_offset_t  upl_offset_aligned;
3419		dp_offset_t  seg_offset;
3420		dp_offset_t  ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3421		paging_segment_t   psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1];
3422
3423
3424		if (bs_low)
3425			super_size = cl_size;
3426		else
3427			super_size = VM_SUPER_CLUSTER;
3428
3429		request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE |
3430			        UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM |
3431				UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE;
3432
3433		if (dp_encryption) {
3434			/*
3435			 * ENCRYPTED SWAP:
3436			 * request that the UPL be prepared for
3437			 * encryption.
3438			 */
3439			request_flags |= UPL_ENCRYPT;
3440			flags |= UPL_PAGING_ENCRYPTED;
3441		}
3442
3443		page_list_count = 0;
3444		memory_object_super_upl_request(vs->vs_control,
3445				(memory_object_offset_t)offset,
3446				cnt, super_size,
3447				&upl, NULL, &page_list_count,
3448				request_flags | UPL_FOR_PAGEOUT);
3449
3450		/*
3451		 * The default pager does not handle objects larger than
3452		 * 4GB, so it does not deal with offsets that don't fit in
3453		 * 32 bits.  Cast upl->offset down now and make sure we
3454		 * did not lose any significant bits.
3455		 */
3456		upl_offset_in_object = (unsigned int) upl->offset;
3457		assert(upl->offset == upl_offset_in_object);
3458
3459		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3460
3461		seg_size = cl_size - (upl_offset_in_object % cl_size);
3462		upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1);
3463		page_index = 0;
3464		page_max_index = upl->size / PAGE_SIZE;
3465		found_dirty = TRUE;
3466
3467		for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) {
3468
3469			unsigned int	seg_pgcnt;
3470
3471			seg_pgcnt = seg_size / PAGE_SIZE;
3472
3473			if (minimal_clustering == TRUE) {
3474				unsigned int	non_dirty;
3475
3476				non_dirty = 0;
3477				found_dirty = FALSE;
3478
3479				for (; non_dirty < seg_pgcnt; non_dirty++) {
3480					if ((page_index + non_dirty) >= page_max_index)
3481						break;
3482
3483					if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) ||
3484					    UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) {
3485						found_dirty = TRUE;
3486						break;
3487					}
3488				}
3489			}
3490			if (found_dirty == TRUE) {
3491				ps_offset[seg_index] =
3492					ps_clmap(vs,
3493						 upl_offset_aligned,
3494						 &clmap, CL_ALLOC,
3495						 cl_size, 0);
3496
3497				if (ps_offset[seg_index] == (dp_offset_t) -1) {
3498					upl_abort(upl, 0);
3499					upl_deallocate(upl);
3500
3501					return KERN_FAILURE;
3502				}
3503				psp[seg_index] = CLMAP_PS(clmap);
3504			}
3505			if (transfer_size > seg_size) {
3506				page_index += seg_pgcnt;
3507			        transfer_size -= seg_size;
3508				upl_offset_aligned += cl_size;
3509				seg_size = cl_size;
3510				seg_index++;
3511			} else
3512			        transfer_size = 0;
3513		}
3514		/*
3515		 * Ignore any non-present pages at the end of the
3516		 * UPL.
3517		 */
3518		for (page_index = upl->size / vm_page_size; page_index > 0;)  {
3519			if (UPL_PAGE_PRESENT(pl, --page_index)) {
3520				page_index++;
3521				break;
3522			}
3523		}
3524		if (page_index == 0) {
3525			/*
3526			 * no pages in the UPL
3527			 * abort and return
3528			 */
3529			upl_abort(upl, 0);
3530			upl_deallocate(upl);
3531
3532			return KERN_SUCCESS;
3533		}
3534		num_of_pages = page_index;
3535
3536		base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE;
3537
3538		for (page_index = 0; page_index < num_of_pages; ) {
3539			/*
3540			 * skip over non-dirty pages
3541			 */
3542			for ( ; page_index < num_of_pages; page_index++) {
3543			        if (UPL_DIRTY_PAGE(pl, page_index)
3544					|| UPL_PRECIOUS_PAGE(pl, page_index))
3545				        /*
3546					 * this is a page we need to write
3547					 * go see if we can buddy it up with
3548					 * others that are contiguous to it
3549					 */
3550				        break;
3551				/*
3552				 * if the page is not dirty, but is present, we
3553				 * need to commit it...  This is an unusual
3554				 * case since we only asked for dirty pages
3555				 */
3556				if (UPL_PAGE_PRESENT(pl, page_index)) {
3557					boolean_t empty = FALSE;
3558				        upl_commit_range(upl,
3559						 page_index * vm_page_size,
3560						 vm_page_size,
3561						 UPL_COMMIT_NOTIFY_EMPTY,
3562						 pl,
3563						 page_list_count,
3564						 &empty);
3565					if (empty) {
3566						assert(page_index ==
3567						       num_of_pages - 1);
3568						upl_deallocate(upl);
3569					}
3570				}
3571			}
3572			if (page_index == num_of_pages)
3573			        /*
3574				 * no more pages to look at, we're out of here
3575				 */
3576			        break;
3577
3578			/*
3579			 * gather up contiguous dirty pages... we have at
3580			 * least 1, otherwise we would have bailed above;
3581			 * make sure that each physical segment that we step
3582			 * into is contiguous to the one we're currently in
3583			 * if it's not, we have to stop and write what we have
3584			 */
3585			for (first_dirty = page_index;
3586					page_index < num_of_pages; ) {
3587				if ( !UPL_DIRTY_PAGE(pl, page_index)
3588					&& !UPL_PRECIOUS_PAGE(pl, page_index))
3589				        break;
3590				page_index++;
3591				/*
3592				 * if we just looked at the last page in the UPL
3593				 * we don't need to check for physical segment
3594				 * continuity
3595				 */
3596				if (page_index < num_of_pages) {
3597				        int cur_seg;
3598				        int nxt_seg;
3599
3600				        cur_seg = (base_index + (page_index - 1))/pages_in_cl;
3601					nxt_seg = (base_index + page_index)/pages_in_cl;
3602
3603					if (cur_seg != nxt_seg) {
3604					        if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg]))
3605						/*
3606						 * if the segment we're about
3607						 * to step into is not
3608						 * contiguous to the one we're
3609						 * currently in, or it's in a
3610						 * different paging file....
3611						 * we stop here and generate
3612						 * the I/O
3613						 */
3614						        break;
3615					}
3616				}
3617			}
3618			num_dirty = page_index - first_dirty;
3619
3620			if (num_dirty) {
3621			        upl_offset = first_dirty * vm_page_size;
3622				transfer_size = num_dirty * vm_page_size;
3623
3624				while (transfer_size) {
3625
3626					if ((seg_size = cl_size -
3627						((upl_offset_in_object +
3628						  upl_offset) % cl_size))
3629							> transfer_size)
3630					        seg_size = transfer_size;
3631
3632					ps_vs_write_complete(
3633						vs,
3634						(upl_offset_in_object +
3635						 upl_offset),
3636						seg_size, error);
3637
3638					transfer_size -= seg_size;
3639					upl_offset += seg_size;
3640				}
3641			        upl_offset = first_dirty * vm_page_size;
3642				transfer_size = num_dirty * vm_page_size;
3643
3644			        seg_index  = (base_index + first_dirty) / pages_in_cl;
3645				seg_offset = (upl_offset_in_object + upl_offset) % cl_size;
3646
3647				error = ps_write_file(psp[seg_index],
3648						upl, upl_offset,
3649						ps_offset[seg_index]
3650								+ seg_offset,
3651						transfer_size, flags);
3652			}
3653		}
3654
3655	} else {
3656		assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift));
3657		list_size = cnt;
3658
3659		page_index = 0;
3660		/* The caller provides a mapped_data which is derived  */
3661		/* from a temporary object.  The targeted pages are    */
3662		/* guaranteed to be set at offset 0 in the mapped_data */
3663		/* The actual offset however must still be derived     */
3664		/* from the offset in the vs in question               */
3665		mobj_base_addr = offset;
3666		mobj_target_addr = mobj_base_addr;
3667
3668		for (transfer_size = list_size; transfer_size != 0;) {
3669			actual_offset = ps_clmap(vs, mobj_target_addr,
3670				&clmap, CL_ALLOC,
3671				transfer_size < cl_size ?
3672					transfer_size : cl_size, 0);
3673			if(actual_offset == (dp_offset_t) -1) {
3674				error = 1;
3675				break;
3676			}
3677			cnt = MIN(transfer_size,
3678				  (unsigned) CLMAP_NPGS(clmap) * vm_page_size);
3679			ps = CLMAP_PS(clmap);
3680			/* Assume that the caller has given us contiguous */
3681			/* pages */
3682	 	   	if(cnt) {
3683				ps_vs_write_complete(vs, mobj_target_addr,
3684								cnt, error);
3685				error = ps_write_file(ps, internal_upl,
3686						0, actual_offset,
3687						cnt, flags);
3688				if (error)
3689				        break;
3690		   	   }
3691			if (error)
3692				break;
3693		   	actual_offset += cnt;
3694		   	mobj_target_addr += cnt;
3695			transfer_size -= cnt;
3696		   	cnt = 0;
3697
3698			if (error)
3699				break;
3700		}
3701	}
3702	if(error)
3703		return KERN_FAILURE;
3704	else
3705		return KERN_SUCCESS;
3706}
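/*
 * Illustrative sketch (not kernel code, not compiled): the physical-segment
 * contiguity test used in both pvs_cluster_read() and vs_cluster_write()
 * above to decide whether an I/O run may be extended across a cluster
 * boundary: the next cluster must start exactly cl_size bytes after the
 * previous one and live in the same paging segment.  The parameter names
 * are hypothetical.
 */
#if 0
static int
clusters_contiguous(unsigned long long prev_off, unsigned long long next_off,
		    const void *prev_seg, const void *next_seg,
		    unsigned int cl_size)
{
	return (next_off == prev_off + cl_size) && (prev_seg == next_seg);
}
#endif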
3707
3708vm_size_t
3709ps_vstruct_allocated_size(
3710	vstruct_t	vs)
3711{
3712	int		num_pages;
3713	struct vs_map	*vsmap;
3714	unsigned int	i, j, k;
3715
3716	num_pages = 0;
3717	if (vs->vs_indirect) {
3718		/* loop on indirect maps */
3719		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3720			vsmap = vs->vs_imap[i];
3721			if (vsmap == NULL)
3722				continue;
3723			/* loop on clusters in this indirect map */
3724			for (j = 0; j < CLMAP_ENTRIES; j++) {
3725				if (VSM_ISCLR(vsmap[j]) ||
3726				    VSM_ISERR(vsmap[j]))
3727					continue;
3728				/* loop on pages in this cluster */
3729				for (k = 0; k < VSCLSIZE(vs); k++) {
3730					if ((VSM_BMAP(vsmap[j])) & (1 << k))
3731						num_pages++;
3732				}
3733			}
3734		}
3735	} else {
3736		vsmap = vs->vs_dmap;
3737		if (vsmap == NULL)
3738			return 0;
3739		/* loop on clusters in the direct map */
3740		for (j = 0; j < CLMAP_ENTRIES; j++) {
3741			if (VSM_ISCLR(vsmap[j]) ||
3742			    VSM_ISERR(vsmap[j]))
3743				continue;
3744			/* loop on pages in this cluster */
3745			for (k = 0; k < VSCLSIZE(vs); k++) {
3746				if ((VSM_BMAP(vsmap[j])) & (1 << k))
3747					num_pages++;
3748			}
3749		}
3750	}
3751
3752	return ptoa_32(num_pages);
3753}
3754
3755unsigned int
3756ps_vstruct_allocated_pages(
3757	vstruct_t		vs,
3758	default_pager_page_t	*pages,
3759	unsigned int		pages_size)
3760{
3761	unsigned int	num_pages;
3762	struct vs_map	*vsmap;
3763	dp_offset_t	offset;
3764	unsigned int	i, j, k;
3765
3766	num_pages = 0;
3767	offset = 0;
3768	if (vs->vs_indirect) {
3769		/* loop on indirect maps */
3770		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3771			vsmap = vs->vs_imap[i];
3772			if (vsmap == NULL) {
3773				offset += (vm_page_size * CLMAP_ENTRIES *
3774					   VSCLSIZE(vs));
3775				continue;
3776			}
3777			/* loop on clusters in this indirect map */
3778			for (j = 0; j < CLMAP_ENTRIES; j++) {
3779				if (VSM_ISCLR(vsmap[j]) ||
3780				    VSM_ISERR(vsmap[j])) {
3781					offset += vm_page_size * VSCLSIZE(vs);
3782					continue;
3783				}
3784				/* loop on pages in this cluster */
3785				for (k = 0; k < VSCLSIZE(vs); k++) {
3786					if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3787						num_pages++;
3788						if (num_pages < pages_size)
3789							pages++->dpp_offset =
3790								offset;
3791					}
3792					offset += vm_page_size;
3793				}
3794			}
3795		}
3796	} else {
3797		vsmap = vs->vs_dmap;
3798		if (vsmap == NULL)
3799			return 0;
3800		/* loop on clusters in the direct map */
3801		for (j = 0; j < CLMAP_ENTRIES; j++) {
3802			if (VSM_ISCLR(vsmap[j]) ||
3803			    VSM_ISERR(vsmap[j])) {
3804				offset += vm_page_size * VSCLSIZE(vs);
3805				continue;
3806			}
3807			/* loop on pages in this cluster */
3808			for (k = 0; k < VSCLSIZE(vs); k++) {
3809				if ((VSM_BMAP(vsmap[j])) & (1 << k)) {
3810					num_pages++;
3811					if (num_pages < pages_size)
3812						pages++->dpp_offset = offset;
3813				}
3814				offset += vm_page_size;
3815			}
3816		}
3817	}
3818
3819	return num_pages;
3820}
3821
3822
3823kern_return_t
3824ps_vstruct_transfer_from_segment(
3825	vstruct_t	 vs,
3826	paging_segment_t segment,
3827	upl_t		 upl)
3828{
3829	struct vs_map	*vsmap;
3830//	struct vs_map	old_vsmap;
3831//	struct vs_map	new_vsmap;
3832	unsigned int	i, j;
3833
	VS_LOCK(vs);	/* block all work on this vstruct; we can't allow  */
			/* the normal multiple-write semantic here because */
			/* concurrent writes may conflict                  */
	vs->vs_xfer_pending = TRUE;
	vs_wait_for_sync_writers(vs);
	vs_start_write(vs);
	vs_wait_for_readers(vs);
	/*
	 * We will unlock the vs to allow other writes while transferring,
	 * and are guaranteed the persistence of the vs structure because
	 * the caller of ps_vstruct_transfer_from_segment bumped
	 * vs_async_pending.
	 *
	 * At this point no other parties are accessing this vs.  Since we
	 * now also support simple-lock versions of vs_lock, we cannot hold
	 * VS_LOCK across the code below, which may block; holding it was
	 * only needed for the multiple-write case, and the boolean
	 * vs_xfer_pending now serves that purpose.  A boolean (rather than
	 * a count) suffices because the caller guarantees single-file
	 * access to this code.
	 */
3852	VS_UNLOCK(vs);
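	/*
	 * The per-cluster handshake used below, in outline: after each
	 * successful vs_cluster_transfer() the code clears vs_xfer_pending
	 * and calls vs_finish_write() so other readers/writers can proceed,
	 * then re-asserts vs_xfer_pending, waits for sync writers, restarts
	 * the write and drains readers before touching the maps again.  If
	 * the vstruct switched between its direct and indirect maps while
	 * unlocked, the loop restarts at the "vs_changed" label.
	 */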
3853vs_changed:
3854	if (vs->vs_indirect) {
3855		unsigned int	vsmap_size;
3856		int		clmap_off;
3857		/* loop on indirect maps */
3858		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
3859			vsmap = vs->vs_imap[i];
3860			if (vsmap == NULL)
3861				continue;
3862			/* loop on clusters in this indirect map */
3863			clmap_off = (vm_page_size * CLMAP_ENTRIES *
3864					   VSCLSIZE(vs) * i);
3865			if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
3866				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
3867			else
3868				vsmap_size = CLMAP_ENTRIES;
3869			for (j = 0; j < vsmap_size; j++) {
3870				if (VSM_ISCLR(vsmap[j]) ||
3871				    VSM_ISERR(vsmap[j]) ||
3872				    (VSM_PS(vsmap[j]) != segment))
3873					continue;
3874				if(vs_cluster_transfer(vs,
3875					(vm_page_size * (j << vs->vs_clshift))
3876					+ clmap_off,
3877					vm_page_size << vs->vs_clshift,
3878					upl)
3879						!= KERN_SUCCESS) {
3880				   VS_LOCK(vs);
3881				   vs->vs_xfer_pending = FALSE;
3882				   VS_UNLOCK(vs);
3883				   vs_finish_write(vs);
3884				   return KERN_FAILURE;
3885				}
3886				/* allow other readers/writers during transfer*/
3887				VS_LOCK(vs);
3888				vs->vs_xfer_pending = FALSE;
3889				VS_UNLOCK(vs);
3890				vs_finish_write(vs);
3891
3892				if (backing_store_abort_compaction || backing_store_stop_compaction) {
3893					backing_store_abort_compaction = FALSE;
3894					dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n"));
3895					return KERN_FAILURE;
3896				}
3897				vnode_pager_throttle();
3898
3899				VS_LOCK(vs);
3900				vs->vs_xfer_pending = TRUE;
3901				vs_wait_for_sync_writers(vs);
3902				vs_start_write(vs);
3903				vs_wait_for_readers(vs);
3904				VS_UNLOCK(vs);
3905				if (!(vs->vs_indirect)) {
3906					goto vs_changed;
3907				}
3908			}
3909		}
3910	} else {
3911		vsmap = vs->vs_dmap;
3912		if (vsmap == NULL) {
3913			VS_LOCK(vs);
3914			vs->vs_xfer_pending = FALSE;
3915			VS_UNLOCK(vs);
3916			vs_finish_write(vs);
3917			return KERN_SUCCESS;
3918		}
3919		/* loop on clusters in the direct map */
3920		for (j = 0; j < vs->vs_size; j++) {
3921			if (VSM_ISCLR(vsmap[j]) ||
3922			    VSM_ISERR(vsmap[j]) ||
3923			    (VSM_PS(vsmap[j]) != segment))
3924				continue;
3925			if(vs_cluster_transfer(vs,
3926				vm_page_size * (j << vs->vs_clshift),
3927				vm_page_size << vs->vs_clshift,
3928				upl) != KERN_SUCCESS) {
3929			   VS_LOCK(vs);
3930			   vs->vs_xfer_pending = FALSE;
3931			   VS_UNLOCK(vs);
3932			   vs_finish_write(vs);
3933			   return KERN_FAILURE;
3934			}
3935			/* allow other readers/writers during transfer*/
3936			VS_LOCK(vs);
3937			vs->vs_xfer_pending = FALSE;
3938			VS_UNLOCK(vs);
3939			vs_finish_write(vs);
3940			VS_LOCK(vs);
3941			vs->vs_xfer_pending = TRUE;
3942			vs_wait_for_sync_writers(vs);
3943			vs_start_write(vs);
3944			vs_wait_for_readers(vs);
3945			VS_UNLOCK(vs);
3946			if (vs->vs_indirect) {
3947				goto vs_changed;
3948			}
3949		}
3950	}
3951
3952	VS_LOCK(vs);
3953	vs->vs_xfer_pending = FALSE;
3954	VS_UNLOCK(vs);
3955	vs_finish_write(vs);
3956	return KERN_SUCCESS;
3957}
3958
3959
3960
3961vs_map_t
3962vs_get_map_entry(
3963	vstruct_t	vs,
3964	dp_offset_t	offset)
3965{
3966	struct vs_map	*vsmap;
3967	dp_offset_t	cluster;
3968
3969	cluster = atop_32(offset) >> vs->vs_clshift;
3970	if (vs->vs_indirect) {
3971		long	ind_block = cluster/CLMAP_ENTRIES;
3972
3973		/* Is the indirect block allocated? */
3974		vsmap = vs->vs_imap[ind_block];
3975		if(vsmap == (vs_map_t) NULL)
3976			return vsmap;
3977	} else
3978		vsmap = vs->vs_dmap;
3979	vsmap += cluster%CLMAP_ENTRIES;
3980	return vsmap;
3981}
3982
3983kern_return_t
3984vs_cluster_transfer(
3985	vstruct_t	vs,
3986	dp_offset_t	offset,
3987	dp_size_t	cnt,
3988	upl_t		upl)
3989{
3990	dp_offset_t		actual_offset;
3991	paging_segment_t	ps;
3992	struct clmap		clmap;
3993	kern_return_t		error = KERN_SUCCESS;
3994	unsigned int		size, size_wanted;
3995	int			i;
3996	unsigned int		residual = 0;
3997	unsigned int		unavail_size;
3998//	default_pager_thread_t	*dpt;
3999//	boolean_t		dealloc;
4000	struct	vs_map		*vsmap_ptr = NULL;
4001	struct	vs_map		read_vsmap;
4002	struct	vs_map		original_read_vsmap;
4003	struct	vs_map		write_vsmap;
4004//	upl_t				sync_upl;
4005//	vm_offset_t			ioaddr;
4006
	/*
	 * vs_cluster_transfer reads in the pages of a cluster and
	 * then writes these pages back to new backing store.  The
	 * segment the pages are being read from is assumed to have
	 * been taken off-line and is no longer considered for new
	 * space requests.
	 */
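	/*
	 * For reference, the caller (ps_vstruct_transfer_from_segment)
	 * normally invokes this routine one cluster at a time, e.g.
	 *
	 *     vs_cluster_transfer(vs,
	 *         vm_page_size * (j << vs->vs_clshift),  -- offset of cluster j
	 *         vm_page_size << vs->vs_clshift,        -- one full cluster
	 *         upl);
	 *
	 * so "cnt" is typically exactly one cluster and "offset" is
	 * cluster aligned.
	 */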
4013
4014	/*
4015	 * This loop will be executed once per cluster referenced.
4016	 * Typically this means once, since it's unlikely that the
4017	 * VM system will ask for anything spanning cluster boundaries.
4018	 *
4019	 * If there are holes in a cluster (in a paging segment), we stop
4020	 * reading at the hole, then loop again, hoping to
4021	 * find valid pages later in the cluster.  This continues until
4022	 * the entire range has been examined, and read, if present.  The
4023	 * pages are written as they are read.  If a failure occurs after
4024	 * some pages are written the unmap call at the bottom of the loop
4025	 * recovers the backing store and the old backing store remains
4026	 * in effect.
4027	 */
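	/*
	 * Worked example of the cluster arithmetic used below (illustrative
	 * values only; assumes vm_page_size == 4096 and vs->vs_clshift == 2):
	 *
	 *     clsize = vm_page_size << vs_clshift = 16384 bytes (4 pages)
	 *     clmask = clsize - 1                 = 0x3fff
	 *     offset & clmask                     = byte offset within cluster
	 *
	 * so a transfer positioned at offset 0x5000 has 0x3000 bytes left in
	 * its cluster, and the cluster is considered finished once
	 * (offset & clmask) wraps back to zero.
	 */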
4028
4029	VSM_CLR(write_vsmap);
4030	VSM_CLR(original_read_vsmap);
4031	/* grab the actual object's pages to sync with I/O */
4032	while (cnt && (error == KERN_SUCCESS)) {
4033		vsmap_ptr = vs_get_map_entry(vs, offset);
4034		actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0);
4035
4036		if (actual_offset == (dp_offset_t) -1) {
4037
			/*
			 * Nothing left to write in this cluster; at least
			 * set the write cluster information for any previous
			 * write and clear it for the next cluster, if there
			 * is one.
			 */
4043			unsigned int local_size, clmask, clsize;
4044
4045			clsize = vm_page_size << vs->vs_clshift;
4046			clmask = clsize - 1;
4047			local_size = clsize - (offset & clmask);
4048			ASSERT(local_size);
4049			local_size = MIN(local_size, cnt);
4050
			/*
			 * This cluster has no data in it beyond what may
			 * have been found on a previous iteration through
			 * the loop and recorded in "write_vsmap".
			 */
4054			*vsmap_ptr = write_vsmap;
4055			VSM_CLR(write_vsmap);
4056			VSM_CLR(original_read_vsmap);
4057
4058			cnt -= local_size;
4059			offset += local_size;
4060			continue;
4061		}
4062
4063		/*
4064		 * Count up contiguous available or unavailable
4065		 * pages.
4066		 */
4067		ps = CLMAP_PS(clmap);
4068		ASSERT(ps);
4069		size = 0;
4070		unavail_size = 0;
4071		for (i = 0;
4072		     (size < cnt) && (unavail_size < cnt) &&
4073		     (i < CLMAP_NPGS(clmap)); i++) {
4074			if (CLMAP_ISSET(clmap, i)) {
4075				if (unavail_size != 0)
4076					break;
4077				size += vm_page_size;
4078				BS_STAT(ps->ps_bs,
4079					ps->ps_bs->bs_pages_in++);
4080			} else {
4081				if (size != 0)
4082					break;
4083				unavail_size += vm_page_size;
4084			}
4085		}
4086
4087		if (size == 0) {
4088			ASSERT(unavail_size);
4089			ps_clunmap(vs, offset, unavail_size);
4090			cnt -= unavail_size;
4091			offset += unavail_size;
			if ((offset & ((vm_page_size << vs->vs_clshift) - 1))
			    == 0) {
				/*
				 * There is no more to transfer in this
				 * cluster.
				 */
4097				*vsmap_ptr = write_vsmap;
4098				VSM_CLR(write_vsmap);
4099				VSM_CLR(original_read_vsmap);
4100			}
4101			continue;
4102		}
4103
4104		if(VSM_ISCLR(original_read_vsmap))
4105			original_read_vsmap = *vsmap_ptr;
4106
4107		if(ps->ps_segtype == PS_PARTITION) {
4108			panic("swap partition not supported\n");
4109			/*NOTREACHED*/
4110			error = KERN_FAILURE;
4111			residual = size;
4112/*
4113			NEED TO ISSUE WITH SYNC & NO COMMIT
4114			error = ps_read_device(ps, actual_offset, &buffer,
4115				       size, &residual, flags);
4116*/
4117		} else {
4118			/* NEED TO ISSUE WITH SYNC & NO COMMIT */
4119			error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
4120					size, &residual,
4121					(UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0)));
4122		}
4123
4124		read_vsmap = *vsmap_ptr;
4125
4126
4127		/*
4128		 * Adjust counts and put data in new BS.  Optimize for the
4129		 * common case, i.e. no error and/or partial data.
4130		 * If there was an error, then we need to error the entire
4131		 * range, even if some data was successfully read.
4132		 *
4133		 */
4134		if ((error == KERN_SUCCESS) && (residual == 0)) {
4135
4136			/*
4137			 * Got everything we asked for, supply the data to
4138			 * the new BS.  Note that as a side effect of supplying
4139			 * the data, the buffer holding the supplied data is
4140			 * deallocated from the pager's address space unless
4141			 * the write is unsuccessful.
4142			 */
4143
			/*
			 * Note: the buffer is cleaned up in all cases, either
			 * by internal_cluster_write or, if the write fails,
			 * by the vm_map_copy_page_discard call.
			 */
4147			*vsmap_ptr = write_vsmap;
4148
4149			if(vs_cluster_write(vs, upl, offset,
4150					size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT ) != KERN_SUCCESS) {
4151			 	error = KERN_FAILURE;
4152				if(!(VSM_ISCLR(*vsmap_ptr))) {
4153					/* unmap the new backing store object */
4154					ps_clunmap(vs, offset, size);
4155				}
4156				/* original vsmap */
4157				*vsmap_ptr = original_read_vsmap;
4158				VSM_CLR(write_vsmap);
4159			} else {
				if ((offset + size) &
				    ((vm_page_size << vs->vs_clshift) - 1)) {
					/*
					 * There is more to transfer in this
					 * cluster.
					 */
4166					write_vsmap = *vsmap_ptr;
4167					*vsmap_ptr = read_vsmap;
4168					ps_clunmap(vs, offset, size);
4169				} else {
4170					/* discard the old backing object */
4171					write_vsmap = *vsmap_ptr;
4172					*vsmap_ptr = read_vsmap;
4173					ps_clunmap(vs, offset, size);
4174					*vsmap_ptr = write_vsmap;
4175					VSM_CLR(write_vsmap);
4176					VSM_CLR(original_read_vsmap);
4177				}
4178			}
4179		} else {
4180			size_wanted = size;
4181			if (error == KERN_SUCCESS) {
4182				if (residual == size) {
4183					/*
4184					 * If a read operation returns no error
4185					 * and no data moved, we turn it into
4186					 * an error, assuming we're reading at
4187					 * or beyond EOF.
4188					 * Fall through and error the entire
4189					 * range.
4190					 */
4191					error = KERN_FAILURE;
4192					*vsmap_ptr = write_vsmap;
					if (!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
4197					*vsmap_ptr = original_read_vsmap;
4198					VSM_CLR(write_vsmap);
4199					continue;
4200				} else {
					/*
					 * Otherwise, we have a partial read;
					 * this is also considered an error
					 * for the purposes of cluster
					 * transfer.
					 */
					error = KERN_FAILURE;
					*vsmap_ptr = write_vsmap;
					if (!(VSM_ISCLR(*vsmap_ptr))) {
						/* unmap the new backing store object */
						ps_clunmap(vs, offset, size);
					}
4212					*vsmap_ptr = original_read_vsmap;
4213					VSM_CLR(write_vsmap);
4214					continue;
4215				}
4216			}
4217
4218		}
4219		cnt -= size;
4220		offset += size;
4221
4222	} /* END while (cnt && (error == 0)) */
4223	if(!VSM_ISCLR(write_vsmap))
4224		*vsmap_ptr = write_vsmap;
4225
4226	return error;
4227}
4228
4229kern_return_t
4230default_pager_add_file(
4231	MACH_PORT_FACE	backing_store,
4232	vnode_ptr_t	vp,
4233	int		record_size,
4234	vm_size_t	size)
4235{
4236	backing_store_t		bs;
4237	paging_segment_t	ps;
4238	int			i;
4239	unsigned int		j;
4240	int			error;
4241
4242	if ((bs = backing_store_lookup(backing_store))
4243	    == BACKING_STORE_NULL)
4244		return KERN_INVALID_ARGUMENT;
4245
4246	PSL_LOCK();
4247	for (i = 0; i <= paging_segment_max; i++) {
4248		ps = paging_segments[i];
4249		if (ps == PAGING_SEGMENT_NULL)
4250			continue;
4251		if (ps->ps_segtype != PS_FILE)
4252			continue;
4253
4254		/*
4255		 * Check for overlap on same device.
4256		 */
4257		if (ps->ps_vnode == (struct vnode *)vp) {
4258			PSL_UNLOCK();
4259			BS_UNLOCK(bs);
4260			return KERN_INVALID_ARGUMENT;
4261		}
4262	}
4263	PSL_UNLOCK();
4264
4265	/*
4266	 * Set up the paging segment
4267	 */
4268	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
4269	if (ps == PAGING_SEGMENT_NULL) {
4270		BS_UNLOCK(bs);
4271		return KERN_RESOURCE_SHORTAGE;
4272	}
4273
4274	ps->ps_segtype = PS_FILE;
4275	ps->ps_vnode = (struct vnode *)vp;
4276	ps->ps_offset = 0;
4277	ps->ps_record_shift = local_log2(vm_page_size / record_size);
4278	assert((dp_size_t) size == size);
4279	ps->ps_recnum = (dp_size_t) size;
4280	ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;
4281
4282	ps->ps_pgcount = ps->ps_pgnum;
4283	ps->ps_clshift = local_log2(bs->bs_clsize);
4284	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
4285	ps->ps_special_clusters = 0;
4286	ps->ps_hint = 0;
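	/*
	 * Example of the geometry computed above (illustrative values only;
	 * assumes vm_page_size == 4096, record_size == 512 and a backing
	 * store cluster size of 8 pages):
	 *
	 *     ps_record_shift = log2(4096 / 512) = 3   (8 records per page)
	 *     ps_pgnum        = size >> 3
	 *     ps_clshift      = log2(8) = 3            (8 pages per cluster)
	 *     ps_clcount      = ps_pgcount >> 3
	 */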
4287
4288	PS_LOCK_INIT(ps);
4289	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
4290	if (!ps->ps_bmap) {
4291		PS_LOCK_DESTROY(ps);
4292		kfree(ps, sizeof *ps);
4293		BS_UNLOCK(bs);
4294		return KERN_RESOURCE_SHORTAGE;
4295	}
4296	for (j = 0; j < ps->ps_ncls; j++) {
4297		clrbit(ps->ps_bmap, j);
4298	}
4299
4300	if(paging_segment_count == 0) {
4301		ps->ps_state = PS_EMERGENCY_SEGMENT;
4302		if(use_emergency_swap_file_first) {
4303			ps->ps_state |= PS_CAN_USE;
4304		}
4305		emergency_segment_backing_store = backing_store;
4306	} else {
4307		ps->ps_state = PS_CAN_USE;
4308	}
4309
4310	ps->ps_bs = bs;
4311
4312	if ((error = ps_enter(ps)) != 0) {
4313		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
4314		PS_LOCK_DESTROY(ps);
4315		kfree(ps, sizeof *ps);
4316		BS_UNLOCK(bs);
4317		return KERN_RESOURCE_SHORTAGE;
4318	}
4319
4320	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
4321	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
4322	PSL_LOCK();
4323	if(IS_PS_OK_TO_USE(ps)) {
4324		dp_pages_free += ps->ps_pgcount;
4325	} else {
4326		dp_pages_reserve += ps->ps_pgcount;
4327	}
4328	PSL_UNLOCK();
4329
4330	BS_UNLOCK(bs);
4331
4332	bs_more_space(ps->ps_clcount);
4333
4334	/*
4335	 * If the paging segment being activated is not the emergency
4336	 * segment and we notice that the emergency segment is being
4337	 * used then we help recover it. If all goes well, the
4338	 * emergency segment will be back to its original state of
4339	 * online but not activated (till it's needed the next time).
4340	 */
4341#if CONFIG_FREEZE
4342	if (!memorystatus_freeze_enabled)
4343#endif
4344	{
4345		ps = paging_segments[EMERGENCY_PSEG_INDEX];
4346		if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
4347			if(default_pager_backing_store_delete(emergency_segment_backing_store)) {
4348				dprintf(("Failed to recover emergency paging segment\n"));
4349			} else {
4350				dprintf(("Recovered emergency paging segment\n"));
4351			}
4352		}
4353	}
4354
	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("vp=%p,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  (void *) vp, ps->ps_offset, (dp_size_t) size, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));
4359
4360	return KERN_SUCCESS;
4361}
4362
4363
4364
4365kern_return_t
4366ps_read_file(
4367	paging_segment_t	ps,
4368	upl_t			upl,
4369	upl_offset_t		upl_offset,
4370	dp_offset_t		offset,
4371	upl_size_t		size,
4372	unsigned int		*residualp,
4373	int			flags)
4374{
4375	vm_object_offset_t	f_offset;
4376	int			error = 0;
4377	int			result;
4378
4379	assert(dp_encryption_inited);
4380
4381	clustered_reads[atop_32(size)]++;
4382
4383	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4384
4385	/*
4386	 * for transfer case we need to pass uploffset and flags
4387	 */
4388	assert((upl_size_t) size == size);
4389	error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);
4390
	/*
	 * The vnode_pagein semantic is somewhat at odds with the existing
	 * device_read semantic: partial reads do not occur at this level.
	 * It is up to the bitmap code and the cluster read code to check
	 * that the requested data locations are actually backed, and up to
	 * the pagein code to either read all of the requested data or
	 * return an error.
	 */
4397
4398	if (error)
4399		result = KERN_FAILURE;
4400	else {
4401		*residualp = 0;
4402		result = KERN_SUCCESS;
4403	}
4404	return result;
4405}
4406
4407kern_return_t
4408ps_write_file(
4409	paging_segment_t	ps,
4410	upl_t                   upl,
4411	upl_offset_t		upl_offset,
4412	dp_offset_t		offset,
4413	unsigned int		size,
4414	int			flags)
4415{
4416	vm_object_offset_t	f_offset;
4417	kern_return_t		result;
4418
4419	assert(dp_encryption_inited);
4420
4421	clustered_writes[atop_32(size)]++;
4422	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);
4423
4424	if (flags & UPL_PAGING_ENCRYPTED) {
4425		/*
4426		 * ENCRYPTED SWAP:
		 * encrypt all the pages that we're going
		 * to page out.
4429		 */
4430		upl_encrypt(upl, upl_offset, size);
4431	}
4432	assert((upl_size_t) size == size);
4433	if (vnode_pageout(ps->ps_vnode,	upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
4434	        result = KERN_FAILURE;
4435	else
4436	        result = KERN_SUCCESS;
4437
4438	return result;
4439}
4440
4441static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
4442{
4443#pragma unused(data)
4444}
4445
4446static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
4447{
4448#pragma unused(data)
4449}
4450
4451static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
4452{
4453#pragma unused(data, map, shift, length)
4454}
4455
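/*
 * Typical usage (illustrative): the user-level dynamic pager registers its
 * notification ports through the macx_triggers() trap, which is routed to
 * this routine, e.g. one call for each watermark:
 *
 *     macx_triggers(hi_water, low_water, HI_WAT_ALERT, alert_port);
 *     macx_triggers(hi_water, low_water, LO_WAT_ALERT, alert_port);
 *
 * Note that SWAP_ENCRYPT_ON/SWAP_ENCRYPT_OFF can only be established once;
 * a later attempt fails because dp_encryption_inited is already set.
 */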
4456kern_return_t
4457default_pager_triggers( __unused MACH_PORT_FACE default_pager,
4458	int		hi_wat,
4459	int		lo_wat,
4460	int		flags,
4461	MACH_PORT_FACE  trigger_port)
4462{
4463	MACH_PORT_FACE release = IPC_PORT_NULL;
4464	kern_return_t kr;
4465	clock_sec_t now;
4466	clock_nsec_t nanoseconds_dummy;
4467	static clock_sec_t error_notify = 0;
4468
4469	PSL_LOCK();
4470	if (flags == SWAP_ENCRYPT_ON) {
4471		/* ENCRYPTED SWAP: turn encryption on */
4472		release = trigger_port;
4473		if (!dp_encryption_inited) {
4474			dp_encryption_inited = TRUE;
4475			dp_encryption = TRUE;
4476			kr = KERN_SUCCESS;
4477		} else {
4478			kr = KERN_FAILURE;
4479		}
4480	} else if (flags == SWAP_ENCRYPT_OFF) {
4481		/* ENCRYPTED SWAP: turn encryption off */
4482		release = trigger_port;
4483		if (!dp_encryption_inited) {
4484			dp_encryption_inited = TRUE;
4485			dp_encryption = FALSE;
4486			kr = KERN_SUCCESS;
4487		} else {
4488			kr = KERN_FAILURE;
4489		}
4490	} else if (flags == HI_WAT_ALERT) {
4491		release = min_pages_trigger_port;
4492#if CONFIG_FREEZE
4493		/* High and low water signals aren't applicable when freeze is */
4494		/* enabled, so release the trigger ports here and return       */
4495		/* KERN_FAILURE.                                               */
4496		if (memorystatus_freeze_enabled) {
4497			if (IP_VALID( trigger_port )){
4498				ipc_port_release_send( trigger_port );
4499			}
4500			min_pages_trigger_port = IPC_PORT_NULL;
4501			kr = KERN_FAILURE;
4502		}
4503		else
4504#endif
4505		{
4506			min_pages_trigger_port = trigger_port;
4507			minimum_pages_remaining = hi_wat/vm_page_size;
4508			bs_low = FALSE;
4509			kr = KERN_SUCCESS;
4510		}
4511	} else if (flags ==  LO_WAT_ALERT) {
4512		release = max_pages_trigger_port;
4513#if CONFIG_FREEZE
4514		if (memorystatus_freeze_enabled) {
4515			if (IP_VALID( trigger_port )){
4516				ipc_port_release_send( trigger_port );
4517			}
4518			max_pages_trigger_port = IPC_PORT_NULL;
4519			kr = KERN_FAILURE;
4520		}
4521		else
4522#endif
4523		{
4524			max_pages_trigger_port = trigger_port;
4525			maximum_pages_free = lo_wat/vm_page_size;
4526			kr = KERN_SUCCESS;
4527		}
4528	} else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
4529		use_emergency_swap_file_first = TRUE;
4530		release = trigger_port;
4531		kr = KERN_SUCCESS;
4532	} else if (flags == SWAP_FILE_CREATION_ERROR) {
4533		release = trigger_port;
4534		kr = KERN_SUCCESS;
4535		if( paging_segment_count == 1) {
4536			use_emergency_swap_file_first = TRUE;
4537		}
4538		no_paging_space_action();
4539		clock_get_system_nanotime(&now, &nanoseconds_dummy);
4540		if (now > error_notify + 5) {
4541			dprintf(("Swap File Error.\n"));
4542			error_notify = now;
4543		}
4544	} else {
4545		release = trigger_port;
4546		kr =  KERN_INVALID_ARGUMENT;
4547	}
4548	PSL_UNLOCK();
4549
4550	if (IP_VALID(release))
4551		ipc_port_release_send(release);
4552
4553	return kr;
4554}
4555
4556/*
4557 * Monitor the amount of available backing store vs. the amount of
4558 * required backing store, notify a listener (if present) when
4559 * backing store may safely be removed.
4560 *
4561 * We attempt to avoid the situation where backing store is
4562 * discarded en masse, as this can lead to thrashing as the
4563 * backing store is compacted.
4564 */
4565
4566#define PF_INTERVAL	3	/* time between free level checks */
4567#define PF_LATENCY	10	/* number of intervals before release */
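/*
 * With the values above, dp_pages_free must stay above maximum_pages_free
 * for more than PF_LATENCY consecutive checks, i.e. on the order of
 * PF_INTERVAL * PF_LATENCY = 30 seconds, before a LO_WAT_ALERT is sent to
 * the registered trigger port.
 */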
4568
4569static int dp_pages_free_low_count = 0;
4570thread_call_t default_pager_backing_store_monitor_callout;
4571
4572void
4573default_pager_backing_store_monitor(__unused thread_call_param_t p1,
4574									__unused thread_call_param_t p2)
4575{
4576//	unsigned long long	average;
4577	ipc_port_t		trigger;
4578	uint64_t		deadline;
4579
4580	/*
4581	 * We determine whether it will be safe to release some
4582	 * backing store by watching the free page level.  If
 * it remains above the maximum_pages_free threshold for
 * at least PF_LATENCY checks (taken every PF_INTERVAL seconds)
4585	 * then we deem it safe.
4586	 *
4587	 * Note that this establishes a maximum rate at which backing
4588	 * store will be released, as each notification (currently)
4589	 * only results in a single backing store object being
4590	 * released.
4591	 */
4592	if (dp_pages_free > maximum_pages_free) {
4593		dp_pages_free_low_count++;
4594	} else {
4595		dp_pages_free_low_count = 0;
4596	}
4597
4598	/* decide whether to send notification */
4599	trigger = IP_NULL;
4600	if (max_pages_trigger_port &&
4601	    (backing_store_release_trigger_disable == 0) &&
4602	    (dp_pages_free_low_count > PF_LATENCY)) {
4603		trigger = max_pages_trigger_port;
4604		max_pages_trigger_port = NULL;
4605	}
4606
4607	/* send notification */
4608	if (trigger != IP_NULL) {
4609		VSL_LOCK();
4610		if(backing_store_release_trigger_disable != 0) {
4611			assert_wait((event_t)
4612				    &backing_store_release_trigger_disable,
4613				    THREAD_UNINT);
4614			VSL_UNLOCK();
4615			thread_block(THREAD_CONTINUE_NULL);
4616		} else {
4617			VSL_UNLOCK();
4618		}
4619		dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));
4620
4621		default_pager_space_alert(trigger, LO_WAT_ALERT);
4622		ipc_port_release_send(trigger);
4623		dp_pages_free_low_count = 0;
4624	}
4625
4626	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
4627	thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
4628}
4629
4630#if CONFIG_FREEZE
unsigned int
default_pager_swap_pages_free(void)
{
	return dp_pages_free;
}
4634#endif
4635