/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 * Default Pager.
 *	Paging File Management.
 */

#include <mach/host_priv.h>
#include <mach/memory_object_control.h>
#include <mach/memory_object_server.h>
#include <mach/upl.h>
#include <default_pager/default_pager_internal.h>
#include <default_pager/default_pager_alerts.h>
#include <default_pager/default_pager_object_server.h>

#include <ipc/ipc_types.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_space.h>

#include <kern/kern_types.h>
#include <kern/host.h>
#include <kern/queue.h>
#include <kern/counters.h>
#include <kern/sched_prim.h>

#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_protos.h>


/* todo - need large internal object support */

/*
 * ALLOC_STRIDE... the maximum number of bytes allocated from
 * a swap file before moving on to the next swap file... if
 * all swap files reside on a single disk, this value should
 * be very large (this is the default assumption)... if the
 * swap files are spread across multiple disks, then this value
 * should be small (128 * 1024)...
 *
 * This should be determined dynamically in the future
 */

#define ALLOC_STRIDE	(1024 * 1024 * 1024)
int physical_transfer_cluster_count = 0;
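/*
 * Illustrative arithmetic (not in the original source): with 4K pages
 * (vm_page_shift == 12) and the default cluster shift of 2 (16K clusters),
 * ps_select_segment() below moves to the next segment after
 * ALLOC_STRIDE >> (ps_clshift + vm_page_shift) = 2^30 >> 14 = 65536
 * clusters, i.e. after one gigabyte has been written to a segment.
 */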

#define VM_SUPER_CLUSTER	0x40000
#define VM_SUPER_PAGES		(VM_SUPER_CLUSTER / PAGE_MIN_SIZE)

/*
 * 0 means no shift to pages, so == 1 page/cluster. 1 would mean
 * 2 pages/cluster, 2 means 4 pages/cluster, and so on.
 */
#define VSTRUCT_MIN_CLSHIFT	0

#define VSTRUCT_DEF_CLSHIFT	2
int default_pager_clsize = 0;

int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT;
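/*
 * Worked example (illustrative): with vstruct_def_clshift == 2, a cluster
 * holds 1 << 2 = 4 pages, i.e. 16K with 4K pages.  bs_set_default_clsize()
 * below accepts cluster sizes of 1, 2, 4 or 8 pages (shifts 0 through 3).
 */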

/* statistics */
unsigned int clustered_writes[VM_SUPER_PAGES+1];
unsigned int clustered_reads[VM_SUPER_PAGES+1];

/*
 * Globals used for asynchronous paging operations:
 *	vs_async_list: head of list of to-be-completed I/O ops
 *	async_num_queued: number of pages completed, but not yet
 *		processed by async thread.
 *	async_requests_out: number of pages of requests not completed.
 */

#if 0
struct vs_async *vs_async_list;
int	async_num_queued;
int	async_requests_out;
#endif


#define VS_ASYNC_REUSE 1
struct vs_async *vs_async_free_list;

lck_mtx_t	default_pager_async_lock;	/* Protects globals above */


int vs_alloc_async_failed = 0;			/* statistics */
int vs_alloc_async_count = 0;			/* statistics */
struct vs_async *vs_alloc_async(void);		/* forward */
void vs_free_async(struct vs_async *vsa);	/* forward */


#define VS_ALLOC_ASYNC()	vs_alloc_async()
#define VS_FREE_ASYNC(vsa)	vs_free_async(vsa)

#define VS_ASYNC_LOCK()		lck_mtx_lock(&default_pager_async_lock)
#define VS_ASYNC_UNLOCK()	lck_mtx_unlock(&default_pager_async_lock)
#define VS_ASYNC_LOCK_INIT()	lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr)
#define VS_ASYNC_LOCK_DESTROY()	lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp)
#define VS_ASYNC_LOCK_ADDR()	(&default_pager_async_lock)

/*
 * Paging Space Hysteresis triggers and the target notification port
 */
unsigned int	dp_pages_free_drift_count = 0;
unsigned int	dp_pages_free_drifted_max = 0;
unsigned int	minimum_pages_remaining = 0;
unsigned int	maximum_pages_free = 0;
ipc_port_t	min_pages_trigger_port = NULL;
ipc_port_t	max_pages_trigger_port = NULL;

#if CONFIG_FREEZE
boolean_t	use_emergency_swap_file_first = TRUE;
#else
boolean_t	use_emergency_swap_file_first = FALSE;
#endif
boolean_t	bs_low = FALSE;
int		backing_store_release_trigger_disable = 0;
boolean_t	backing_store_stop_compaction = FALSE;
boolean_t	backing_store_abort_compaction = FALSE;

/* Have we decided if swap needs to be encrypted yet ? */
boolean_t	dp_encryption_inited = FALSE;
/* Should we encrypt swap ? */
boolean_t	dp_encryption = FALSE;

boolean_t	dp_isssd = FALSE;

/*
 * Object sizes are rounded up to the next power of 2,
 * unless they are bigger than a given maximum size.
 */
vm_size_t	max_doubled_size = 4 * 1024 * 1024;	/* 4 meg */

/*
 * List of all backing store and segments.
 */
MACH_PORT_FACE		emergency_segment_backing_store;
struct backing_store_list_head backing_store_list;
paging_segment_t	paging_segments[MAX_NUM_PAGING_SEGMENTS];
lck_mtx_t		paging_segments_lock;
int			paging_segment_max = 0;
int			paging_segment_count = 0;
int			ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 };


/*
 * Total pages free in system
 * This differs from clusters committed/avail which is a measure of the
 * over commitment of paging segments to backing store.  An idea which is
 * likely to be deprecated.
 */
unsigned int	dp_pages_free = 0;
unsigned int	dp_pages_reserve = 0;
unsigned int	cluster_transfer_minimum = 100;

/*
 * Trim state
 */
struct ps_vnode_trim_data {
	struct vnode	*vp;
	dp_offset_t	offset;
	dp_size_t	length;
};

/* forward declarations */
kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int);	/* forward */
kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int);	/* forward */
default_pager_thread_t *get_read_buffer( void );
kern_return_t ps_vstruct_transfer_from_segment(
	vstruct_t	 vs,
	paging_segment_t segment,
	upl_t		 upl);
kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int);	/* forward */
kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *);	/* forward */
kern_return_t vs_cluster_transfer(
	vstruct_t	vs,
	dp_offset_t	offset,
	dp_size_t	cnt,
	upl_t		upl);
vs_map_t vs_get_map_entry(
	vstruct_t	vs,
	dp_offset_t	offset);

kern_return_t
default_pager_backing_store_delete_internal( MACH_PORT_FACE );

static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data);
static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data);
static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length);

default_pager_thread_t *
get_read_buffer( void )
{
	int	i;

	DPT_LOCK(dpt_lock);
	while(TRUE) {
		for (i=0; i<default_pager_internal_count; i++) {
			if(dpt_array[i]->checked_out == FALSE) {
				dpt_array[i]->checked_out = TRUE;
				DPT_UNLOCK(dpt_lock);
				return dpt_array[i];
			}
		}
		DPT_SLEEP(dpt_lock, &dpt_array, THREAD_UNINT);
	}
}

void
bs_initialize(void)
{
	int	i;

	/*
	 * List of all backing store.
	 */
	BSL_LOCK_INIT();
	queue_init(&backing_store_list.bsl_queue);
	PSL_LOCK_INIT();

	VS_ASYNC_LOCK_INIT();
#if	VS_ASYNC_REUSE
	vs_async_free_list = NULL;
#endif	/* VS_ASYNC_REUSE */

	for (i = 0; i < VM_SUPER_PAGES + 1; i++) {
		clustered_writes[i] = 0;
		clustered_reads[i] = 0;
	}

}

/*
 * When things do not quite work out...
 */
void bs_no_paging_space(boolean_t);	/* forward */

void
bs_no_paging_space(
	boolean_t out_of_memory)
{

	if (out_of_memory)
		dprintf(("*** OUT OF MEMORY ***\n"));
	panic("bs_no_paging_space: NOT ENOUGH PAGING SPACE");
}

void bs_more_space(int);	/* forward */
void bs_commit(int);		/* forward */

boolean_t user_warned = FALSE;
unsigned int clusters_committed = 0;
unsigned int clusters_available = 0;
unsigned int clusters_committed_peak = 0;

void
bs_more_space(
	int	nclusters)
{
	BSL_LOCK();
	/*
	 * Account for new paging space.
	 */
	clusters_available += nclusters;

	if (clusters_available >= clusters_committed) {
		if (verbose && user_warned) {
			printf("%s%s - %d excess clusters now.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_available - clusters_committed);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - still short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
			clusters_committed_peak -= nclusters;
		}
	}
	BSL_UNLOCK();

	return;
}

void
bs_commit(
	int	nclusters)
{
	BSL_LOCK();
	clusters_committed += nclusters;
	if (clusters_committed > clusters_available) {
		if (verbose && !user_warned) {
			user_warned = TRUE;
			printf("%s%s - short of %d clusters.\n",
			       my_name,
			       "WARNING: paging space over-committed",
			       clusters_committed - clusters_available);
		}
		if (clusters_committed > clusters_committed_peak) {
			clusters_committed_peak = clusters_committed;
		}
	} else {
		if (verbose && user_warned) {
			printf("%s%s - was short of up to %d clusters.\n",
			       my_name,
			       "paging space is OK now",
			       clusters_committed_peak - clusters_available);
			user_warned = FALSE;
			clusters_committed_peak = 0;
		}
	}
	BSL_UNLOCK();

	return;
}
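/*
 * Usage sketch (illustrative, restating call sites elsewhere in this file):
 * the two counters above are kept in balance by the rest of the pager.
 * default_pager_add_segment() calls bs_more_space(ps->ps_clcount) when a
 * segment comes online; ps_vstruct_create() calls bs_commit(vs->vs_size)
 * to commit clusters to an object, and ps_vstruct_dealloc() releases them
 * again with a negative count, bs_commit(- vs->vs_size).
 */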

int default_pager_info_verbose = 1;

void
bs_global_info(
	uint64_t *totalp,
	uint64_t *freep)
{
	uint64_t	pages_total, pages_free;
	paging_segment_t ps;
	int		i;

	PSL_LOCK();
	pages_total = pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * no need to lock: by the time this data
		 * gets back to any remote requestor it
		 * will be obsolete anyways
		 */
		pages_total += ps->ps_pgnum;
		pages_free += ps->ps_clcount << ps->ps_clshift;
		DP_DEBUG(DEBUG_BS_INTERNAL,
			 ("segment #%d: %d total, %d free\n",
			  i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift));
	}
	*totalp = pages_total;
	*freep = pages_free;
	if (verbose && user_warned && default_pager_info_verbose) {
		if (clusters_available < clusters_committed) {
			printf("%s %d clusters committed, %d available.\n",
			       my_name,
			       clusters_committed,
			       clusters_available);
		}
	}
	PSL_UNLOCK();
}

backing_store_t backing_store_alloc(void);	/* forward */

backing_store_t
backing_store_alloc(void)
{
	backing_store_t bs;

	bs = (backing_store_t) kalloc(sizeof (struct backing_store));
	if (bs == BACKING_STORE_NULL)
		panic("backing_store_alloc: no memory");

	BS_LOCK_INIT(bs);
	bs->bs_port = MACH_PORT_NULL;
	bs->bs_priority = 0;
	bs->bs_clsize = 0;
	bs->bs_pages_total = 0;
	bs->bs_pages_in = 0;
	bs->bs_pages_in_fail = 0;
	bs->bs_pages_out = 0;
	bs->bs_pages_out_fail = 0;

	return bs;
}

backing_store_t backing_store_lookup(MACH_PORT_FACE);	/* forward */

/* Even in both the component space and external versions of this pager, */
/* backing_store_lookup will be called from tasks in the application space */
backing_store_t
backing_store_lookup(
	MACH_PORT_FACE port)
{
	backing_store_t	bs;

/*
	port is currently backed with a vs structure in the alias field
	we could create an ISBS alias and a port_is_bs call but frankly
	I see no reason for the test, the bs->port == port check below
	will work properly on junk entries.

	if ((port == MACH_PORT_NULL) || port_is_vs(port))
*/
	if (port == MACH_PORT_NULL)
		return BACKING_STORE_NULL;

	BSL_LOCK();
	queue_iterate(&backing_store_list.bsl_queue, bs, backing_store_t,
		      bs_links) {
		BS_LOCK(bs);
		if (bs->bs_port == port) {
			BSL_UNLOCK();
			/* Success, return it locked. */
			return bs;
		}
		BS_UNLOCK(bs);
	}
	BSL_UNLOCK();
	return BACKING_STORE_NULL;
}

void backing_store_add(backing_store_t);	/* forward */

void
backing_store_add(
	__unused backing_store_t bs)
{
//	MACH_PORT_FACE		port = bs->bs_port;
//	MACH_PORT_FACE		pset = default_pager_default_set;
	kern_return_t		kr = KERN_SUCCESS;

	if (kr != KERN_SUCCESS)
		panic("backing_store_add: add to set");

}

/*
 * Set up default page shift, but only if not already
 * set and argument is within range.
 */
boolean_t
bs_set_default_clsize(unsigned int npages)
{
	switch(npages){
	case 1:
	case 2:
	case 4:
	case 8:
		if (default_pager_clsize == 0)	/* if not yet set */
			vstruct_def_clshift = local_log2(npages);
		return(TRUE);
	}
	return(FALSE);
}
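/*
 * Example (illustrative): bs_set_default_clsize(4) sets
 * vstruct_def_clshift = local_log2(4) = 2 and returns TRUE; any npages
 * value other than 1, 2, 4 or 8 falls through and returns FALSE, leaving
 * the compiled-in default in place.
 */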

int bs_get_global_clsize(int clsize);	/* forward */

int
bs_get_global_clsize(
	int	clsize)
{
	int			i;
	memory_object_default_t	dmm;
	kern_return_t		kr;

	/*
	 * Only allow setting of cluster size once. If called
	 * with no cluster size (default), we use the compiled-in default
	 * for the duration. The same cluster size is used for all
	 * paging segments.
	 */
	if (default_pager_clsize == 0) {
		/*
		 * Keep cluster size in bit shift because it's quicker
		 * arithmetic, and easier to keep at a power of 2.
		 */
		if (clsize != NO_CLSIZE) {
			for (i = 0; (1 << i) < clsize; i++);
			if (i > MAX_CLUSTER_SHIFT)
				i = MAX_CLUSTER_SHIFT;
			vstruct_def_clshift = i;
		}
		default_pager_clsize = (1 << vstruct_def_clshift);

		/*
		 * Let the user know the new (and definitive) cluster size.
		 */
		if (verbose)
			printf("%scluster size = %d page%s\n",
			       my_name, default_pager_clsize,
			       (default_pager_clsize == 1) ? "" : "s");

		/*
		 * Let the kernel know too, in case it hasn't used the
		 * default value provided in main() yet.
		 */
		dmm = default_pager_object;
		clsize = default_pager_clsize * vm_page_size;	/* in bytes */
		kr = host_default_memory_manager(host_priv_self(),
						 &dmm,
						 clsize);
		memory_object_default_deallocate(dmm);

		if (kr != KERN_SUCCESS) {
			panic("bs_get_global_cl_size:host_default_memory_manager");
		}
		if (dmm != default_pager_object) {
			panic("bs_get_global_cl_size:there is another default pager");
		}
	}
	ASSERT(default_pager_clsize > 0 &&
	       (default_pager_clsize & (default_pager_clsize - 1)) == 0);

	return default_pager_clsize;
}
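/*
 * Worked example (illustrative): a caller passing clsize = 3 pages exits
 * the rounding loop above at i = 2 (since 1 << 2 >= 3), so the cluster
 * size is rounded up to the next power of two: 4 pages per cluster.
 */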

kern_return_t
default_pager_backing_store_create(
	memory_object_default_t	pager,
	int			priority,
	int			clsize,		/* in bytes */
	MACH_PORT_FACE		*backing_store)
{
	backing_store_t	bs;
	MACH_PORT_FACE	port;
//	kern_return_t	kr;
	struct vstruct_alias *alias_struct;

	if (pager != default_pager_object)
		return KERN_INVALID_ARGUMENT;

	bs = backing_store_alloc();
	port = ipc_port_alloc_kernel();
	ipc_port_make_send(port);
	assert (port != IP_NULL);

	DP_DEBUG(DEBUG_BS_EXTERNAL,
		 ("priority=%d clsize=%d bs_port=0x%x\n",
		  priority, clsize, (int) backing_store));

	alias_struct = (struct vstruct_alias *)
		kalloc(sizeof (struct vstruct_alias));
	if(alias_struct != NULL) {
		alias_struct->vs = (struct vstruct *)bs;
		alias_struct->name = &default_pager_ops;
		port->ip_alias = (uintptr_t) alias_struct;
	}
	else {
		ipc_port_dealloc_kernel((MACH_PORT_FACE)(port));

		BS_LOCK_DESTROY(bs);
		kfree(bs, sizeof (struct backing_store));

		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_port = port;
	if (priority == DEFAULT_PAGER_BACKING_STORE_MAXPRI)
		priority = BS_MAXPRI;
	else if (priority == BS_NOPRI)
		priority = BS_MAXPRI;
	else
		priority = BS_MINPRI;
	bs->bs_priority = priority;

	bs->bs_clsize = bs_get_global_clsize(atop_32(clsize));

	BSL_LOCK();
	queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t,
		    bs_links);
	BSL_UNLOCK();

	backing_store_add(bs);

	*backing_store = port;
	return KERN_SUCCESS;
}

kern_return_t
default_pager_backing_store_info(
	MACH_PORT_FACE		backing_store,
	backing_store_flavor_t	flavour,
	backing_store_info_t	info,
	mach_msg_type_number_t	*size)
{
	backing_store_t			bs;
	backing_store_basic_info_t	basic;
	int				i;
	paging_segment_t		ps;

	if (flavour != BACKING_STORE_BASIC_INFO ||
	    *size < BACKING_STORE_BASIC_INFO_COUNT)
		return KERN_INVALID_ARGUMENT;

	basic = (backing_store_basic_info_t)info;
	*size = BACKING_STORE_BASIC_INFO_COUNT;

	VSTATS_LOCK(&global_stats.gs_lock);
	basic->pageout_calls = global_stats.gs_pageout_calls;
	basic->pagein_calls = global_stats.gs_pagein_calls;
	basic->pages_in = global_stats.gs_pages_in;
	basic->pages_out = global_stats.gs_pages_out;
	basic->pages_unavail = global_stats.gs_pages_unavail;
	basic->pages_init = global_stats.gs_pages_init;
	basic->pages_init_writes = global_stats.gs_pages_init_writes;
	VSTATS_UNLOCK(&global_stats.gs_lock);

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	basic->bs_pages_total = bs->bs_pages_total;
	PSL_LOCK();
	bs->bs_pages_free = 0;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) {
			PS_LOCK(ps);
			bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
			PS_UNLOCK(ps);
		}
	}
	PSL_UNLOCK();
	basic->bs_pages_free = bs->bs_pages_free;
	basic->bs_pages_in = bs->bs_pages_in;
	basic->bs_pages_in_fail = bs->bs_pages_in_fail;
	basic->bs_pages_out = bs->bs_pages_out;
	basic->bs_pages_out_fail = bs->bs_pages_out_fail;

	basic->bs_priority = bs->bs_priority;
	basic->bs_clsize = ptoa_32(bs->bs_clsize);	/* in bytes */

	BS_UNLOCK(bs);

	return KERN_SUCCESS;
}

int ps_delete(paging_segment_t);	/* forward */
boolean_t current_thread_aborted(void);

int
ps_delete(
	paging_segment_t ps)
{
	vstruct_t	vs;
	kern_return_t	error = KERN_SUCCESS;
	int		vs_count;

	VSL_LOCK();	/* get the lock on the list of vs's */

	/* The lock relationship and sequence is fairly complicated */
	/* this code looks at a live list, locking and unlocking the list */
	/* as it traverses it.  It depends on the locking behavior of */
	/* default_pager_no_senders.  no_senders always locks the vstruct */
	/* targeted for removal before locking the vstruct list.  However */
	/* it will remove that member of the list without locking its */
	/* neighbors.  We can be sure when we hold a lock on a vstruct */
	/* it cannot be removed from the list but we must hold the list */
	/* lock to be sure that its pointers to its neighbors are valid. */
	/* Also, we can hold off destruction of a vstruct when the list */
	/* lock and the vs locks are not being held by bumping the */
	/* vs_async_pending count. */


	while(backing_store_release_trigger_disable != 0) {
		VSL_SLEEP(&backing_store_release_trigger_disable, THREAD_UNINT);
	}

	/* we will choose instead to hold a send right */
	vs_count = vstruct_list.vsl_count;
	vs = (vstruct_t) queue_first((queue_entry_t)&(vstruct_list.vsl_queue));
	if(vs == (vstruct_t)&vstruct_list) {
		VSL_UNLOCK();
		return KERN_SUCCESS;
	}
	VS_LOCK(vs);
	vs_async_wait(vs);	/* wait for any pending async writes */
	if ((vs_count != 0) && (vs != NULL))
		vs->vs_async_pending += 1;  /* hold parties calling  */
					    /* vs_async_wait */

	if (bs_low == FALSE)
		backing_store_abort_compaction = FALSE;

	VS_UNLOCK(vs);
	VSL_UNLOCK();
	while((vs_count != 0) && (vs != NULL)) {
		/* We take the count of AMO's before beginning the */
		/* transfer of the target segment. */
		/* We are guaranteed that the target segment cannot get */
		/* more users.  We also know that queue entries are */
		/* made at the back of the list.  If some of the entries */
		/* we would check disappear while we are traversing the */
		/* list then we will either check new entries which */
		/* do not have any backing store in the target segment */
		/* or re-check old entries.  This might not be optimal */
		/* but it will always be correct. The alternative is to */
		/* take a snapshot of the list. */
		vstruct_t	next_vs;

		if(dp_pages_free < cluster_transfer_minimum)
			error = KERN_FAILURE;
		else {
			vm_object_t	transfer_object;
			unsigned int	count;
			upl_t		upl;
			int		upl_flags;

			transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER);
			count = 0;
			upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE |
				     UPL_SET_LITE | UPL_SET_INTERNAL);
			if (dp_encryption) {
				/* mark the pages as "encrypted" when they come in */
				upl_flags |= UPL_ENCRYPT;
			}
			error = vm_object_upl_request(transfer_object,
						      (vm_object_offset_t)0, VM_SUPER_CLUSTER,
						      &upl, NULL, &count, upl_flags);

			if(error == KERN_SUCCESS) {
				error = ps_vstruct_transfer_from_segment(
					vs, ps, upl);
				upl_commit(upl, NULL, 0);
				upl_deallocate(upl);
			} else {
				error = KERN_FAILURE;
			}
			vm_object_deallocate(transfer_object);
		}
		if(error || current_thread_aborted()) {
			VS_LOCK(vs);
			vs->vs_async_pending -= 1;  /* release vs_async_wait */
			if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
				vs->vs_waiting_async = FALSE;
				VS_UNLOCK(vs);
				thread_wakeup(&vs->vs_async_pending);
			} else {
				VS_UNLOCK(vs);
			}
			return KERN_FAILURE;
		}

		VSL_LOCK();

		while(backing_store_release_trigger_disable != 0) {
			VSL_SLEEP(&backing_store_release_trigger_disable,
				  THREAD_UNINT);
		}

		next_vs = (vstruct_t) queue_next(&(vs->vs_links));
		if((next_vs != (vstruct_t)&vstruct_list) &&
		   (vs != next_vs) && (vs_count != 1)) {
			VS_LOCK(next_vs);
			vs_async_wait(next_vs);	/* wait for any */
						/* pending async writes */
			next_vs->vs_async_pending += 1;	/* hold parties */
							/* calling vs_async_wait */
			VS_UNLOCK(next_vs);
		}
		VSL_UNLOCK();
		VS_LOCK(vs);
		vs->vs_async_pending -= 1;
		if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
			vs->vs_waiting_async = FALSE;
			VS_UNLOCK(vs);
			thread_wakeup(&vs->vs_async_pending);
		} else {
			VS_UNLOCK(vs);
		}
		if((vs == next_vs) || (next_vs == (vstruct_t)&vstruct_list))
			vs = NULL;
		else
			vs = next_vs;
		vs_count--;
	}
	return KERN_SUCCESS;
}
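/*
 * The vs_async_pending hold/release protocol used above, in miniature
 * (a sketch of the existing code, not new functionality):
 *
 *	VS_LOCK(vs);
 *	vs_async_wait(vs);		-- drain pending async writes
 *	vs->vs_async_pending += 1;	-- pin vs against destruction
 *	VS_UNLOCK(vs);
 *	... work on vs without holding its lock ...
 *	VS_LOCK(vs);
 *	vs->vs_async_pending -= 1;
 *	if (vs->vs_async_pending == 0 && vs->vs_waiting_async) {
 *		vs->vs_waiting_async = FALSE;
 *		VS_UNLOCK(vs);
 *		thread_wakeup(&vs->vs_async_pending);
 *	} else {
 *		VS_UNLOCK(vs);
 *	}
 */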


kern_return_t
default_pager_backing_store_delete_internal(
	MACH_PORT_FACE backing_store)
{
	backing_store_t		bs;
	int			i;
	paging_segment_t	ps;
	int			error;
	int			interim_pages_removed = 0;
	boolean_t		dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store );

	if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

restart:
	PSL_LOCK();
	error = KERN_SUCCESS;
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs &&
		    ! IS_PS_GOING_AWAY(ps)) {
			PS_LOCK(ps);

			if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) {
				/*
				 * Someone is already busy reclaiming this paging segment.
				 * If it's the emergency segment we are looking at then check
				 * that someone has not already recovered it and set the right
				 * state i.e. online but not activated.
				 */
				PS_UNLOCK(ps);
				continue;
			}

			/* disable access to this segment */
			ps->ps_state &= ~PS_CAN_USE;
			ps->ps_state |= PS_GOING_AWAY;
			PS_UNLOCK(ps);
			/*
			 * The "ps" segment is "off-line" now,
			 * we can try and delete it...
			 */
			if(dp_pages_free < (cluster_transfer_minimum
					    + ps->ps_pgcount)) {
				error = KERN_FAILURE;
				PSL_UNLOCK();
			}
			else {
				/* remove all pages associated with the */
				/* segment from the list of free pages */
				/* when transfer is through, all target */
				/* segment pages will appear to be free */

				dp_pages_free -= ps->ps_pgcount;
				interim_pages_removed += ps->ps_pgcount;
				PSL_UNLOCK();
				error = ps_delete(ps);
			}
			if (error != KERN_SUCCESS) {
				/*
				 * We couldn't delete the segment,
				 * probably because there's not enough
				 * virtual memory left.
				 * Re-enable all the segments.
				 */
				PSL_LOCK();
				break;
			}
			goto restart;
		}
	}

	if (error != KERN_SUCCESS) {
		for (i = 0; i <= paging_segment_max; i++) {
			ps = paging_segments[i];
			if (ps != PAGING_SEGMENT_NULL &&
			    ps->ps_bs == bs &&
			    IS_PS_GOING_AWAY(ps)) {
				PS_LOCK(ps);

				if( !IS_PS_GOING_AWAY(ps)) {
					PS_UNLOCK(ps);
					continue;
				}
				/* Handle the special clusters that came in while we let go of the lock */
				if( ps->ps_special_clusters) {
					dp_pages_free += ps->ps_special_clusters << ps->ps_clshift;
					ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift;
					ps->ps_clcount += ps->ps_special_clusters;
					if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) {
						ps_select_array[ps->ps_bs->bs_priority] = 0;
					}
					ps->ps_special_clusters = 0;
				}
				/* re-enable access to this segment */
				ps->ps_state &= ~PS_GOING_AWAY;
				ps->ps_state |= PS_CAN_USE;
				PS_UNLOCK(ps);
			}
		}
		dp_pages_free += interim_pages_removed;
		PSL_UNLOCK();
		BS_UNLOCK(bs);
		return error;
	}

	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps != PAGING_SEGMENT_NULL &&
		    ps->ps_bs == bs) {
			if(IS_PS_GOING_AWAY(ps)) {
				if(IS_PS_EMERGENCY_SEGMENT(ps)) {
					PS_LOCK(ps);
					ps->ps_state &= ~PS_GOING_AWAY;
					ps->ps_special_clusters = 0;
					ps->ps_pgcount = ps->ps_pgnum;
					ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
					dp_pages_reserve += ps->ps_pgcount;
					PS_UNLOCK(ps);
				} else {
					paging_segments[i] = PAGING_SEGMENT_NULL;
					paging_segment_count--;
					PS_LOCK(ps);
					kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
					kfree(ps, sizeof *ps);
				}
			}
		}
	}

	/* Scan the entire ps array separately to make certain we find the */
	/* proper paging_segment_max */
	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if(paging_segments[i] != PAGING_SEGMENT_NULL)
			paging_segment_max = i;
	}

	PSL_UNLOCK();

	if( dealing_with_emergency_segment ) {
		BS_UNLOCK(bs);
		return KERN_SUCCESS;
	}

	/*
	 * All the segments have been deleted.
	 * We can remove the backing store.
	 */

	/*
	 * Disable lookups of this backing store.
	 */
	if((void *)bs->bs_port->ip_alias != NULL)
		kfree((void *) bs->bs_port->ip_alias,
		      sizeof (struct vstruct_alias));
	ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port));
	bs->bs_port = MACH_PORT_NULL;
	BS_UNLOCK(bs);

	/*
	 * Remove backing store from backing_store list.
	 */
	BSL_LOCK();
	queue_remove(&backing_store_list.bsl_queue, bs, backing_store_t,
		     bs_links);
	BSL_UNLOCK();

	/*
	 * Free the backing store structure.
	 */
	BS_LOCK_DESTROY(bs);
	kfree(bs, sizeof *bs);

	return KERN_SUCCESS;
}

kern_return_t
default_pager_backing_store_delete(
	MACH_PORT_FACE backing_store)
{
	if( backing_store != emergency_segment_backing_store ) {
		default_pager_backing_store_delete_internal(emergency_segment_backing_store);
	}
	return(default_pager_backing_store_delete_internal(backing_store));
}

int ps_enter(paging_segment_t);	/* forward */

int
ps_enter(
	paging_segment_t ps)
{
	int i;

	PSL_LOCK();

	for (i = 0; i < MAX_NUM_PAGING_SEGMENTS; i++) {
		if (paging_segments[i] == PAGING_SEGMENT_NULL)
			break;
	}

	if (i < MAX_NUM_PAGING_SEGMENTS) {
		paging_segments[i] = ps;
		if (i > paging_segment_max)
			paging_segment_max = i;
		paging_segment_count++;
		if ((ps_select_array[ps->ps_bs->bs_priority] == BS_NOPRI) ||
		    (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI))
			ps_select_array[ps->ps_bs->bs_priority] = 0;
		i = 0;
	} else {
		PSL_UNLOCK();
		return KERN_RESOURCE_SHORTAGE;
	}

	PSL_UNLOCK();
	return i;
}
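/*
 * Note on ps_select_array (restating how the existing code uses it): each
 * entry holds either the index of the next segment to try at that
 * priority, or a sentinel -- BS_NOPRI when that priority is not in use,
 * BS_FULLPRI when every segment at that priority is full.  ps_enter()
 * above resets the sentinel to 0 when a new segment arrives, and
 * ps_select_segment() / ps_deallocate_cluster() below maintain it
 * thereafter.
 */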

#ifdef DEVICE_PAGING
kern_return_t
default_pager_add_segment(
	MACH_PORT_FACE	backing_store,
	MACH_PORT_FACE	device,
	recnum_t	offset,
	recnum_t	count,
	int		record_size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	PSL_LOCK();
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;

		/*
		 * Check for overlap on same device.
		 */
		if (!(ps->ps_device != device
		      || offset >= ps->ps_offset + ps->ps_recnum
		      || offset + count <= ps->ps_offset)) {
			PSL_UNLOCK();
			BS_UNLOCK(bs);
			return KERN_INVALID_ARGUMENT;
		}
	}
	PSL_UNLOCK();

	/*
	 * Set up the paging segment
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_PARTITION;
	ps->ps_device = device;
	ps->ps_offset = offset;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	ps->ps_recnum = count;
	ps->ps_pgnum = count >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	ps->ps_hint = 0;

	PS_LOCK_INIT(ps);
	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (i = 0; i < ps->ps_ncls; i++) {
		clrbit(ps->ps_bmap, i);
	}

	if(paging_segment_count == 0) {
		ps->ps_state = PS_EMERGENCY_SEGMENT;
		if(use_emergency_swap_file_first) {
			ps->ps_state |= PS_CAN_USE;
		}
	} else {
		ps->ps_state = PS_CAN_USE;
	}

	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));

		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	BS_UNLOCK(bs);

	PSL_LOCK();
	if(IS_PS_OK_TO_USE(ps)) {
		dp_pages_free += ps->ps_pgcount;
	} else {
		dp_pages_reserve += ps->ps_pgcount;
	}
	PSL_UNLOCK();

	bs_more_space(ps->ps_clcount);

	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  device, offset, count, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}

boolean_t
bs_add_device(
	char		*dev_name,
	MACH_PORT_FACE	master)
{
	security_token_t null_security_token = {
		{ 0, 0 }
	};
	MACH_PORT_FACE	device;
	int		info[DEV_GET_SIZE_COUNT];
	mach_msg_type_number_t info_count;
	MACH_PORT_FACE	bs = MACH_PORT_NULL;
	unsigned int	rec_size;
	recnum_t	count;
	int		clsize;
	MACH_PORT_FACE	reply_port;

	if (ds_device_open_sync(master, MACH_PORT_NULL, D_READ | D_WRITE,
				null_security_token, dev_name, &device))
		return FALSE;

	info_count = DEV_GET_SIZE_COUNT;
	if (!ds_device_get_status(device, DEV_GET_SIZE, info, &info_count)) {
		rec_size = info[DEV_GET_SIZE_RECORD_SIZE];
		count = info[DEV_GET_SIZE_DEVICE_SIZE] / rec_size;
		clsize = bs_get_global_clsize(0);
		if (!default_pager_backing_store_create(
			    default_pager_object,
			    DEFAULT_PAGER_BACKING_STORE_MAXPRI,
			    (clsize * vm_page_size),
			    &bs)) {
			if (!default_pager_add_segment(bs, device,
						       0, count, rec_size)) {
				return TRUE;
			}
			ipc_port_release_receive(bs);
		}
	}

	ipc_port_release_send(device);
	return FALSE;
}
#endif /* DEVICE_PAGING */
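/*
 * Worked geometry example for default_pager_add_segment() (illustrative
 * numbers): with record_size = 512 and vm_page_size = 4096,
 * ps_record_shift = local_log2(4096/512) = 3, so a device of
 * count = 2097152 records yields ps_pgnum = 2097152 >> 3 = 262144 pages
 * (1GB).  With a cluster shift of 2 that is ps_ncls = 262144 >> 2 = 65536
 * clusters of 16K each.
 */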

#if VS_ASYNC_REUSE

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
//	kern_return_t	kr;

	VS_ASYNC_LOCK();
	if (vs_async_free_list == NULL) {
		VS_ASYNC_UNLOCK();
		vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
		if (vsa != NULL) {
			/*
			 * Try allocating a reply port named after the
			 * address of the vs_async structure.
			 */
			struct vstruct_alias *alias_struct;

			reply_port = ipc_port_alloc_kernel();
			alias_struct = (struct vstruct_alias *)
				kalloc(sizeof (struct vstruct_alias));
			if(alias_struct != NULL) {
				alias_struct->vs = (struct vstruct *)vsa;
				alias_struct->name = &default_pager_ops;
				reply_port->ip_alias = (uintptr_t) alias_struct;
				vsa->reply_port = reply_port;
				vs_alloc_async_count++;
			}
			else {
				vs_alloc_async_failed++;
				ipc_port_dealloc_kernel((MACH_PORT_FACE)
							(reply_port));
				kfree(vsa, sizeof (struct vs_async));
				vsa = NULL;
			}
		}
	} else {
		vsa = vs_async_free_list;
		vs_async_free_list = vs_async_free_list->vsa_next;
		VS_ASYNC_UNLOCK();
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	VS_ASYNC_LOCK();
	vsa->vsa_next = vs_async_free_list;
	vs_async_free_list = vsa;
	VS_ASYNC_UNLOCK();
}

#else	/* VS_ASYNC_REUSE */

struct vs_async *
vs_alloc_async(void)
{
	struct vs_async	*vsa;
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;
	struct vstruct_alias *alias_struct;

	vsa = (struct vs_async *) kalloc(sizeof (struct vs_async));
	if (vsa != NULL) {
		/*
		 * Try allocating a reply port named after the
		 * address of the vs_async structure.
		 */
		reply_port = ipc_port_alloc_kernel();
		alias_struct = (struct vstruct_alias *)
			kalloc(sizeof (struct vstruct_alias));
		if(alias_struct != NULL) {
			alias_struct->vs = (struct vstruct *)vsa;
			alias_struct->name = &default_pager_ops;
			reply_port->ip_alias = (uintptr_t) alias_struct;
			vsa->reply_port = reply_port;
			vs_alloc_async_count++;
		}
		else {
			vs_alloc_async_failed++;
			ipc_port_dealloc_kernel((MACH_PORT_FACE)
						(reply_port));
			kfree(vsa, sizeof (struct vs_async));
			vsa = NULL;
		}
	}

	return vsa;
}

void
vs_free_async(
	struct vs_async *vsa)
{
	MACH_PORT_FACE	reply_port;
	kern_return_t	kr;

	reply_port = vsa->reply_port;
	kfree((void *) reply_port->ip_alias, sizeof (struct vstruct_alias));
	kfree(vsa, sizeof (struct vs_async));
	ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port));
#if 0
	VS_ASYNC_LOCK();
	vs_alloc_async_count--;
	VS_ASYNC_UNLOCK();
#endif
}

#endif	/* VS_ASYNC_REUSE */

zone_t	vstruct_zone;

vstruct_t
ps_vstruct_create(
	dp_size_t size)
{
	vstruct_t	vs;
	unsigned int	i;

	vs = (vstruct_t) zalloc(vstruct_zone);
	if (vs == VSTRUCT_NULL) {
		return VSTRUCT_NULL;
	}

	VS_LOCK_INIT(vs);

	/*
	 * The following fields will be provided later.
	 */
	vs->vs_pager_ops = NULL;
	vs->vs_control = MEMORY_OBJECT_CONTROL_NULL;
	vs->vs_references = 1;
	vs->vs_seqno = 0;

	vs->vs_waiting_seqno = FALSE;
	vs->vs_waiting_read = FALSE;
	vs->vs_waiting_write = FALSE;
	vs->vs_waiting_async = FALSE;

	vs->vs_readers = 0;
	vs->vs_writers = 0;

	vs->vs_errors = 0;

	vs->vs_clshift = local_log2(bs_get_global_clsize(0));
	vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1;
	vs->vs_async_pending = 0;

	/*
	 * Allocate the cluster map, either CLMAP_SIZE or INDIRECT_CLMAP_SIZE
	 * bytes depending on the size of the memory object.
	 */
	if (INDIRECT_CLMAP(vs->vs_size)) {
		vs->vs_imap = (struct vs_map **)
			kalloc(INDIRECT_CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = TRUE;
	} else {
		vs->vs_dmap = (struct vs_map *)
			kalloc(CLMAP_SIZE(vs->vs_size));
		vs->vs_indirect = FALSE;
	}
	vs->vs_xfer_pending = FALSE;
	DP_DEBUG(DEBUG_VS_INTERNAL,
		 ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect));

	/*
	 * Check to see that we got the space.
	 */
	if (!vs->vs_dmap) {
		kfree(vs, sizeof *vs);
		return VSTRUCT_NULL;
	}

	/*
	 * Zero the indirect pointers, or clear the direct pointers.
	 */
	if (vs->vs_indirect)
		memset(vs->vs_imap, 0,
		       INDIRECT_CLMAP_SIZE(vs->vs_size));
	else
		for (i = 0; i < vs->vs_size; i++)
			VSM_CLR(vs->vs_dmap[i]);

	VS_MAP_LOCK_INIT(vs);

	bs_commit(vs->vs_size);

	return vs;
}
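/*
 * Sizing example (illustrative): for a 1MB object with 4K pages,
 * atop_32(round_page_32(size)) = 256 pages; with vs_clshift = 2 that is
 * vs_size = ((256 - 1) >> 2) + 1 = 64 clusters.  Whether the map is
 * direct or indirect is then decided by INDIRECT_CLMAP(vs_size) above.
 */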

paging_segment_t ps_select_segment(unsigned int, int *);	/* forward */

paging_segment_t
ps_select_segment(
	unsigned int	shift,
	int		*psindex)
{
	paging_segment_t ps;
	int		 i;
	int		 j;

	/*
	 * Optimize case where there's only one segment.
	 * paging_segment_max will index the one and only segment.
	 */

	PSL_LOCK();
	if (paging_segment_count == 1) {
		paging_segment_t lps = PAGING_SEGMENT_NULL;	/* used to avoid extra PS_UNLOCK */
		ipc_port_t trigger = IP_NULL;

		ps = paging_segments[paging_segment_max];
		*psindex = paging_segment_max;
		PS_LOCK(ps);
		if( !IS_PS_EMERGENCY_SEGMENT(ps) ) {
			panic("Emergency paging segment missing\n");
		}
		ASSERT(ps->ps_clshift >= shift);
		if(IS_PS_OK_TO_USE(ps)) {
			if (ps->ps_clcount) {
				ps->ps_clcount--;
				dp_pages_free -= 1 << ps->ps_clshift;
				ps->ps_pgcount -= 1 << ps->ps_clshift;
				if(min_pages_trigger_port &&
				   (dp_pages_free < minimum_pages_remaining)) {
					trigger = min_pages_trigger_port;
					min_pages_trigger_port = NULL;
					bs_low = TRUE;
					backing_store_abort_compaction = TRUE;
				}
				lps = ps;
			}
		}
		PS_UNLOCK(ps);

		if( lps == PAGING_SEGMENT_NULL ) {
			if(dp_pages_free) {
				dp_pages_free_drift_count++;
				if(dp_pages_free > dp_pages_free_drifted_max) {
					dp_pages_free_drifted_max = dp_pages_free;
				}
				dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free));
			}
			dp_pages_free = 0;
		}

		PSL_UNLOCK();

		if (trigger != IP_NULL) {
			dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return lps;
	}

	if (paging_segment_count == 0) {
		if(dp_pages_free) {
			dp_pages_free_drift_count++;
			if(dp_pages_free > dp_pages_free_drifted_max) {
				dp_pages_free_drifted_max = dp_pages_free;
			}
			dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free));
		}
		dp_pages_free = 0;
		PSL_UNLOCK();
		return PAGING_SEGMENT_NULL;
	}

	for (i = BS_MAXPRI; i >= BS_MINPRI; i--) {
		int start_index;

		if ((ps_select_array[i] == BS_NOPRI) ||
		    (ps_select_array[i] == BS_FULLPRI))
			continue;
		start_index = ps_select_array[i];

		if(!(paging_segments[start_index])) {
			j = start_index+1;
			physical_transfer_cluster_count = 0;
		}
		else if ((physical_transfer_cluster_count+1) == (ALLOC_STRIDE >>
								 (((paging_segments[start_index])->ps_clshift)
								  + vm_page_shift))) {
			physical_transfer_cluster_count = 0;
			j = start_index + 1;
		} else {
			physical_transfer_cluster_count+=1;
			j = start_index;
			if(start_index == 0)
				start_index = paging_segment_max;
			else
				start_index = start_index - 1;
		}

		while (1) {
			if (j > paging_segment_max)
				j = 0;
			if ((ps = paging_segments[j]) &&
			    (ps->ps_bs->bs_priority == i)) {
				/*
				 * Force the ps cluster size to be
				 * >= that of the vstruct.
				 */
				PS_LOCK(ps);
				if (IS_PS_OK_TO_USE(ps)) {
					if ((ps->ps_clcount) &&
					    (ps->ps_clshift >= shift)) {
						ipc_port_t trigger = IP_NULL;

						ps->ps_clcount--;
						dp_pages_free -= 1 << ps->ps_clshift;
						ps->ps_pgcount -= 1 << ps->ps_clshift;
						if(min_pages_trigger_port &&
						   (dp_pages_free <
						    minimum_pages_remaining)) {
							trigger = min_pages_trigger_port;
							min_pages_trigger_port = NULL;
							bs_low = TRUE;
							backing_store_abort_compaction = TRUE;
						}
						PS_UNLOCK(ps);
						/*
						 * found one, quit looking.
						 */
						ps_select_array[i] = j;
						PSL_UNLOCK();

						if (trigger != IP_NULL) {
							dprintf(("ps_select_segment - send HI_WAT_ALERT\n"));

							default_pager_space_alert(
								trigger,
								HI_WAT_ALERT);
							ipc_port_release_send(trigger);
						}
						*psindex = j;
						return ps;
					}
				}
				PS_UNLOCK(ps);
			}
			if (j == start_index) {
				/*
				 * none at this priority -- mark it full
				 */
				ps_select_array[i] = BS_FULLPRI;
				break;
			}
			j++;
		}
	}

	if(dp_pages_free) {
		dp_pages_free_drift_count++;
		if(dp_pages_free > dp_pages_free_drifted_max) {
			dp_pages_free_drifted_max = dp_pages_free;
		}
		dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free));
	}
	dp_pages_free = 0;
	PSL_UNLOCK();
	return PAGING_SEGMENT_NULL;
}

dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/

dp_offset_t
ps_allocate_cluster(
	vstruct_t		vs,
	int			*psindex,
	paging_segment_t	use_ps)
{
	unsigned int	byte_num;
	int		bit_num = 0;
	paging_segment_t ps;
	dp_offset_t	cluster;
	ipc_port_t	trigger = IP_NULL;

	/*
	 * Find best paging segment.
	 * ps_select_segment will decrement cluster count on ps.
	 * Must pass cluster shift to find the most appropriate segment.
	 */
	/* NOTE:  The addition of paging segment delete capability threatened
	 * to seriously complicate the treatment of paging segments in this
	 * module and the ones that call it (notably ps_clmap), because of the
	 * difficulty in assuring that the paging segment would continue to
	 * exist between being unlocked and locked.  This was
	 * avoided because all calls to this module are based in either
	 * dp_memory_object calls which rely on the vs lock, or by
	 * the transfer function which is part of the segment delete path.
	 * The transfer function which is part of paging segment delete is
	 * protected from multiple callers by the backing store lock.
	 * The paging segment delete function treats mappings to a paging
	 * segment on a vstruct by vstruct basis, locking the vstruct targeted
	 * while data is transferred to the remaining segments.  This is in
	 * line with the view that incomplete or in-transition mappings between
	 * data, a vstruct, and backing store are protected by the vs lock.
	 * This and the ordering of the paging segment "going_away" bit setting
	 * protects us.
	 */
retry:
	if (use_ps != PAGING_SEGMENT_NULL) {
		ps = use_ps;
		PSL_LOCK();
		PS_LOCK(ps);

		ASSERT(ps->ps_clcount != 0);

		ps->ps_clcount--;
		dp_pages_free -= 1 << ps->ps_clshift;
		ps->ps_pgcount -= 1 << ps->ps_clshift;
		if(min_pages_trigger_port &&
		   (dp_pages_free < minimum_pages_remaining)) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			bs_low = TRUE;
			backing_store_abort_compaction = TRUE;
		}
		PSL_UNLOCK();
		PS_UNLOCK(ps);
		if (trigger != IP_NULL) {
			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}

	} else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) ==
		   PAGING_SEGMENT_NULL) {
		static clock_sec_t lastnotify = 0;
		clock_sec_t now;
		clock_nsec_t nanoseconds_dummy;

		/*
		 * Don't immediately jump to the emergency segment. Give the
		 * dynamic pager a chance to create its first normal swap file.
		 * Unless, of course, the very first normal swap file can't be
		 * created due to some problem we didn't expect, i.e.
		 * use_emergency_swap_file_first was never set to true initially.
		 * It then gets set in the swap file creation error handling.
		 */
		if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) {

			ps = paging_segments[EMERGENCY_PSEG_INDEX];
			if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) {
				PSL_LOCK();
				PS_LOCK(ps);

				if(IS_PS_GOING_AWAY(ps)) {
					/* Someone de-activated the emergency paging segment*/
					PS_UNLOCK(ps);
					PSL_UNLOCK();

				} else if(dp_pages_free) {
					/*
					 * Someone has already activated the emergency paging segment
					 * OR
					 * Between us having rec'd a NULL segment from ps_select_segment
					 * and reaching here a new normal segment could have been added.
					 * E.g. we get NULL segment and another thread just added the
					 * new swap file. Hence check to see if we have more dp_pages_free
					 * before activating the emergency segment.
					 */
					PS_UNLOCK(ps);
					PSL_UNLOCK();
					goto retry;

				} else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) {
					/*
					 * PS_CAN_USE is only reset from the emergency segment when it's
					 * been successfully recovered. So it's legal to have an emergency
					 * segment that has PS_CAN_USE but no clusters because its recovery
					 * failed.
					 */
					backing_store_t bs = ps->ps_bs;
					ps->ps_state |= PS_CAN_USE;
					if(ps_select_array[bs->bs_priority] == BS_FULLPRI ||
					   ps_select_array[bs->bs_priority] == BS_NOPRI) {
						ps_select_array[bs->bs_priority] = 0;
					}
					dp_pages_free += ps->ps_pgcount;
					dp_pages_reserve -= ps->ps_pgcount;
					PS_UNLOCK(ps);
					PSL_UNLOCK();
					dprintf(("Switching ON Emergency paging segment\n"));
					goto retry;
				}

				PS_UNLOCK(ps);
				PSL_UNLOCK();
			}
		}

		/*
		 * Emit a notification of the low-paging resource condition
		 * but don't issue it more than once every five seconds. This
		 * prevents us from overflowing logs with thousands of
		 * repetitions of the message.
		 */
		clock_get_system_nanotime(&now, &nanoseconds_dummy);
		if (paging_segment_count > 1 && (now > lastnotify + 5)) {
			/* With an activated emergency paging segment we still
			 * didn't get any clusters. This could mean that the
			 * emergency paging segment is exhausted.
			 */
			dprintf(("System is out of paging space.\n"));
			lastnotify = now;
		}

		PSL_LOCK();

		if(min_pages_trigger_port) {
			trigger = min_pages_trigger_port;
			min_pages_trigger_port = NULL;
			bs_low = TRUE;
			backing_store_abort_compaction = TRUE;
		}
		PSL_UNLOCK();
		if (trigger != IP_NULL) {
			dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n"));

			default_pager_space_alert(trigger, HI_WAT_ALERT);
			ipc_port_release_send(trigger);
		}
		return (dp_offset_t) -1;
	}

	/*
	 * Look for an available cluster.  At the end of the loop,
	 * byte_num is the byte offset and bit_num is the bit offset of the
	 * first zero bit in the paging segment bitmap.
	 */
	PS_LOCK(ps);
	byte_num = ps->ps_hint;
	for (; byte_num < howmany(ps->ps_ncls, NBBY); byte_num++) {
		if (*(ps->ps_bmap + byte_num) != BYTEMASK) {
			for (bit_num = 0; bit_num < NBBY; bit_num++) {
				if (isclr((ps->ps_bmap + byte_num), bit_num))
					break;
			}
			ASSERT(bit_num != NBBY);
			break;
		}
	}
	ps->ps_hint = byte_num;
	cluster = (byte_num*NBBY) + bit_num;

	/* Space was reserved, so this must be true */
	ASSERT(cluster < ps->ps_ncls);

	setbit(ps->ps_bmap, cluster);
	PS_UNLOCK(ps);

	return cluster;
}
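/*
 * Bitmap-scan example (illustrative): if ps_bmap[byte_num] == 0x07 (bits
 * 0-2 set), the inner loop above stops at bit_num = 3, so the allocated
 * cluster is byte_num * NBBY + 3.  ps_hint remembers the byte where the
 * scan stopped so the next allocation can skip the fully-set prefix.
 */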

void ps_deallocate_cluster(paging_segment_t, dp_offset_t);	/* forward */

void
ps_deallocate_cluster(
	paging_segment_t	ps,
	dp_offset_t		cluster)
{

	if (cluster >= ps->ps_ncls)
		panic("ps_deallocate_cluster: Invalid cluster number");

	/*
	 * Lock the paging segment, clear the cluster's bitmap and increment
	 * the number of free clusters.
	 */
	PSL_LOCK();
	PS_LOCK(ps);
	clrbit(ps->ps_bmap, cluster);
	if( IS_PS_OK_TO_USE(ps)) {
		++ps->ps_clcount;
		ps->ps_pgcount += 1 << ps->ps_clshift;
		dp_pages_free += 1 << ps->ps_clshift;
	} else {
		ps->ps_special_clusters += 1;
	}

	/*
	 * Move the hint down to the freed cluster if it is
	 * less than the current hint.
	 */
	if ((cluster/NBBY) < ps->ps_hint) {
		ps->ps_hint = (cluster/NBBY);
	}


	/*
	 * If we're freeing space on a full priority, reset the array.
	 */
	if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI)
		ps_select_array[ps->ps_bs->bs_priority] = 0;
	PS_UNLOCK(ps);
	PSL_UNLOCK();

	return;
}

void ps_dealloc_vsmap(struct vs_map *, dp_size_t);	/* forward */

void
ps_dealloc_vsmap(
	struct vs_map	*vsmap,
	dp_size_t	size)
{
	unsigned int	i;
	struct ps_vnode_trim_data trim_data;

	ps_vnode_trim_init(&trim_data);

	for (i = 0; i < size; i++) {
		if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) {
			ps_vnode_trim_more(&trim_data,
					   &vsmap[i],
					   VSM_PS(vsmap[i])->ps_clshift,
					   vm_page_size << VSM_PS(vsmap[i])->ps_clshift);
			ps_deallocate_cluster(VSM_PS(vsmap[i]),
					      VSM_CLOFF(vsmap[i]));
		} else {
			ps_vnode_trim_now(&trim_data);
		}
	}
	ps_vnode_trim_now(&trim_data);
}

void
ps_vstruct_dealloc(
	vstruct_t vs)
{
	unsigned int	i;
//	spl_t	s;

	VS_MAP_LOCK(vs);

	/*
	 * If this is an indirect structure, then we walk through the valid
	 * (non-zero) indirect pointers and deallocate the clusters
	 * associated with each used map entry (via ps_dealloc_vsmap).
	 * When all of the clusters in an indirect block have been
	 * freed, we deallocate the block.  When all of the indirect
	 * blocks have been deallocated we deallocate the memory
	 * holding the indirect pointers.
	 */
	if (vs->vs_indirect) {
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			if (vs->vs_imap[i] != NULL) {
				ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES);
				kfree(vs->vs_imap[i], CLMAP_THRESHOLD);
			}
		}
		kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
	} else {
		/*
		 * Direct map.  Free used clusters, then memory.
		 */
		ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
		kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
	}
	VS_MAP_UNLOCK(vs);

	bs_commit(- vs->vs_size);

	VS_MAP_LOCK_DESTROY(vs);

	zfree(vstruct_zone, vs);
}

kern_return_t
ps_vstruct_reclaim(
	vstruct_t vs,
	boolean_t return_to_vm,
	boolean_t reclaim_backing_store)
{
	unsigned int	i, j;
	struct vs_map	*vsmap;
	boolean_t	vsmap_all_clear, vsimap_all_clear;
	struct vm_object_fault_info fault_info;
	int		clmap_off;
	unsigned int	vsmap_size;
	kern_return_t	kr = KERN_SUCCESS;

	VS_MAP_LOCK(vs);

	fault_info.cluster_size = VM_SUPER_CLUSTER;
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.user_tag = 0;
	fault_info.pmap_options = 0;
	fault_info.lo_offset = 0;
	fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift);
	fault_info.io_sync = reclaim_backing_store;
	fault_info.batch_pmap_op = FALSE;

	/*
	 * If this is an indirect structure, then we walk through the valid
	 * (non-zero) indirect pointers and deallocate the clusters
	 * associated with each used map entry (via ps_dealloc_vsmap).
	 * When all of the clusters in an indirect block have been
	 * freed, we deallocate the block.  When all of the indirect
	 * blocks have been deallocated we deallocate the memory
	 * holding the indirect pointers.
	 */
	if (vs->vs_indirect) {
		vsimap_all_clear = TRUE;
		for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) {
			vsmap = vs->vs_imap[i];
			if (vsmap == NULL)
				continue;
			/* loop on clusters in this indirect map */
			clmap_off = (vm_page_size * CLMAP_ENTRIES *
				     VSCLSIZE(vs) * i);
			if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size))
				vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i);
			else
				vsmap_size = CLMAP_ENTRIES;
			vsmap_all_clear = TRUE;
			if (return_to_vm) {
				for (j = 0; j < vsmap_size;) {
					if (VSM_ISCLR(vsmap[j]) ||
					    VSM_ISERR(vsmap[j])) {
						j++;
						clmap_off += vm_page_size * VSCLSIZE(vs);
						continue;
					}
					VS_MAP_UNLOCK(vs);
					kr = pvs_cluster_read(
						vs,
						clmap_off,
						(dp_size_t) -1, /* read whole cluster */
						&fault_info);

					VS_MAP_LOCK(vs); /* XXX what if it changed ? */
					if (kr != KERN_SUCCESS) {
						vsmap_all_clear = FALSE;
						vsimap_all_clear = FALSE;

						kr = KERN_MEMORY_ERROR;
						goto out;
					}
				}
			}
			if (vsmap_all_clear) {
				ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES);
				kfree(vsmap, CLMAP_THRESHOLD);
				vs->vs_imap[i] = NULL;
			}
		}
		if (vsimap_all_clear) {
//			kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size));
		}
	} else {
		/*
		 * Direct map.  Free used clusters, then memory.
		 */
		vsmap = vs->vs_dmap;
		if (vsmap == NULL) {
			goto out;
		}
		vsmap_all_clear = TRUE;
		/* loop on clusters in the direct map */
		if (return_to_vm) {
			for (j = 0; j < vs->vs_size;) {
				if (VSM_ISCLR(vsmap[j]) ||
				    VSM_ISERR(vsmap[j])) {
					j++;
					continue;
				}
				clmap_off = vm_page_size * (j << vs->vs_clshift);
				VS_MAP_UNLOCK(vs);
				kr = pvs_cluster_read(
					vs,
					clmap_off,
					(dp_size_t) -1, /* read whole cluster */
					&fault_info);

				VS_MAP_LOCK(vs); /* XXX what if it changed ? */
				if (kr != KERN_SUCCESS) {
					vsmap_all_clear = FALSE;

					kr = KERN_MEMORY_ERROR;
					goto out;
				} else {
//					VSM_CLR(vsmap[j]);
				}
			}
		}
		if (vsmap_all_clear) {
			ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size);
//			kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size));
		}
	}
out:
	VS_MAP_UNLOCK(vs);

	return kr;
}
2054 * NOTE: allocating memory here is dangerous, as we're in the 2055 * pageout path. 2056 */ 2057 if (INDIRECT_CLMAP(new_size)) { 2058 int new_map_size = INDIRECT_CLMAP_SIZE(new_size); 2059 2060 /* 2061 * Get a new indirect map and zero it. 2062 */ 2063 old_map_size = INDIRECT_CLMAP_SIZE(vs->vs_size); 2064 if (vs->vs_indirect && 2065 (new_map_size == old_map_size)) { 2066 bs_commit(new_size - vs->vs_size); 2067 vs->vs_size = new_size; 2068 return 0; 2069 } 2070 2071 new_imap = (struct vs_map **)kalloc(new_map_size); 2072 if (new_imap == NULL) { 2073 return -1; 2074 } 2075 memset(new_imap, 0, new_map_size); 2076 2077 if (vs->vs_indirect) { 2078 /* Copy old entries into new map */ 2079 memcpy(new_imap, vs->vs_imap, old_map_size); 2080 /* Arrange to free the old map */ 2081 old_map = (void *) vs->vs_imap; 2082 newdsize = 0; 2083 } else { /* Old map was a direct map */ 2084 /* Allocate an indirect page */ 2085 if ((new_imap[0] = (struct vs_map *) 2086 kalloc(CLMAP_THRESHOLD)) == NULL) { 2087 kfree(new_imap, new_map_size); 2088 return -1; 2089 } 2090 new_dmap = new_imap[0]; 2091 newdsize = CLMAP_ENTRIES; 2092 } 2093 } else { 2094 new_imap = NULL; 2095 newdsize = new_size; 2096 /* 2097 * If the new map is a direct map, then the old map must 2098 * also have been a direct map. All we have to do is 2099 * to allocate a new direct map, copy the old entries 2100 * into it and free the old map. 2101 */ 2102 if ((new_dmap = (struct vs_map *) 2103 kalloc(CLMAP_SIZE(new_size))) == NULL) { 2104 return -1; 2105 } 2106 } 2107 if (newdsize) { 2108 2109 /* Free the old map */ 2110 old_map = (void *) vs->vs_dmap; 2111 old_map_size = CLMAP_SIZE(vs->vs_size); 2112 2113 /* Copy info from the old map into the new map */ 2114 memcpy(new_dmap, vs->vs_dmap, old_map_size); 2115 2116 /* Initialize the rest of the new map */ 2117 for (i = vs->vs_size; i < newdsize; i++) 2118 VSM_CLR(new_dmap[i]); 2119 } 2120 if (new_imap) { 2121 vs->vs_imap = new_imap; 2122 vs->vs_indirect = TRUE; 2123 } else 2124 vs->vs_dmap = new_dmap; 2125 bs_commit(new_size - vs->vs_size); 2126 vs->vs_size = new_size; 2127 if (old_map) 2128 kfree(old_map, old_map_size); 2129 return 0; 2130} 2131 2132dp_offset_t 2133ps_clmap( 2134 vstruct_t vs, 2135 dp_offset_t offset, 2136 struct clmap *clmap, 2137 int flag, 2138 dp_size_t size, 2139 int error) 2140{ 2141 dp_offset_t cluster; /* The cluster of offset. */ 2142 dp_offset_t newcl; /* The new cluster allocated. */ 2143 dp_offset_t newoff; 2144 unsigned int i; 2145 struct vs_map *vsmap; 2146 2147 VS_MAP_LOCK(vs); 2148 2149 ASSERT(vs->vs_dmap); 2150 cluster = atop_32(offset) >> vs->vs_clshift; 2151 2152 /* 2153 * Initialize cluster error value 2154 */ 2155 clmap->cl_error = 0; 2156 2157 /* 2158 * If the object has grown, extend the page map. 2159 */ 2160 if (cluster >= vs->vs_size) { 2161 if (flag == CL_FIND) { 2162 /* Do not allocate if just doing a lookup */ 2163 VS_MAP_UNLOCK(vs); 2164 return (dp_offset_t) -1; 2165 } 2166 if (ps_map_extend(vs, cluster + 1)) { 2167 VS_MAP_UNLOCK(vs); 2168 return (dp_offset_t) -1; 2169 } 2170 } 2171 2172 /* 2173 * Look for the desired cluster. If the map is indirect, then we 2174 * have a two level lookup. First find the indirect block, then 2175 * find the actual cluster. If the indirect block has not yet 2176 * been allocated, then do so. If the cluster has not yet been 2177 * allocated, then do so. 2178 * 2179 * If any of the allocations fail, then return an error. 2180 * Don't allocate if just doing a lookup. 
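 * For example, assuming 4 KB pages (vm_page_shift == 12) and a
 * vs_clshift of 2 (4 pages per cluster), offset 0x15000 falls in
 * cluster (0x15000 >> 12) >> 2 == 5; with an indirect map that is
 * slot 5 / CLMAP_ENTRIES, entry 5 % CLMAP_ENTRIES within it.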
2181 */ 2182 if (vs->vs_indirect) { 2183 long ind_block = cluster/CLMAP_ENTRIES; 2184 2185 /* Is the indirect block allocated? */ 2186 vsmap = vs->vs_imap[ind_block]; 2187 if (vsmap == NULL) { 2188 if (flag == CL_FIND) { 2189 VS_MAP_UNLOCK(vs); 2190 return (dp_offset_t) -1; 2191 } 2192 2193 /* Allocate the indirect block */ 2194 vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD); 2195 if (vsmap == NULL) { 2196 VS_MAP_UNLOCK(vs); 2197 return (dp_offset_t) -1; 2198 } 2199 /* Initialize the cluster offsets */ 2200 for (i = 0; i < CLMAP_ENTRIES; i++) 2201 VSM_CLR(vsmap[i]); 2202 vs->vs_imap[ind_block] = vsmap; 2203 } 2204 } else 2205 vsmap = vs->vs_dmap; 2206 2207 ASSERT(vsmap); 2208 vsmap += cluster%CLMAP_ENTRIES; 2209 2210 /* 2211 * At this point, vsmap points to the struct vs_map desired. 2212 * 2213 * Look in the map for the cluster, if there was an error on a 2214 * previous write, flag it and return. If it is not yet 2215 * allocated, then allocate it, if we're writing; if we're 2216 * doing a lookup and the cluster's not allocated, return error. 2217 */ 2218 if (VSM_ISERR(*vsmap)) { 2219 clmap->cl_error = VSM_GETERR(*vsmap); 2220 VS_MAP_UNLOCK(vs); 2221 return (dp_offset_t) -1; 2222 } else if (VSM_ISCLR(*vsmap)) { 2223 int psindex; 2224 2225 if (flag == CL_FIND) { 2226 /* 2227 * If there's an error and the entry is clear, then 2228 * we've run out of swap space. Record the error 2229 * here and return. 2230 */ 2231 if (error) { 2232 VSM_SETERR(*vsmap, error); 2233 } 2234 VS_MAP_UNLOCK(vs); 2235 return (dp_offset_t) -1; 2236 } else { 2237 /* 2238 * Attempt to allocate a cluster from the paging segment 2239 */ 2240 newcl = ps_allocate_cluster(vs, &psindex, 2241 PAGING_SEGMENT_NULL); 2242 if (newcl == (dp_offset_t) -1) { 2243 VS_MAP_UNLOCK(vs); 2244 return (dp_offset_t) -1; 2245 } 2246 VSM_CLR(*vsmap); 2247 VSM_SETCLOFF(*vsmap, newcl); 2248 VSM_SETPS(*vsmap, psindex); 2249 } 2250 } else 2251 newcl = VSM_CLOFF(*vsmap); 2252 2253 /* 2254 * Fill in pertinent fields of the clmap 2255 */ 2256 clmap->cl_ps = VSM_PS(*vsmap); 2257 clmap->cl_numpages = VSCLSIZE(vs); 2258 clmap->cl_bmap.clb_map = (unsigned int) VSM_BMAP(*vsmap); 2259 2260 /* 2261 * Byte offset in paging segment is byte offset to cluster plus 2262 * byte offset within cluster. It looks ugly, but should be 2263 * relatively quick. 2264 */ 2265 ASSERT(trunc_page(offset) == offset); 2266 newcl = ptoa_32(newcl) << vs->vs_clshift; 2267 newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1); 2268 if (flag == CL_ALLOC) { 2269 /* 2270 * set bits in the allocation bitmap according to which 2271 * pages were requested. size is in bytes. 2272 */ 2273 i = atop_32(newoff); 2274 while ((size > 0) && (i < VSCLSIZE(vs))) { 2275 VSM_SETALLOC(*vsmap, i); 2276 i++; 2277 size -= vm_page_size; 2278 } 2279 } 2280 clmap->cl_alloc.clb_map = (unsigned int) VSM_ALLOC(*vsmap); 2281 if (newoff) { 2282 /* 2283 * Offset is not cluster aligned, so number of pages 2284 * and bitmaps must be adjusted 2285 */ 2286 clmap->cl_numpages -= atop_32(newoff); 2287 CLMAP_SHIFT(clmap, vs); 2288 CLMAP_SHIFTALLOC(clmap, vs); 2289 } 2290 2291 /* 2292 * 2293 * The setting of valid bits and handling of write errors 2294 * must be done here, while we hold the lock on the map. 2295 * It logically should be done in ps_vs_write_complete(). 2296 * The size and error information has been passed from 2297 * ps_vs_write_complete(). If the size parameter is non-zero, 2298 * then there is work to be done. 
If error is also non-zero, 2299 * then the error number is recorded in the cluster and the 2300 * entire cluster is in error. 2301 */ 2302 if (size && flag == CL_FIND) { 2303 dp_offset_t off = (dp_offset_t) 0; 2304 2305 if (!error) { 2306 for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0; 2307 i++) { 2308 VSM_SETPG(*vsmap, i); 2309 size -= vm_page_size; 2310 } 2311 ASSERT(i <= VSCLSIZE(vs)); 2312 } else { 2313 BS_STAT(clmap->cl_ps->ps_bs, 2314 clmap->cl_ps->ps_bs->bs_pages_out_fail += 2315 atop_32(size)); 2316 off = VSM_CLOFF(*vsmap); 2317 VSM_SETERR(*vsmap, error); 2318 } 2319 /* 2320 * Deallocate cluster if error, and no valid pages 2321 * already present. 2322 */ 2323 if (off != (dp_offset_t) 0) 2324 ps_deallocate_cluster(clmap->cl_ps, off); 2325 VS_MAP_UNLOCK(vs); 2326 return (dp_offset_t) 0; 2327 } else 2328 VS_MAP_UNLOCK(vs); 2329 2330 DP_DEBUG(DEBUG_VS_INTERNAL, 2331 ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n", 2332 newcl+newoff, (int) vs, (int) vsmap, flag)); 2333 DP_DEBUG(DEBUG_VS_INTERNAL, 2334 (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n", 2335 (int) clmap->cl_ps, clmap->cl_numpages, 2336 (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map)); 2337 2338 return (newcl + newoff); 2339} 2340 2341void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */ 2342 2343void 2344ps_clunmap( 2345 vstruct_t vs, 2346 dp_offset_t offset, 2347 dp_size_t length) 2348{ 2349 dp_offset_t cluster; /* The cluster number of offset */ 2350 struct vs_map *vsmap; 2351 struct ps_vnode_trim_data trim_data; 2352 2353 ps_vnode_trim_init(&trim_data); 2354 2355 VS_MAP_LOCK(vs); 2356 2357 /* 2358 * Loop through all clusters in this range, freeing paging segment 2359 * clusters and map entries as encountered. 2360 */ 2361 while (length > 0) { 2362 dp_offset_t newoff; 2363 unsigned int i; 2364 2365 cluster = atop_32(offset) >> vs->vs_clshift; 2366 if (vs->vs_indirect) /* indirect map */ 2367 vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES]; 2368 else 2369 vsmap = vs->vs_dmap; 2370 if (vsmap == NULL) { 2371 ps_vnode_trim_now(&trim_data); 2372 VS_MAP_UNLOCK(vs); 2373 return; 2374 } 2375 vsmap += cluster%CLMAP_ENTRIES; 2376 if (VSM_ISCLR(*vsmap)) { 2377 ps_vnode_trim_now(&trim_data); 2378 length -= vm_page_size; 2379 offset += vm_page_size; 2380 continue; 2381 } 2382 /* 2383 * We've got a valid mapping. Clear it and deallocate 2384 * paging segment cluster pages. 2385 * Optimize for entire cluster clearing. 2386 */ 2387 if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) { 2388 /* 2389 * Not cluster aligned. 2390 */ 2391 ASSERT(trunc_page(newoff) == newoff); 2392 i = atop_32(newoff); 2393 } else 2394 i = 0; 2395 while ((i < VSCLSIZE(vs)) && (length > 0)) { 2396 VSM_CLRPG(*vsmap, i); 2397 VSM_CLRALLOC(*vsmap, i); 2398 length -= vm_page_size; 2399 offset += vm_page_size; 2400 i++; 2401 } 2402 2403 /* 2404 * If map entry is empty, clear and deallocate cluster.
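 * ("Empty" here means VSM_BMAP() shows no backing pages left set.)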
2405 */ 2406 if (!VSM_BMAP(*vsmap)) { 2407 ps_vnode_trim_more(&trim_data, 2408 vsmap, 2409 vs->vs_clshift, 2410 VSCLSIZE(vs) * vm_page_size); 2411 ps_deallocate_cluster(VSM_PS(*vsmap), 2412 VSM_CLOFF(*vsmap)); 2413 VSM_CLR(*vsmap); 2414 } else { 2415 ps_vnode_trim_now(&trim_data); 2416 } 2417 } 2418 ps_vnode_trim_now(&trim_data); 2419 2420 VS_MAP_UNLOCK(vs); 2421} 2422 2423void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */ 2424 2425void 2426ps_vs_write_complete( 2427 vstruct_t vs, 2428 dp_offset_t offset, 2429 dp_size_t size, 2430 int error) 2431{ 2432 struct clmap clmap; 2433 2434 /* 2435 * Get the struct vsmap for this cluster. 2436 * Use READ, even though it was written, because the 2437 * cluster MUST be present, unless there was an error 2438 * in the original ps_clmap (e.g. no space), in which 2439 * case, nothing happens. 2440 * 2441 * Must pass enough information to ps_clmap to allow it 2442 * to set the vs_map structure bitmap under lock. 2443 */ 2444 (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error); 2445} 2446 2447void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */ 2448 2449void 2450vs_cl_write_complete( 2451 vstruct_t vs, 2452 __unused paging_segment_t ps, 2453 dp_offset_t offset, 2454 __unused vm_offset_t addr, 2455 dp_size_t size, 2456 boolean_t async, 2457 int error) 2458{ 2459// kern_return_t kr; 2460 2461 if (error) { 2462 /* 2463 * For internal objects, the error is recorded on a 2464 * per-cluster basis by ps_clmap() which is called 2465 * by ps_vs_write_complete() below. 2466 */ 2467 dprintf(("write failed error = 0x%x\n", error)); 2468 /* add upl_abort code here */ 2469 } else 2470 GSTAT(global_stats.gs_pages_out += atop_32(size)); 2471 /* 2472 * Notify the vstruct mapping code, so it can do its accounting. 2473 */ 2474 ps_vs_write_complete(vs, offset, size, error); 2475 2476 if (async) { 2477 VS_LOCK(vs); 2478 ASSERT(vs->vs_async_pending > 0); 2479 vs->vs_async_pending -= size; 2480 if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { 2481 vs->vs_waiting_async = FALSE; 2482 VS_UNLOCK(vs); 2483 thread_wakeup(&vs->vs_async_pending); 2484 } else { 2485 VS_UNLOCK(vs); 2486 } 2487 } 2488} 2489 2490#ifdef DEVICE_PAGING 2491kern_return_t device_write_reply(MACH_PORT_FACE, kern_return_t, io_buf_len_t); 2492 2493kern_return_t 2494device_write_reply( 2495 MACH_PORT_FACE reply_port, 2496 kern_return_t device_code, 2497 io_buf_len_t bytes_written) 2498{ 2499 struct vs_async *vsa; 2500 2501 vsa = (struct vs_async *) 2502 ((struct vstruct_alias *)(reply_port->ip_alias))->vs; 2503 2504 if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) { 2505 device_code = KERN_FAILURE; 2506 } 2507 2508 vsa->vsa_error = device_code; 2509 2510 2511 ASSERT(vsa->vsa_vs != VSTRUCT_NULL); 2512 if(vsa->vsa_flags & VSA_TRANSFER) { 2513 /* revisit when async disk segments redone */ 2514 if(vsa->vsa_error) { 2515 /* need to consider error condition. re-write data or */ 2516 /* throw it away here. 
*/ 2517 vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr); 2518 } 2519 ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset, 2520 vsa->vsa_size, vsa->vsa_error); 2521 } else { 2522 vs_cl_write_complete(vsa->vsa_vs, vsa->vsa_ps, vsa->vsa_offset, 2523 vsa->vsa_addr, vsa->vsa_size, TRUE, 2524 vsa->vsa_error); 2525 } 2526 VS_FREE_ASYNC(vsa); 2527 2528 return KERN_SUCCESS; 2529} 2530 2531kern_return_t device_write_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_len_t); 2532kern_return_t 2533device_write_reply_inband( 2534 MACH_PORT_FACE reply_port, 2535 kern_return_t return_code, 2536 io_buf_len_t bytes_written) 2537{ 2538 panic("device_write_reply_inband: illegal"); 2539 return KERN_SUCCESS; 2540} 2541 2542kern_return_t device_read_reply(MACH_PORT_FACE, kern_return_t, io_buf_ptr_t, mach_msg_type_number_t); 2543kern_return_t 2544device_read_reply( 2545 MACH_PORT_FACE reply_port, 2546 kern_return_t return_code, 2547 io_buf_ptr_t data, 2548 mach_msg_type_number_t dataCnt) 2549{ 2550 struct vs_async *vsa; 2551 vsa = (struct vs_async *) 2552 ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs; 2553 vsa->vsa_addr = (vm_offset_t)data; 2554 vsa->vsa_size = (vm_size_t)dataCnt; 2555 vsa->vsa_error = return_code; 2556 thread_wakeup(&vsa); 2557 return KERN_SUCCESS; 2558} 2559 2560kern_return_t device_read_reply_inband(MACH_PORT_FACE, kern_return_t, io_buf_ptr_inband_t, mach_msg_type_number_t); 2561kern_return_t 2562device_read_reply_inband( 2563 MACH_PORT_FACE reply_port, 2564 kern_return_t return_code, 2565 io_buf_ptr_inband_t data, 2566 mach_msg_type_number_t dataCnt) 2567{ 2568 panic("device_read_reply_inband: illegal"); 2569 return KERN_SUCCESS; 2570} 2571 2572kern_return_t device_read_reply_overwrite(MACH_PORT_FACE, kern_return_t, io_buf_len_t); 2573kern_return_t 2574device_read_reply_overwrite( 2575 MACH_PORT_FACE reply_port, 2576 kern_return_t return_code, 2577 io_buf_len_t bytes_read) 2578{ 2579 panic("device_read_reply_overwrite: illegal\n"); 2580 return KERN_SUCCESS; 2581} 2582 2583kern_return_t device_open_reply(MACH_PORT_FACE, kern_return_t, MACH_PORT_FACE); 2584kern_return_t 2585device_open_reply( 2586 MACH_PORT_FACE reply_port, 2587 kern_return_t return_code, 2588 MACH_PORT_FACE device_port) 2589{ 2590 panic("device_open_reply: illegal\n"); 2591 return KERN_SUCCESS; 2592} 2593 2594kern_return_t 2595ps_read_device( 2596 paging_segment_t ps, 2597 dp_offset_t offset, 2598 vm_offset_t *bufferp, 2599 unsigned int size, 2600 unsigned int *residualp, 2601 int flags) 2602{ 2603 kern_return_t kr; 2604 recnum_t dev_offset; 2605 unsigned int bytes_wanted; 2606 unsigned int bytes_read; 2607 unsigned int total_read; 2608 vm_offset_t dev_buffer; 2609 vm_offset_t buf_ptr; 2610 unsigned int records_read; 2611 struct vs_async *vsa; 2612 2613 device_t device; 2614 vm_map_copy_t device_data = NULL; 2615 default_pager_thread_t *dpt = NULL; 2616 2617 device = dev_port_lookup(ps->ps_device); 2618 clustered_reads[atop_32(size)]++; 2619 2620 dev_offset = (ps->ps_offset + 2621 (offset >> (vm_page_shift - ps->ps_record_shift))); 2622 bytes_wanted = size; 2623 total_read = 0; 2624 *bufferp = (vm_offset_t)NULL; 2625 2626 do { 2627 vsa = VS_ALLOC_ASYNC(); 2628 if (vsa) { 2629 vsa->vsa_vs = NULL; 2630 vsa->vsa_addr = 0; 2631 vsa->vsa_offset = 0; 2632 vsa->vsa_size = 0; 2633 vsa->vsa_ps = NULL; 2634 } 2635 ip_lock(vsa->reply_port); 2636 vsa->reply_port->ip_sorights++; 2637 ip_reference(vsa->reply_port); 2638 ip_unlock(vsa->reply_port); 2639 kr = ds_device_read_common(device, 2640 vsa->reply_port, 2641 
(mach_msg_type_name_t) 2642 MACH_MSG_TYPE_MOVE_SEND_ONCE, 2643 (dev_mode_t) 0, 2644 dev_offset, 2645 bytes_wanted, 2646 (IO_READ | IO_CALL), 2647 (io_buf_ptr_t *) &dev_buffer, 2648 (mach_msg_type_number_t *) &bytes_read); 2649 if(kr == MIG_NO_REPLY) { 2650 assert_wait(&vsa, THREAD_UNINT); 2651 thread_block(THREAD_CONTINUE_NULL); 2652 2653 dev_buffer = vsa->vsa_addr; 2654 bytes_read = (unsigned int)vsa->vsa_size; 2655 kr = vsa->vsa_error; 2656 } 2657 VS_FREE_ASYNC(vsa); 2658 if (kr != KERN_SUCCESS || bytes_read == 0) { 2659 break; 2660 } 2661 total_read += bytes_read; 2662 2663 /* 2664 * If we got the entire range, use the returned dev_buffer. 2665 */ 2666 if (bytes_read == size) { 2667 *bufferp = (vm_offset_t)dev_buffer; 2668 break; 2669 } 2670 2671#if 1 2672 dprintf(("read only %d bytes out of %d\n", 2673 bytes_read, bytes_wanted)); 2674#endif 2675 if(dpt == NULL) { 2676 dpt = get_read_buffer(); 2677 buf_ptr = dpt->dpt_buffer; 2678 *bufferp = (vm_offset_t)buf_ptr; 2679 } 2680 /* 2681 * Otherwise, copy the data into the provided buffer (*bufferp) 2682 * and append the rest of the range as it comes in. 2683 */ 2684 memcpy((void *) buf_ptr, (void *) dev_buffer, bytes_read); 2685 buf_ptr += bytes_read; 2686 bytes_wanted -= bytes_read; 2687 records_read = (bytes_read >> 2688 (vm_page_shift - ps->ps_record_shift)); 2689 dev_offset += records_read; 2690 DP_DEBUG(DEBUG_VS_INTERNAL, 2691 ("calling vm_deallocate(addr=0x%X,size=0x%X)\n", 2692 dev_buffer, bytes_read)); 2693 if (vm_deallocate(kernel_map, dev_buffer, bytes_read) 2694 != KERN_SUCCESS) 2695 Panic("dealloc buf"); 2696 } while (bytes_wanted); 2697 2698 *residualp = size - total_read; 2699 if((dev_buffer != *bufferp) && (total_read != 0)) { 2700 vm_offset_t temp_buffer; 2701 vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE); 2702 memcpy((void *) temp_buffer, (void *) *bufferp, total_read); 2703 if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read, 2704 VM_MAP_COPYIN_OPT_SRC_DESTROY | 2705 VM_MAP_COPYIN_OPT_STEAL_PAGES | 2706 VM_MAP_COPYIN_OPT_PMAP_ENTER, 2707 (vm_map_copy_t *)&device_data, FALSE)) 2708 panic("ps_read_device: cannot copyin locally provided buffer\n"); 2709 } 2710 else if((kr == KERN_SUCCESS) && (total_read != 0) && (dev_buffer != 0)){ 2711 if(vm_map_copyin_page_list(kernel_map, dev_buffer, bytes_read, 2712 VM_MAP_COPYIN_OPT_SRC_DESTROY | 2713 VM_MAP_COPYIN_OPT_STEAL_PAGES | 2714 VM_MAP_COPYIN_OPT_PMAP_ENTER, 2715 (vm_map_copy_t *)&device_data, FALSE)) 2716 panic("ps_read_device: cannot copyin backing store provided buffer\n"); 2717 } 2718 else { 2719 device_data = NULL; 2720 } 2721 *bufferp = (vm_offset_t)device_data; 2722 2723 if(dpt != NULL) { 2724 /* Free the receive buffer */ 2725 dpt->checked_out = 0; 2726 thread_wakeup(&dpt_array); 2727 } 2728 return KERN_SUCCESS; 2729} 2730 2731kern_return_t 2732ps_write_device( 2733 paging_segment_t ps, 2734 dp_offset_t offset, 2735 vm_offset_t addr, 2736 unsigned int size, 2737 struct vs_async *vsa) 2738{ 2739 recnum_t dev_offset; 2740 io_buf_len_t bytes_to_write, bytes_written; 2741 recnum_t records_written; 2742 kern_return_t kr; 2743 MACH_PORT_FACE reply_port; 2744 2745 2746 2747 clustered_writes[atop_32(size)]++; 2748 2749 dev_offset = (ps->ps_offset + 2750 (offset >> (vm_page_shift - ps->ps_record_shift))); 2751 bytes_to_write = size; 2752 2753 if (vsa) { 2754 /* 2755 * Asynchronous write. 
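 * The device's reply comes back through device_write_reply(), which
 * finishes the accounting and frees the vs_async structure.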
2756 */ 2757 reply_port = vsa->reply_port; 2758 ip_lock(reply_port); 2759 reply_port->ip_sorights++; 2760 ip_reference(reply_port); 2761 ip_unlock(reply_port); 2762 { 2763 device_t device; 2764 device = dev_port_lookup(ps->ps_device); 2765 2766 vsa->vsa_addr = addr; 2767 kr=ds_device_write_common(device, 2768 reply_port, 2769 (mach_msg_type_name_t) MACH_MSG_TYPE_MOVE_SEND_ONCE, 2770 (dev_mode_t) 0, 2771 dev_offset, 2772 (io_buf_ptr_t) addr, 2773 size, 2774 (IO_WRITE | IO_CALL), 2775 &bytes_written); 2776 } 2777 if ((kr != KERN_SUCCESS) && (kr != MIG_NO_REPLY)) { 2778 if (verbose) 2779 dprintf(("%s0x%x, addr=0x%x," 2780 "size=0x%x,offset=0x%x\n", 2781 "device_write_request returned ", 2782 kr, addr, size, offset)); 2783 BS_STAT(ps->ps_bs, 2784 ps->ps_bs->bs_pages_out_fail += atop_32(size)); 2785 /* do the completion notification to free resources */ 2786 device_write_reply(reply_port, kr, 0); 2787 return PAGER_ERROR; 2788 } 2789 } else do { 2790 /* 2791 * Synchronous write. 2792 */ 2793 { 2794 device_t device; 2795 device = dev_port_lookup(ps->ps_device); 2796 kr=ds_device_write_common(device, 2797 IP_NULL, 0, 2798 (dev_mode_t) 0, 2799 dev_offset, 2800 (io_buf_ptr_t) addr, 2801 size, 2802 (IO_WRITE | IO_SYNC | IO_KERNEL_BUF), 2803 &bytes_written); 2804 } 2805 if (kr != KERN_SUCCESS) { 2806 dprintf(("%s0x%x, addr=0x%x,size=0x%x,offset=0x%x\n", 2807 "device_write returned ", 2808 kr, addr, size, offset)); 2809 BS_STAT(ps->ps_bs, 2810 ps->ps_bs->bs_pages_out_fail += atop_32(size)); 2811 return PAGER_ERROR; 2812 } 2813 if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1)) 2814 Panic("fragmented write"); 2815 records_written = (bytes_written >> 2816 (vm_page_shift - ps->ps_record_shift)); 2817 dev_offset += records_written; 2818#if 1 2819 if (bytes_written != bytes_to_write) { 2820 dprintf(("wrote only %d bytes out of %d\n", 2821 bytes_written, bytes_to_write)); 2822 } 2823#endif 2824 bytes_to_write -= bytes_written; 2825 addr += bytes_written; 2826 } while (bytes_to_write > 0); 2827 2828 return PAGER_SUCCESS; 2829} 2830 2831 2832#else /* !DEVICE_PAGING */ 2833 2834kern_return_t 2835ps_read_device( 2836 __unused paging_segment_t ps, 2837 __unused dp_offset_t offset, 2838 __unused vm_offset_t *bufferp, 2839 __unused unsigned int size, 2840 __unused unsigned int *residualp, 2841 __unused int flags) 2842{ 2843 panic("ps_read_device not supported"); 2844 return KERN_FAILURE; 2845} 2846 2847kern_return_t 2848ps_write_device( 2849 __unused paging_segment_t ps, 2850 __unused dp_offset_t offset, 2851 __unused vm_offset_t addr, 2852 __unused unsigned int size, 2853 __unused struct vs_async *vsa) 2854{ 2855 panic("ps_write_device not supported"); 2856 return KERN_FAILURE; 2857} 2858 2859#endif /* DEVICE_PAGING */ 2860void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */ 2861 2862void 2863pvs_object_data_provided( 2864 __unused vstruct_t vs, 2865 __unused upl_t upl, 2866 __unused upl_offset_t offset, 2867 upl_size_t size) 2868{ 2869#if RECLAIM_SWAP 2870 boolean_t empty; 2871#endif 2872 2873 DP_DEBUG(DEBUG_VS_INTERNAL, 2874 ("buffer=0x%x,offset=0x%x,size=0x%x\n", 2875 upl, offset, size)); 2876 2877 ASSERT(size > 0); 2878 GSTAT(global_stats.gs_pages_in += atop_32(size)); 2879 2880/* check upl iosync flag instead of using RECLAIM_SWAP*/ 2881#if RECLAIM_SWAP 2882 if (size != upl->size) { 2883 if (size) { 2884 ps_clunmap(vs, offset, size); 2885 upl_commit_range(upl, 0, size, 0, NULL, 0, &empty); 2886 } 2887 upl_abort(upl, UPL_ABORT_ERROR); 2888 upl_deallocate(upl); 
2889 } else { 2890 ps_clunmap(vs, offset, size); 2891 upl_commit(upl, NULL, 0); 2892 upl_deallocate(upl); 2893 } 2894#endif /* RECLAIM_SWAP */ 2895 2896} 2897 2898static memory_object_offset_t last_start; 2899static vm_size_t last_length; 2900 2901/* 2902 * A "cnt" of 0 means that the caller just wants to check if the page at 2903 * offset "vs_offset" exists in the backing store. That page hasn't been 2904 * prepared, so no need to release it. 2905 * 2906 * A "cnt" of -1 means that the caller wants to bring back from the backing 2907 * store all existing pages in the cluster containing "vs_offset". 2908 */ 2909kern_return_t 2910pvs_cluster_read( 2911 vstruct_t vs, 2912 dp_offset_t vs_offset, 2913 dp_size_t cnt, 2914 void *fault_info) 2915{ 2916 kern_return_t error = KERN_SUCCESS; 2917 unsigned int size; 2918 unsigned int residual; 2919 unsigned int request_flags; 2920 int io_flags = 0; 2921 int seg_index; 2922 int pages_in_cl; 2923 int cl_size; 2924 int cl_mask; 2925 int cl_index; 2926 unsigned int xfer_size; 2927 dp_offset_t orig_vs_offset; 2928 dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; 2929 paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; 2930 struct clmap clmap; 2931 upl_t upl; 2932 unsigned int page_list_count; 2933 memory_object_offset_t cluster_start; 2934 vm_size_t cluster_length; 2935 uint32_t io_streaming; 2936 int i; 2937 boolean_t io_sync = FALSE; 2938 boolean_t reclaim_all = FALSE; 2939 2940 pages_in_cl = 1 << vs->vs_clshift; 2941 cl_size = pages_in_cl * vm_page_size; 2942 cl_mask = cl_size - 1; 2943 2944 request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; 2945 2946 if (cnt == (dp_size_t) -1) 2947 reclaim_all = TRUE; 2948 2949 if (reclaim_all == TRUE) { 2950 /* 2951 * We've been called from ps_vstruct_reclaim() to move all 2952 * the object's swapped pages back to VM pages. 2953 * This can put memory pressure on the system, so we do want 2954 * to wait for free pages, to avoid getting in the way of the 2955 * vm_pageout_scan() thread. 2956 * Let's not use UPL_NOBLOCK in this case. 2957 */ 2958 vs_offset &= ~cl_mask; 2959 i = pages_in_cl; 2960 } else { 2961 i = 1; 2962 2963 /* 2964 * if the I/O cluster size == PAGE_SIZE, we don't want to set 2965 * the UPL_NOBLOCK since we may be trying to recover from a 2966 * previous partial pagein I/O that occurred because we were low 2967 * on memory and bailed early in order to honor the UPL_NOBLOCK... 2968 * since we're only asking for a single page, we can block w/o fear 2969 * of tying up pages while waiting for more to become available 2970 */ 2971 if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE) 2972 request_flags |= UPL_NOBLOCK; 2973 } 2974 2975again: 2976 cl_index = (vs_offset & cl_mask) / vm_page_size; 2977 2978 if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) || 2979 !CLMAP_ISSET(clmap, cl_index)) { 2980 /* 2981 * the needed page doesn't exist in the backing store... 2982 * we don't want to try to do any I/O, just abort the 2983 * page and let the fault handler provide a zero-fill 2984 */ 2985 if (cnt == 0) { 2986 /* 2987 * The caller was just poking at us to see if 2988 * the page has been paged out. No need to 2989 * mess with the page at all. 2990 * Just let the caller know we don't have that page. 
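 * (KERN_FAILURE below is that answer: not present in backing store.)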
2991 */ 2992 return KERN_FAILURE; 2993 } 2994 if (reclaim_all == TRUE) { 2995 i--; 2996 if (i == 0) { 2997 /* no more pages in this cluster */ 2998 return KERN_FAILURE; 2999 } 3000 /* try the next page in this cluster */ 3001 vs_offset += vm_page_size; 3002 goto again; 3003 } 3004 3005 page_list_count = 0; 3006 3007 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, 3008 PAGE_SIZE, PAGE_SIZE, 3009 &upl, NULL, &page_list_count, 3010 request_flags | UPL_SET_INTERNAL); 3011 upl_range_needed(upl, 0, 1); 3012 3013 if (clmap.cl_error) 3014 upl_abort(upl, UPL_ABORT_ERROR); 3015 else 3016 upl_abort(upl, UPL_ABORT_UNAVAILABLE); 3017 upl_deallocate(upl); 3018 3019 return KERN_SUCCESS; 3020 } 3021 3022 if (cnt == 0) { 3023 /* 3024 * The caller was just poking at us to see if 3025 * the page has been paged out. No need to 3026 * mess with the page at all. 3027 * Just let the caller know we do have that page. 3028 */ 3029 return KERN_SUCCESS; 3030 } 3031 3032 if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) { 3033 io_sync = TRUE; 3034 } else { 3035#if RECLAIM_SWAP 3036 io_sync = TRUE; 3037#endif /* RECLAIM_SWAP */ 3038 } 3039 3040 if( io_sync == TRUE ) { 3041 3042 io_flags |= UPL_IOSYNC | UPL_NOCOMMIT; 3043#if USE_PRECIOUS 3044 request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; 3045#else /* USE_PRECIOUS */ 3046 request_flags |= UPL_REQUEST_SET_DIRTY; 3047#endif /* USE_PRECIOUS */ 3048 } 3049 3050 assert(dp_encryption_inited); 3051 if (dp_encryption) { 3052 /* 3053 * ENCRYPTED SWAP: 3054 * request that the UPL be prepared for 3055 * decryption. 3056 */ 3057 request_flags |= UPL_ENCRYPT; 3058 io_flags |= UPL_PAGING_ENCRYPTED; 3059 } 3060 orig_vs_offset = vs_offset; 3061 3062 assert(cnt != 0); 3063 cnt = VM_SUPER_CLUSTER; 3064 cluster_start = (memory_object_offset_t) vs_offset; 3065 cluster_length = (vm_size_t) cnt; 3066 io_streaming = 0; 3067 3068 /* 3069 * determine how big a speculative I/O we should try for... 3070 */ 3071 if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) { 3072 assert(vs_offset >= (dp_offset_t) cluster_start && 3073 vs_offset < (dp_offset_t) (cluster_start + cluster_length)); 3074 vs_offset = (dp_offset_t) cluster_start; 3075 cnt = (dp_size_t) cluster_length; 3076 } else { 3077 cluster_length = PAGE_SIZE; 3078 cnt = PAGE_SIZE; 3079 } 3080 3081 if (io_streaming) 3082 io_flags |= UPL_IOSTREAMING; 3083 3084 last_start = cluster_start; 3085 last_length = cluster_length; 3086 3087 /* 3088 * This loop will be executed multiple times until the entire 3089 * range has been looked at or we issue an I/O... if the request spans cluster 3090 * boundaries, the clusters will be checked for logical continunity, 3091 * if contiguous the I/O request will span multiple clusters... 3092 * at most only 1 I/O will be issued... 
it will encompass the original offset 3093 */ 3094 while (cnt && error == KERN_SUCCESS) { 3095 int ps_info_valid; 3096 3097 if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) { 3098 size = VM_SUPER_CLUSTER; 3099 size -= vs_offset & cl_mask; 3100 } else if (cnt > VM_SUPER_CLUSTER) 3101 size = VM_SUPER_CLUSTER; 3102 else 3103 size = cnt; 3104 3105 cnt -= size; 3106 3107 ps_info_valid = 0; 3108 seg_index = 0; 3109 3110 while (size > 0 && error == KERN_SUCCESS) { 3111 unsigned int abort_size; 3112 unsigned int lsize; 3113 int failed_size; 3114 int beg_pseg; 3115 int beg_indx; 3116 dp_offset_t cur_offset; 3117 3118 if ( !ps_info_valid) { 3119 ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); 3120 psp[seg_index] = CLMAP_PS(clmap); 3121 ps_info_valid = 1; 3122 } 3123 /* 3124 * skip over unallocated physical segments 3125 */ 3126 if (ps_offset[seg_index] == (dp_offset_t) -1) { 3127 abort_size = cl_size - (vs_offset & cl_mask); 3128 abort_size = MIN(abort_size, size); 3129 3130 size -= abort_size; 3131 vs_offset += abort_size; 3132 3133 seg_index++; 3134 ps_info_valid = 0; 3135 3136 continue; 3137 } 3138 cl_index = (vs_offset & cl_mask) / vm_page_size; 3139 3140 for (abort_size = 0; cl_index < pages_in_cl && abort_size < size; cl_index++) { 3141 /* 3142 * skip over unallocated pages 3143 */ 3144 if (CLMAP_ISSET(clmap, cl_index)) 3145 break; 3146 abort_size += vm_page_size; 3147 } 3148 if (abort_size) { 3149 size -= abort_size; 3150 vs_offset += abort_size; 3151 3152 if (cl_index == pages_in_cl) { 3153 /* 3154 * if we're at the end of this physical cluster 3155 * then bump to the next one and continue looking 3156 */ 3157 seg_index++; 3158 ps_info_valid = 0; 3159 3160 continue; 3161 } 3162 if (size == 0) 3163 break; 3164 } 3165 /* 3166 * remember the starting point of the first allocated page 3167 * for the I/O we're about to issue 3168 */ 3169 beg_pseg = seg_index; 3170 beg_indx = cl_index; 3171 cur_offset = vs_offset; 3172 3173 /* 3174 * calculate the size of the I/O that we can do... 3175 * this may span multiple physical segments if 3176 * they are contiguous 3177 */ 3178 for (xfer_size = 0; xfer_size < size; ) { 3179 3180 while (cl_index < pages_in_cl && xfer_size < size) { 3181 /* 3182 * accumulate allocated pages within 3183 * a physical segment 3184 */ 3185 if (CLMAP_ISSET(clmap, cl_index)) { 3186 xfer_size += vm_page_size; 3187 cur_offset += vm_page_size; 3188 cl_index++; 3189 3190 BS_STAT(psp[seg_index]->ps_bs, 3191 psp[seg_index]->ps_bs->bs_pages_in++); 3192 } else 3193 break; 3194 } 3195 if (cl_index < pages_in_cl || xfer_size >= size) { 3196 /* 3197 * we've hit an unallocated page or 3198 * the end of this request... see if 3199 * it's time to fire the I/O 3200 */ 3201 break; 3202 } 3203 /* 3204 * we've hit the end of the current physical 3205 * segment and there's more to do, so try 3206 * moving to the next one 3207 */ 3208 seg_index++; 3209 3210 ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); 3211 psp[seg_index] = CLMAP_PS(clmap); 3212 ps_info_valid = 1; 3213 3214 if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) { 3215 /* 3216 * if the physical segment we're about 3217 * to step into is not contiguous to 3218 * the one we're currently in, or it's 3219 * in a different paging file, or 3220 * it hasn't been allocated.... 
* we stop this run and go check 3222 * to see if it's time to fire the I/O 3223 */ 3224 break; 3225 } 3226 /* 3227 * start with first page of the next physical 3228 * segment 3229 */ 3230 cl_index = 0; 3231 } 3232 if (xfer_size == 0) { 3233 /* 3234 * no I/O to generate for this segment 3235 */ 3236 continue; 3237 } 3238 if (cur_offset <= orig_vs_offset) { 3239 /* 3240 * we've hit a hole in our speculative cluster 3241 * before the offset that we're really after... 3242 * don't issue the I/O since it doesn't encompass 3243 * the original offset and we're looking to only 3244 * pull in the speculative pages if they can be 3245 * made part of a single I/O 3246 */ 3247 size -= xfer_size; 3248 vs_offset += xfer_size; 3249 3250 continue; 3251 } 3252 /* 3253 * we have a contiguous range of allocated pages 3254 * to read from that encompasses the original offset 3255 */ 3256 page_list_count = 0; 3257 memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, 3258 xfer_size, xfer_size, 3259 &upl, NULL, &page_list_count, 3260 request_flags | UPL_SET_INTERNAL); 3261 3262 error = ps_read_file(psp[beg_pseg], 3263 upl, (upl_offset_t) 0, 3264 ps_offset[beg_pseg] + (beg_indx * vm_page_size), 3265 xfer_size, &residual, io_flags); 3266 3267 3268 /* 3269 * Adjust counts and send response to VM. Optimize 3270 * for the common case, i.e. no error and/or partial 3271 * data. If there was an error, then we need to error 3272 * the entire range, even if some data was successfully 3273 * read. If there was a partial read we may supply some 3274 * data and may error some as well. In all cases the 3275 * VM must receive some notification for every page 3276 * in the range. 3277 */ 3278 if ((error == KERN_SUCCESS) && (residual == 0)) { 3279 /* 3280 * Got everything we asked for, supply the data 3281 * to the VM. Note that as a side effect of 3282 * supplying the data, the buffer holding the 3283 * supplied data is deallocated from the pager's 3284 * address space. 3285 */ 3286 lsize = xfer_size; 3287 failed_size = 0; 3288 } else { 3289 lsize = 0; 3290 failed_size = xfer_size; 3291 3292 if (error == KERN_SUCCESS) { 3293 if (residual == xfer_size) { 3294 /* 3295 * If a read operation returns no error 3296 * and no data moved, we turn it into 3297 * an error, assuming we're reading at 3298 * or beyond EOF. 3299 * Fall through and error the entire range. 3300 */ 3301 error = KERN_FAILURE; 3302 } else { 3303 /* 3304 * Otherwise, we have a partial read. If 3305 * the part read is an integral number 3306 * of pages, supply it. Otherwise round 3307 * it up to a page boundary, zero fill 3308 * the unread part, and supply it. 3309 * Fall through and error the remainder 3310 * of the range, if any. 3311 */ 3312 int fill; 3313 3314 fill = residual & (vm_page_size - 1); 3315 lsize = (xfer_size - residual) + fill; 3316 3317 if (lsize < xfer_size) 3318 failed_size = xfer_size - lsize; 3319 3320 if (reclaim_all == FALSE) 3321 error = KERN_FAILURE; 3322 } 3323 } 3324 } 3325 pvs_object_data_provided(vs, upl, vs_offset, lsize); 3326 3327 if (failed_size) { 3328 /* 3329 * There was an error in some part of the range, tell 3330 * the VM. Note that error is explicitly checked again 3331 * since it can be modified above.
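 * Only the bs_pages_in_fail statistic is updated below; the
 * page-level notification happened in pvs_object_data_provided()
 * just above.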
3332 */ 3333 BS_STAT(psp[beg_pseg]->ps_bs, 3334 psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size)); 3335 } 3336 /* 3337 * we've issued a single I/O that encompassed the original offset 3338 * at this point we either met our speculative request length or 3339 * we ran into a 'hole' (i.e. page not present in the cluster, cluster 3340 * not present or not physically contiguous to the previous one), so 3341 * we're done issuing I/O at this point 3342 */ 3343 return (error); 3344 } 3345 } 3346 return error; 3347} 3348 3349int vs_do_async_write = 1; 3350 3351kern_return_t 3352vs_cluster_write( 3353 vstruct_t vs, 3354 upl_t internal_upl, 3355 upl_offset_t offset, 3356 upl_size_t cnt, 3357 boolean_t dp_internal, 3358 int flags) 3359{ 3360 upl_size_t transfer_size; 3361 int error = 0; 3362 struct clmap clmap; 3363 3364 dp_offset_t actual_offset; /* Offset within paging segment */ 3365 paging_segment_t ps; 3366 dp_offset_t mobj_base_addr; 3367 dp_offset_t mobj_target_addr; 3368 3369 upl_t upl; 3370 upl_page_info_t *pl; 3371 int page_index; 3372 unsigned int page_max_index; 3373 int list_size; 3374 int pages_in_cl; 3375 unsigned int cl_size; 3376 int base_index; 3377 unsigned int seg_size; 3378 unsigned int upl_offset_in_object; 3379 boolean_t minimal_clustering = FALSE; 3380 boolean_t found_dirty; 3381 3382 if (!dp_encryption_inited) { 3383 /* 3384 * ENCRYPTED SWAP: 3385 * Once we've started using swap, we 3386 * can't change our mind on whether 3387 * it needs to be encrypted or 3388 * not. 3389 */ 3390 dp_encryption_inited = TRUE; 3391 } 3392 if (dp_encryption) { 3393 /* 3394 * ENCRYPTED SWAP: 3395 * the UPL will need to be encrypted... 3396 */ 3397 flags |= UPL_PAGING_ENCRYPTED; 3398 } 3399 3400 pages_in_cl = 1 << vs->vs_clshift; 3401 cl_size = pages_in_cl * vm_page_size; 3402 3403#if CONFIG_FREEZE 3404 minimal_clustering = TRUE; 3405#else 3406 if (dp_isssd == TRUE) 3407 minimal_clustering = TRUE; 3408#endif 3409 if (!dp_internal) { 3410 unsigned int page_list_count; 3411 int request_flags; 3412 unsigned int super_size; 3413 int first_dirty; 3414 int num_dirty; 3415 int num_of_pages; 3416 int seg_index; 3417 upl_offset_t upl_offset; 3418 upl_offset_t upl_offset_aligned; 3419 dp_offset_t seg_offset; 3420 dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; 3421 paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; 3422 3423 3424 if (bs_low) 3425 super_size = cl_size; 3426 else 3427 super_size = VM_SUPER_CLUSTER; 3428 3429 request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE | 3430 UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | 3431 UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE; 3432 3433 if (dp_encryption) { 3434 /* 3435 * ENCRYPTED SWAP: 3436 * request that the UPL be prepared for 3437 * encryption. 3438 */ 3439 request_flags |= UPL_ENCRYPT; 3440 flags |= UPL_PAGING_ENCRYPTED; 3441 } 3442 3443 page_list_count = 0; 3444 memory_object_super_upl_request(vs->vs_control, 3445 (memory_object_offset_t)offset, 3446 cnt, super_size, 3447 &upl, NULL, &page_list_count, 3448 request_flags | UPL_FOR_PAGEOUT); 3449 3450 /* 3451 * The default pager does not handle objects larger than 3452 * 4GB, so it does not deal with offset that don't fit in 3453 * 32-bit. Cast down upl->offset now and make sure we 3454 * did not lose any valuable bits. 
3455 */ 3456 upl_offset_in_object = (unsigned int) upl->offset; 3457 assert(upl->offset == upl_offset_in_object); 3458 3459 pl = UPL_GET_INTERNAL_PAGE_LIST(upl); 3460 3461 seg_size = cl_size - (upl_offset_in_object % cl_size); 3462 upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1); 3463 page_index = 0; 3464 page_max_index = upl->size / PAGE_SIZE; 3465 found_dirty = TRUE; 3466 3467 for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) { 3468 3469 unsigned int seg_pgcnt; 3470 3471 seg_pgcnt = seg_size / PAGE_SIZE; 3472 3473 if (minimal_clustering == TRUE) { 3474 unsigned int non_dirty; 3475 3476 non_dirty = 0; 3477 found_dirty = FALSE; 3478 3479 for (; non_dirty < seg_pgcnt; non_dirty++) { 3480 if ((page_index + non_dirty) >= page_max_index) 3481 break; 3482 3483 if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) || 3484 UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) { 3485 found_dirty = TRUE; 3486 break; 3487 } 3488 } 3489 } 3490 if (found_dirty == TRUE) { 3491 ps_offset[seg_index] = 3492 ps_clmap(vs, 3493 upl_offset_aligned, 3494 &clmap, CL_ALLOC, 3495 cl_size, 0); 3496 3497 if (ps_offset[seg_index] == (dp_offset_t) -1) { 3498 upl_abort(upl, 0); 3499 upl_deallocate(upl); 3500 3501 return KERN_FAILURE; 3502 } 3503 psp[seg_index] = CLMAP_PS(clmap); 3504 } 3505 if (transfer_size > seg_size) { 3506 page_index += seg_pgcnt; 3507 transfer_size -= seg_size; 3508 upl_offset_aligned += cl_size; 3509 seg_size = cl_size; 3510 seg_index++; 3511 } else 3512 transfer_size = 0; 3513 } 3514 /* 3515 * Ignore any non-present pages at the end of the 3516 * UPL. 3517 */ 3518 for (page_index = upl->size / vm_page_size; page_index > 0;) { 3519 if (UPL_PAGE_PRESENT(pl, --page_index)) { 3520 page_index++; 3521 break; 3522 } 3523 } 3524 if (page_index == 0) { 3525 /* 3526 * no pages in the UPL 3527 * abort and return 3528 */ 3529 upl_abort(upl, 0); 3530 upl_deallocate(upl); 3531 3532 return KERN_SUCCESS; 3533 } 3534 num_of_pages = page_index; 3535 3536 base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE; 3537 3538 for (page_index = 0; page_index < num_of_pages; ) { 3539 /* 3540 * skip over non-dirty pages 3541 */ 3542 for ( ; page_index < num_of_pages; page_index++) { 3543 if (UPL_DIRTY_PAGE(pl, page_index) 3544 || UPL_PRECIOUS_PAGE(pl, page_index)) 3545 /* 3546 * this is a page we need to write 3547 * go see if we can buddy it up with 3548 * others that are contiguous to it 3549 */ 3550 break; 3551 /* 3552 * if the page is not-dirty, but present we 3553 * need to commit it... This is an unusual 3554 * case since we only asked for dirty pages 3555 */ 3556 if (UPL_PAGE_PRESENT(pl, page_index)) { 3557 boolean_t empty = FALSE; 3558 upl_commit_range(upl, 3559 page_index * vm_page_size, 3560 vm_page_size, 3561 UPL_COMMIT_NOTIFY_EMPTY, 3562 pl, 3563 page_list_count, 3564 &empty); 3565 if (empty) { 3566 assert(page_index == 3567 num_of_pages - 1); 3568 upl_deallocate(upl); 3569 } 3570 } 3571 } 3572 if (page_index == num_of_pages) 3573 /* 3574 * no more pages to look at, we're out of here 3575 */ 3576 break; 3577 3578 /* 3579 * gather up contiguous dirty pages... 
we have at 3580 * least 1 * otherwise we would have bailed above 3581 * make sure that each physical segment that we step 3582 * into is contiguous to the one we're currently in 3583 * if it's not, we have to stop and write what we have 3584 */ 3585 for (first_dirty = page_index; 3586 page_index < num_of_pages; ) { 3587 if ( !UPL_DIRTY_PAGE(pl, page_index) 3588 && !UPL_PRECIOUS_PAGE(pl, page_index)) 3589 break; 3590 page_index++; 3591 /* 3592 * if we just looked at the last page in the UPL 3593 * we don't need to check for physical segment 3594 * continuity 3595 */ 3596 if (page_index < num_of_pages) { 3597 int cur_seg; 3598 int nxt_seg; 3599 3600 cur_seg = (base_index + (page_index - 1))/pages_in_cl; 3601 nxt_seg = (base_index + page_index)/pages_in_cl; 3602 3603 if (cur_seg != nxt_seg) { 3604 if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg])) 3605 /* 3606 * if the segment we're about 3607 * to step into is not 3608 * contiguous to the one we're 3609 * currently in, or it's in a 3610 * different paging file.... 3611 * we stop here and generate 3612 * the I/O 3613 */ 3614 break; 3615 } 3616 } 3617 } 3618 num_dirty = page_index - first_dirty; 3619 3620 if (num_dirty) { 3621 upl_offset = first_dirty * vm_page_size; 3622 transfer_size = num_dirty * vm_page_size; 3623 3624 while (transfer_size) { 3625 3626 if ((seg_size = cl_size - 3627 ((upl_offset_in_object + 3628 upl_offset) % cl_size)) 3629 > transfer_size) 3630 seg_size = transfer_size; 3631 3632 ps_vs_write_complete( 3633 vs, 3634 (upl_offset_in_object + 3635 upl_offset), 3636 seg_size, error); 3637 3638 transfer_size -= seg_size; 3639 upl_offset += seg_size; 3640 } 3641 upl_offset = first_dirty * vm_page_size; 3642 transfer_size = num_dirty * vm_page_size; 3643 3644 seg_index = (base_index + first_dirty) / pages_in_cl; 3645 seg_offset = (upl_offset_in_object + upl_offset) % cl_size; 3646 3647 error = ps_write_file(psp[seg_index], 3648 upl, upl_offset, 3649 ps_offset[seg_index] 3650 + seg_offset, 3651 transfer_size, flags); 3652 } 3653 } 3654 3655 } else { 3656 assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift)); 3657 list_size = cnt; 3658 3659 page_index = 0; 3660 /* The caller provides a mapped_data which is derived */ 3661 /* from a temporary object. The targeted pages are */ 3662 /* guaranteed to be set at offset 0 in the mapped_data */ 3663 /* The actual offset however must still be derived */ 3664 /* from the offset in the vs in question */ 3665 mobj_base_addr = offset; 3666 mobj_target_addr = mobj_base_addr; 3667 3668 for (transfer_size = list_size; transfer_size != 0;) { 3669 actual_offset = ps_clmap(vs, mobj_target_addr, 3670 &clmap, CL_ALLOC, 3671 transfer_size < cl_size ? 
3672 transfer_size : cl_size, 0); 3673 if(actual_offset == (dp_offset_t) -1) { 3674 error = 1; 3675 break; 3676 } 3677 cnt = MIN(transfer_size, 3678 (unsigned) CLMAP_NPGS(clmap) * vm_page_size); 3679 ps = CLMAP_PS(clmap); 3680 /* Assume that the caller has given us contiguous */ 3681 /* pages */ 3682 if(cnt) { 3683 ps_vs_write_complete(vs, mobj_target_addr, 3684 cnt, error); 3685 error = ps_write_file(ps, internal_upl, 3686 0, actual_offset, 3687 cnt, flags); 3688 if (error) 3689 break; 3690 } 3691 if (error) 3692 break; 3693 actual_offset += cnt; 3694 mobj_target_addr += cnt; 3695 transfer_size -= cnt; 3696 cnt = 0; 3697 3698 if (error) 3699 break; 3700 } 3701 } 3702 if(error) 3703 return KERN_FAILURE; 3704 else 3705 return KERN_SUCCESS; 3706} 3707 3708vm_size_t 3709ps_vstruct_allocated_size( 3710 vstruct_t vs) 3711{ 3712 int num_pages; 3713 struct vs_map *vsmap; 3714 unsigned int i, j, k; 3715 3716 num_pages = 0; 3717 if (vs->vs_indirect) { 3718 /* loop on indirect maps */ 3719 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { 3720 vsmap = vs->vs_imap[i]; 3721 if (vsmap == NULL) 3722 continue; 3723 /* loop on clusters in this indirect map */ 3724 for (j = 0; j < CLMAP_ENTRIES; j++) { 3725 if (VSM_ISCLR(vsmap[j]) || 3726 VSM_ISERR(vsmap[j])) 3727 continue; 3728 /* loop on pages in this cluster */ 3729 for (k = 0; k < VSCLSIZE(vs); k++) { 3730 if ((VSM_BMAP(vsmap[j])) & (1 << k)) 3731 num_pages++; 3732 } 3733 } 3734 } 3735 } else { 3736 vsmap = vs->vs_dmap; 3737 if (vsmap == NULL) 3738 return 0; 3739 /* loop on clusters in the direct map */ 3740 for (j = 0; j < CLMAP_ENTRIES; j++) { 3741 if (VSM_ISCLR(vsmap[j]) || 3742 VSM_ISERR(vsmap[j])) 3743 continue; 3744 /* loop on pages in this cluster */ 3745 for (k = 0; k < VSCLSIZE(vs); k++) { 3746 if ((VSM_BMAP(vsmap[j])) & (1 << k)) 3747 num_pages++; 3748 } 3749 } 3750 } 3751 3752 return ptoa_32(num_pages); 3753} 3754 3755unsigned int 3756ps_vstruct_allocated_pages( 3757 vstruct_t vs, 3758 default_pager_page_t *pages, 3759 unsigned int pages_size) 3760{ 3761 unsigned int num_pages; 3762 struct vs_map *vsmap; 3763 dp_offset_t offset; 3764 unsigned int i, j, k; 3765 3766 num_pages = 0; 3767 offset = 0; 3768 if (vs->vs_indirect) { 3769 /* loop on indirect maps */ 3770 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { 3771 vsmap = vs->vs_imap[i]; 3772 if (vsmap == NULL) { 3773 offset += (vm_page_size * CLMAP_ENTRIES * 3774 VSCLSIZE(vs)); 3775 continue; 3776 } 3777 /* loop on clusters in this indirect map */ 3778 for (j = 0; j < CLMAP_ENTRIES; j++) { 3779 if (VSM_ISCLR(vsmap[j]) || 3780 VSM_ISERR(vsmap[j])) { 3781 offset += vm_page_size * VSCLSIZE(vs); 3782 continue; 3783 } 3784 /* loop on pages in this cluster */ 3785 for (k = 0; k < VSCLSIZE(vs); k++) { 3786 if ((VSM_BMAP(vsmap[j])) & (1 << k)) { 3787 num_pages++; 3788 if (num_pages < pages_size) 3789 pages++->dpp_offset = 3790 offset; 3791 } 3792 offset += vm_page_size; 3793 } 3794 } 3795 } 3796 } else { 3797 vsmap = vs->vs_dmap; 3798 if (vsmap == NULL) 3799 return 0; 3800 /* loop on clusters in the direct map */ 3801 for (j = 0; j < CLMAP_ENTRIES; j++) { 3802 if (VSM_ISCLR(vsmap[j]) || 3803 VSM_ISERR(vsmap[j])) { 3804 offset += vm_page_size * VSCLSIZE(vs); 3805 continue; 3806 } 3807 /* loop on pages in this cluster */ 3808 for (k = 0; k < VSCLSIZE(vs); k++) { 3809 if ((VSM_BMAP(vsmap[j])) & (1 << k)) { 3810 num_pages++; 3811 if (num_pages < pages_size) 3812 pages++->dpp_offset = offset; 3813 } 3814 offset += vm_page_size; 3815 } 3816 } 3817 } 3818 3819 return num_pages; 3820} 
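/*
 * Usage sketch (illustrative only, not part of the original source):
 * since ps_vstruct_allocated_pages() keeps counting past pages_size
 * but only records offsets while num_pages < pages_size, a caller
 * could probe with a small buffer and retry with one sized from the
 * returned count, e.g.:
 *
 *	cnt = ps_vstruct_allocated_pages(vs, small_buf, SMALL_MAX);
 *	if (cnt >= SMALL_MAX) {
 *		big_buf = (default_pager_page_t *)
 *			kalloc((cnt + 1) * sizeof (default_pager_page_t));
 *		cnt = ps_vstruct_allocated_pages(vs, big_buf, cnt + 1);
 *	}
 *
 * (cnt + 1 because of the strict '<' test against pages_size above;
 * SMALL_MAX, small_buf and big_buf are hypothetical names.)
 */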
3821 3822 3823kern_return_t 3824ps_vstruct_transfer_from_segment( 3825 vstruct_t vs, 3826 paging_segment_t segment, 3827 upl_t upl) 3828{ 3829 struct vs_map *vsmap; 3830// struct vs_map old_vsmap; 3831// struct vs_map new_vsmap; 3832 unsigned int i, j; 3833 3834 VS_LOCK(vs); /* block all work on this vstruct */ 3835 /* can't allow the normal multiple write */ 3836 /* semantics because writes may conflict */ 3837 vs->vs_xfer_pending = TRUE; 3838 vs_wait_for_sync_writers(vs); 3839 vs_start_write(vs); 3840 vs_wait_for_readers(vs); 3841 /* we will unlock the vs to allow other writes while transferring */ 3842 /* and will be guaranteed of the persistence of the vs struct */ 3843 /* because the caller of ps_vstruct_transfer_from_segment bumped */ 3844 /* vs_async_pending */ 3845 /* OK we now have guaranteed no other parties are accessing this */ 3846 /* vs. Now that we are also supporting simple lock versions of */ 3847 /* vs_lock we cannot hold onto VS_LOCK as we may block below. */ 3848 /* our purpose in holding it before was the multiple write case */ 3849 /* we now use the boolean xfer_pending to do that. We can use */ 3850 /* a boolean instead of a count because we have guaranteed single */ 3851 /* file access to this code in its caller */ 3852 VS_UNLOCK(vs); 3853vs_changed: 3854 if (vs->vs_indirect) { 3855 unsigned int vsmap_size; 3856 int clmap_off; 3857 /* loop on indirect maps */ 3858 for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { 3859 vsmap = vs->vs_imap[i]; 3860 if (vsmap == NULL) 3861 continue; 3862 /* loop on clusters in this indirect map */ 3863 clmap_off = (vm_page_size * CLMAP_ENTRIES * 3864 VSCLSIZE(vs) * i); 3865 if(i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size)) 3866 vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i); 3867 else 3868 vsmap_size = CLMAP_ENTRIES; 3869 for (j = 0; j < vsmap_size; j++) { 3870 if (VSM_ISCLR(vsmap[j]) || 3871 VSM_ISERR(vsmap[j]) || 3872 (VSM_PS(vsmap[j]) != segment)) 3873 continue; 3874 if(vs_cluster_transfer(vs, 3875 (vm_page_size * (j << vs->vs_clshift)) 3876 + clmap_off, 3877 vm_page_size << vs->vs_clshift, 3878 upl) 3879 != KERN_SUCCESS) { 3880 VS_LOCK(vs); 3881 vs->vs_xfer_pending = FALSE; 3882 VS_UNLOCK(vs); 3883 vs_finish_write(vs); 3884 return KERN_FAILURE; 3885 } 3886 /* allow other readers/writers during transfer*/ 3887 VS_LOCK(vs); 3888 vs->vs_xfer_pending = FALSE; 3889 VS_UNLOCK(vs); 3890 vs_finish_write(vs); 3891 3892 if (backing_store_abort_compaction || backing_store_stop_compaction) { 3893 backing_store_abort_compaction = FALSE; 3894 dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n")); 3895 return KERN_FAILURE; 3896 } 3897 vnode_pager_throttle(); 3898 3899 VS_LOCK(vs); 3900 vs->vs_xfer_pending = TRUE; 3901 vs_wait_for_sync_writers(vs); 3902 vs_start_write(vs); 3903 vs_wait_for_readers(vs); 3904 VS_UNLOCK(vs); 3905 if (!(vs->vs_indirect)) { 3906 goto vs_changed; 3907 } 3908 } 3909 } 3910 } else { 3911 vsmap = vs->vs_dmap; 3912 if (vsmap == NULL) { 3913 VS_LOCK(vs); 3914 vs->vs_xfer_pending = FALSE; 3915 VS_UNLOCK(vs); 3916 vs_finish_write(vs); 3917 return KERN_SUCCESS; 3918 } 3919 /* loop on clusters in the direct map */ 3920 for (j = 0; j < vs->vs_size; j++) { 3921 if (VSM_ISCLR(vsmap[j]) || 3922 VSM_ISERR(vsmap[j]) || 3923 (VSM_PS(vsmap[j]) != segment)) 3924 continue; 3925 if(vs_cluster_transfer(vs, 3926 vm_page_size * (j << vs->vs_clshift), 3927 vm_page_size << vs->vs_clshift, 3928 upl) != KERN_SUCCESS) { 3929 VS_LOCK(vs); 3930 vs->vs_xfer_pending = FALSE; 3931 VS_UNLOCK(vs); 3932 vs_finish_write(vs); 3933 return
KERN_FAILURE; 3934 } 3935 /* allow other readers/writers during transfer*/ 3936 VS_LOCK(vs); 3937 vs->vs_xfer_pending = FALSE; 3938 VS_UNLOCK(vs); 3939 vs_finish_write(vs); 3940 VS_LOCK(vs); 3941 vs->vs_xfer_pending = TRUE; 3942 vs_wait_for_sync_writers(vs); 3943 vs_start_write(vs); 3944 vs_wait_for_readers(vs); 3945 VS_UNLOCK(vs); 3946 if (vs->vs_indirect) { 3947 goto vs_changed; 3948 } 3949 } 3950 } 3951 3952 VS_LOCK(vs); 3953 vs->vs_xfer_pending = FALSE; 3954 VS_UNLOCK(vs); 3955 vs_finish_write(vs); 3956 return KERN_SUCCESS; 3957} 3958 3959 3960 3961vs_map_t 3962vs_get_map_entry( 3963 vstruct_t vs, 3964 dp_offset_t offset) 3965{ 3966 struct vs_map *vsmap; 3967 dp_offset_t cluster; 3968 3969 cluster = atop_32(offset) >> vs->vs_clshift; 3970 if (vs->vs_indirect) { 3971 long ind_block = cluster/CLMAP_ENTRIES; 3972 3973 /* Is the indirect block allocated? */ 3974 vsmap = vs->vs_imap[ind_block]; 3975 if(vsmap == (vs_map_t) NULL) 3976 return vsmap; 3977 } else 3978 vsmap = vs->vs_dmap; 3979 vsmap += cluster%CLMAP_ENTRIES; 3980 return vsmap; 3981} 3982 3983kern_return_t 3984vs_cluster_transfer( 3985 vstruct_t vs, 3986 dp_offset_t offset, 3987 dp_size_t cnt, 3988 upl_t upl) 3989{ 3990 dp_offset_t actual_offset; 3991 paging_segment_t ps; 3992 struct clmap clmap; 3993 kern_return_t error = KERN_SUCCESS; 3994 unsigned int size, size_wanted; 3995 int i; 3996 unsigned int residual = 0; 3997 unsigned int unavail_size; 3998// default_pager_thread_t *dpt; 3999// boolean_t dealloc; 4000 struct vs_map *vsmap_ptr = NULL; 4001 struct vs_map read_vsmap; 4002 struct vs_map original_read_vsmap; 4003 struct vs_map write_vsmap; 4004// upl_t sync_upl; 4005// vm_offset_t ioaddr; 4006 4007 /* vs_cluster_transfer reads in the pages of a cluster and 4008 * then writes these pages back to new backing store. The 4009 * segment the pages are being read from is assumed to have 4010 * been taken off-line and is no longer considered for new 4011 * space requests. 4012 */ 4013 4014 /* 4015 * This loop will be executed once per cluster referenced. 4016 * Typically this means once, since it's unlikely that the 4017 * VM system will ask for anything spanning cluster boundaries. 4018 * 4019 * If there are holes in a cluster (in a paging segment), we stop 4020 * reading at the hole, then loop again, hoping to 4021 * find valid pages later in the cluster. This continues until 4022 * the entire range has been examined, and read, if present. The 4023 * pages are written as they are read. If a failure occurs after 4024 * some pages are written the unmap call at the bottom of the loop 4025 * recovers the backing store and the old backing store remains 4026 * in effect. 
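 * The write_vsmap/original_read_vsmap juggling below is what
 * implements that recovery.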
4027 */ 4028 4029 VSM_CLR(write_vsmap); 4030 VSM_CLR(original_read_vsmap); 4031 /* grab the actual object's pages to sync with I/O */ 4032 while (cnt && (error == KERN_SUCCESS)) { 4033 vsmap_ptr = vs_get_map_entry(vs, offset); 4034 actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0); 4035 4036 if (actual_offset == (dp_offset_t) -1) { 4037 4038 /* 4039 * Nothing left to write in this cluster at least 4040 * set write cluster information for any previous 4041 * write, clear for next cluster, if there is one 4042 */ 4043 unsigned int local_size, clmask, clsize; 4044 4045 clsize = vm_page_size << vs->vs_clshift; 4046 clmask = clsize - 1; 4047 local_size = clsize - (offset & clmask); 4048 ASSERT(local_size); 4049 local_size = MIN(local_size, cnt); 4050 4051 /* This cluster has no data in it beyond what may */ 4052 /* have been found on a previous iteration through */ 4053 /* the loop "write_vsmap" */ 4054 *vsmap_ptr = write_vsmap; 4055 VSM_CLR(write_vsmap); 4056 VSM_CLR(original_read_vsmap); 4057 4058 cnt -= local_size; 4059 offset += local_size; 4060 continue; 4061 } 4062 4063 /* 4064 * Count up contiguous available or unavailable 4065 * pages. 4066 */ 4067 ps = CLMAP_PS(clmap); 4068 ASSERT(ps); 4069 size = 0; 4070 unavail_size = 0; 4071 for (i = 0; 4072 (size < cnt) && (unavail_size < cnt) && 4073 (i < CLMAP_NPGS(clmap)); i++) { 4074 if (CLMAP_ISSET(clmap, i)) { 4075 if (unavail_size != 0) 4076 break; 4077 size += vm_page_size; 4078 BS_STAT(ps->ps_bs, 4079 ps->ps_bs->bs_pages_in++); 4080 } else { 4081 if (size != 0) 4082 break; 4083 unavail_size += vm_page_size; 4084 } 4085 } 4086 4087 if (size == 0) { 4088 ASSERT(unavail_size); 4089 ps_clunmap(vs, offset, unavail_size); 4090 cnt -= unavail_size; 4091 offset += unavail_size; 4092 if((offset & ((vm_page_size << vs->vs_clshift) - 1)) 4093 == 0) { 4094 /* There is no more to transfer in this 4095 cluster 4096 */ 4097 *vsmap_ptr = write_vsmap; 4098 VSM_CLR(write_vsmap); 4099 VSM_CLR(original_read_vsmap); 4100 } 4101 continue; 4102 } 4103 4104 if(VSM_ISCLR(original_read_vsmap)) 4105 original_read_vsmap = *vsmap_ptr; 4106 4107 if(ps->ps_segtype == PS_PARTITION) { 4108 panic("swap partition not supported\n"); 4109 /*NOTREACHED*/ 4110 error = KERN_FAILURE; 4111 residual = size; 4112/* 4113 NEED TO ISSUE WITH SYNC & NO COMMIT 4114 error = ps_read_device(ps, actual_offset, &buffer, 4115 size, &residual, flags); 4116*/ 4117 } else { 4118 /* NEED TO ISSUE WITH SYNC & NO COMMIT */ 4119 error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset, 4120 size, &residual, 4121 (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0))); 4122 } 4123 4124 read_vsmap = *vsmap_ptr; 4125 4126 4127 /* 4128 * Adjust counts and put data in new BS. Optimize for the 4129 * common case, i.e. no error and/or partial data. 4130 * If there was an error, then we need to error the entire 4131 * range, even if some data was successfully read. 4132 * 4133 */ 4134 if ((error == KERN_SUCCESS) && (residual == 0)) { 4135 4136 /* 4137 * Got everything we asked for, supply the data to 4138 * the new BS. Note that as a side effect of supplying 4139 * the data, the buffer holding the supplied data is 4140 * deallocated from the pager's address space unless 4141 * the write is unsuccessful. 
		/*
		 * Adjust counts and put data in new BS.  Optimize for the
		 * common case, i.e. no error and/or partial data.
		 * If there was an error, then we need to error the entire
		 * range, even if some data was successfully read.
		 */
		if ((error == KERN_SUCCESS) && (residual == 0)) {
			/*
			 * Got everything we asked for, supply the data to
			 * the new BS.  Note that as a side effect of
			 * supplying the data, the buffer holding the supplied
			 * data is deallocated from the pager's address space
			 * unless the write is unsuccessful.
			 */

			/*
			 * Note: the buffer will be cleaned up in all cases,
			 * either by internal_cluster_write or, on a write
			 * error, by the vm_map_copy_page_discard call.
			 */
			*vsmap_ptr = write_vsmap;

			if (vs_cluster_write(vs, upl, offset,
					     size, TRUE, UPL_IOSYNC | UPL_NOCOMMIT) != KERN_SUCCESS) {
				error = KERN_FAILURE;
				if (!(VSM_ISCLR(*vsmap_ptr))) {
					/* unmap the new backing store object */
					ps_clunmap(vs, offset, size);
				}
				/* back to the original vsmap */
				*vsmap_ptr = original_read_vsmap;
				VSM_CLR(write_vsmap);
			} else {
				if ((offset + size) &
				    ((vm_page_size << vs->vs_clshift)
				     - 1)) {
					/*
					 * There is more to transfer in this
					 * cluster.
					 */
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
					ps_clunmap(vs, offset, size);
				} else {
					/* discard the old backing object */
					write_vsmap = *vsmap_ptr;
					*vsmap_ptr = read_vsmap;
					ps_clunmap(vs, offset, size);
					*vsmap_ptr = write_vsmap;
					VSM_CLR(write_vsmap);
					VSM_CLR(original_read_vsmap);
				}
			}
		} else {
			size_wanted = size;
			if (error == KERN_SUCCESS) {
				/*
				 * The device reported no error, but either
				 * nothing moved (residual == size, which we
				 * take to mean a read at or beyond EOF) or
				 * only part of the data did.  Both cases are
				 * considered errors for the purposes of
				 * cluster transfer: error the entire range.
				 */
				error = KERN_FAILURE;
				*vsmap_ptr = write_vsmap;
				if (!(VSM_ISCLR(*vsmap_ptr))) {
					/* unmap the new backing store object */
					ps_clunmap(vs, offset, size);
				}
				*vsmap_ptr = original_read_vsmap;
				VSM_CLR(write_vsmap);
				continue;
			}
		}
		cnt -= size;
		offset += size;

	} /* END while (cnt && (error == KERN_SUCCESS)) */
	if (!VSM_ISCLR(write_vsmap))
		*vsmap_ptr = write_vsmap;

	return error;
}
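
/*
 * Editorial note: vs_cluster_transfer appears to be driven by the
 * segment-evacuation loop earlier in this file (the one that retries
 * via "goto vs_changed"), one call per live cluster, with a
 * cluster-aligned offset and a cluster-sized count; that is why the
 * loop above typically executes only once.
 */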

kern_return_t
default_pager_add_file(
	MACH_PORT_FACE	backing_store,
	vnode_ptr_t	vp,
	int		record_size,
	vm_size_t	size)
{
	backing_store_t		bs;
	paging_segment_t	ps;
	int			i;
	unsigned int		j;
	int			error;

	if ((bs = backing_store_lookup(backing_store))
	    == BACKING_STORE_NULL)
		return KERN_INVALID_ARGUMENT;

	PSL_LOCK();
	for (i = 0; i <= paging_segment_max; i++) {
		ps = paging_segments[i];
		if (ps == PAGING_SEGMENT_NULL)
			continue;
		if (ps->ps_segtype != PS_FILE)
			continue;

		/*
		 * Check for overlap on the same device.
		 */
		if (ps->ps_vnode == (struct vnode *)vp) {
			PSL_UNLOCK();
			BS_UNLOCK(bs);
			return KERN_INVALID_ARGUMENT;
		}
	}
	PSL_UNLOCK();

	/*
	 * Set up the paging segment.
	 */
	ps = (paging_segment_t) kalloc(sizeof (struct paging_segment));
	if (ps == PAGING_SEGMENT_NULL) {
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	ps->ps_segtype = PS_FILE;
	ps->ps_vnode = (struct vnode *)vp;
	ps->ps_offset = 0;
	ps->ps_record_shift = local_log2(vm_page_size / record_size);
	assert((dp_size_t) size == size);
	ps->ps_recnum = (dp_size_t) size;
	ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift;

	ps->ps_pgcount = ps->ps_pgnum;
	ps->ps_clshift = local_log2(bs->bs_clsize);
	ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift;
	ps->ps_special_clusters = 0;
	ps->ps_hint = 0;
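
	/*
	 * Worked example (illustrative only; the values are assumptions):
	 * with vm_page_size == 4096 and record_size == 512,
	 * ps_record_shift == log2(4096/512) == 3, so a segment of
	 * size == 0x10000 records covers 0x10000 >> 3 == 0x2000 pages;
	 * with bs_clsize == 4 pages, ps_clshift == 2 and the segment
	 * holds 0x2000 >> 2 == 0x800 clusters.
	 */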
	PS_LOCK_INIT(ps);
	ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls));
	if (!ps->ps_bmap) {
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}
	for (j = 0; j < ps->ps_ncls; j++) {
		clrbit(ps->ps_bmap, j);
	}

	if (paging_segment_count == 0) {
		ps->ps_state = PS_EMERGENCY_SEGMENT;
		if (use_emergency_swap_file_first) {
			ps->ps_state |= PS_CAN_USE;
		}
		emergency_segment_backing_store = backing_store;
	} else {
		ps->ps_state = PS_CAN_USE;
	}

	ps->ps_bs = bs;

	if ((error = ps_enter(ps)) != 0) {
		kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls));
		PS_LOCK_DESTROY(ps);
		kfree(ps, sizeof *ps);
		BS_UNLOCK(bs);
		return KERN_RESOURCE_SHORTAGE;
	}

	bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift;
	bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift;
	PSL_LOCK();
	if (IS_PS_OK_TO_USE(ps)) {
		dp_pages_free += ps->ps_pgcount;
	} else {
		dp_pages_reserve += ps->ps_pgcount;
	}
	PSL_UNLOCK();

	BS_UNLOCK(bs);

	bs_more_space(ps->ps_clcount);

	/*
	 * If the paging segment being activated is not the emergency
	 * segment and we notice that the emergency segment is being
	 * used, we help recover it.  If all goes well, the emergency
	 * segment will be back to its original state of being online
	 * but not activated (until it's needed the next time).
	 */
#if CONFIG_FREEZE
	if (!memorystatus_freeze_enabled)
#endif
	{
		ps = paging_segments[EMERGENCY_PSEG_INDEX];
		if (IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) {
			if (default_pager_backing_store_delete(emergency_segment_backing_store)) {
				dprintf(("Failed to recover emergency paging segment\n"));
			} else {
				dprintf(("Recovered emergency paging segment\n"));
			}
		}
	}

	DP_DEBUG(DEBUG_BS_INTERNAL,
		 ("vp=%p,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n",
		  vp, ps->ps_offset, (dp_size_t) size, record_size,
		  ps->ps_record_shift, ps->ps_pgnum));

	return KERN_SUCCESS;
}


kern_return_t
ps_read_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	dp_offset_t		offset,
	upl_size_t		size,
	unsigned int		*residualp,
	int			flags)
{
	vm_object_offset_t	f_offset;
	int			error = 0;
	int			result;

	assert(dp_encryption_inited);

	clustered_reads[atop_32(size)]++;

	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	/*
	 * For the transfer case we need to pass the upl_offset and flags.
	 */
	assert((upl_size_t) size == size);
	error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL);

	/*
	 * The vnode_pagein semantic is somewhat at odds with the existing
	 * device_read semantic.  Partial reads are not experienced at this
	 * level.  It is up to the bit map code and cluster read code to
	 * check that requested data locations are actually backed, and the
	 * pagein code to either read all of the requested data or return
	 * an error.
	 */
	if (error)
		result = KERN_FAILURE;
	else {
		*residualp = 0;
		result = KERN_SUCCESS;
	}
	return result;
}
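
/*
 * Illustrative sketch (not compiled): the flag composition the transfer
 * path above uses when calling ps_read_file -- UPL_IOSYNC forces a
 * synchronous pagein and UPL_NOCOMMIT leaves commitment of the UPL's
 * pages to the caller; "kr" is a hypothetical local.
 */
#if 0
	int flags = UPL_IOSYNC | UPL_NOCOMMIT |
	    (dp_encryption ? UPL_PAGING_ENCRYPTED : 0);
	kr = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset,
			  size, &residual, flags);
#endif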

kern_return_t
ps_write_file(
	paging_segment_t	ps,
	upl_t			upl,
	upl_offset_t		upl_offset,
	dp_offset_t		offset,
	unsigned int		size,
	int			flags)
{
	vm_object_offset_t	f_offset;
	kern_return_t		result;

	assert(dp_encryption_inited);

	clustered_writes[atop_32(size)]++;
	f_offset = (vm_object_offset_t)(ps->ps_offset + offset);

	if (flags & UPL_PAGING_ENCRYPTED) {
		/*
		 * ENCRYPTED SWAP:
		 * encrypt all the pages that we're going
		 * to pageout.
		 */
		upl_encrypt(upl, upl_offset, size);
	}
	assert((upl_size_t) size == size);
	if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL))
		result = KERN_FAILURE;
	else
		result = KERN_SUCCESS;

	return result;
}

/*
 * TRIM support is a no-op here: these stubs deliberately discard their
 * arguments, apparently so that call sites stay uniform on
 * configurations where the default pager does not issue TRIMs to the
 * swap vnode.
 */
static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data)
{
#pragma unused(data)
}

static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data)
{
#pragma unused(data)
}

static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length)
{
#pragma unused(data, map, shift, length)
}
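
/*
 * Worked example for the watermark registration below (illustrative;
 * the numbers are assumptions): a client registering HI_WAT_ALERT with
 * hi_wat == 16 MB on a 4 KB-page system sets minimum_pages_remaining to
 * 16*1024*1024 / 4096 == 4096 pages, the threshold against which the
 * remaining free swap pages are later compared.
 */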
kern_return_t
default_pager_triggers( __unused MACH_PORT_FACE default_pager,
	int		hi_wat,
	int		lo_wat,
	int		flags,
	MACH_PORT_FACE	trigger_port)
{
	MACH_PORT_FACE		release = IPC_PORT_NULL;
	kern_return_t		kr;
	clock_sec_t		now;
	clock_nsec_t		nanoseconds_dummy;
	static clock_sec_t	error_notify = 0;

	PSL_LOCK();
	if (flags == SWAP_ENCRYPT_ON) {
		/* ENCRYPTED SWAP: turn encryption on */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = TRUE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == SWAP_ENCRYPT_OFF) {
		/* ENCRYPTED SWAP: turn encryption off */
		release = trigger_port;
		if (!dp_encryption_inited) {
			dp_encryption_inited = TRUE;
			dp_encryption = FALSE;
			kr = KERN_SUCCESS;
		} else {
			kr = KERN_FAILURE;
		}
	} else if (flags == HI_WAT_ALERT) {
		release = min_pages_trigger_port;
#if CONFIG_FREEZE
		/*
		 * High and low water signals aren't applicable when freeze
		 * is enabled, so release the trigger ports here and return
		 * KERN_FAILURE.
		 */
		if (memorystatus_freeze_enabled) {
			if (IP_VALID(trigger_port)) {
				ipc_port_release_send(trigger_port);
			}
			min_pages_trigger_port = IPC_PORT_NULL;
			kr = KERN_FAILURE;
		}
		else
#endif
		{
			min_pages_trigger_port = trigger_port;
			minimum_pages_remaining = hi_wat / vm_page_size;
			bs_low = FALSE;
			kr = KERN_SUCCESS;
		}
	} else if (flags == LO_WAT_ALERT) {
		release = max_pages_trigger_port;
#if CONFIG_FREEZE
		if (memorystatus_freeze_enabled) {
			if (IP_VALID(trigger_port)) {
				ipc_port_release_send(trigger_port);
			}
			max_pages_trigger_port = IPC_PORT_NULL;
			kr = KERN_FAILURE;
		}
		else
#endif
		{
			max_pages_trigger_port = trigger_port;
			maximum_pages_free = lo_wat / vm_page_size;
			kr = KERN_SUCCESS;
		}
	} else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) {
		use_emergency_swap_file_first = TRUE;
		release = trigger_port;
		kr = KERN_SUCCESS;
	} else if (flags == SWAP_FILE_CREATION_ERROR) {
		release = trigger_port;
		kr = KERN_SUCCESS;
		if (paging_segment_count == 1) {
			use_emergency_swap_file_first = TRUE;
		}
		no_paging_space_action();
		clock_get_system_nanotime(&now, &nanoseconds_dummy);
		/* throttle the console message to one every five seconds */
		if (now > error_notify + 5) {
			dprintf(("Swap File Error.\n"));
			error_notify = now;
		}
	} else {
		release = trigger_port;
		kr = KERN_INVALID_ARGUMENT;
	}
	PSL_UNLOCK();

	if (IP_VALID(release))
		ipc_port_release_send(release);

	return kr;
}

/*
 * Monitor the amount of available backing store vs. the amount of
 * required backing store, and notify a listener (if present) when
 * backing store may safely be removed.
 *
 * We attempt to avoid the situation where backing store is
 * discarded en masse, as this can lead to thrashing as the
 * backing store is compacted.
 */

#define PF_INTERVAL	3	/* seconds between free level checks */
#define PF_LATENCY	10	/* number of intervals before release */

static int dp_pages_free_low_count = 0;
thread_call_t default_pager_backing_store_monitor_callout;
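
/*
 * Timing sketch (derived from the constants above): with PF_INTERVAL of
 * 3 seconds and PF_LATENCY of 10 intervals, the free level must stay
 * over the threshold for more than 10 consecutive checks -- roughly
 * 30 seconds -- before a LO_WAT_ALERT is sent.
 */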
void
default_pager_backing_store_monitor(__unused thread_call_param_t p1,
				    __unused thread_call_param_t p2)
{
//	unsigned long long	average;
	ipc_port_t		trigger;
	uint64_t		deadline;

	/*
	 * We determine whether it will be safe to release some
	 * backing store by watching the free page level.  If
	 * it remains above the maximum_pages_free threshold for
	 * at least PF_LATENCY checks (taken at PF_INTERVAL seconds)
	 * then we deem it safe.
	 *
	 * Note that this establishes a maximum rate at which backing
	 * store will be released, as each notification (currently)
	 * only results in a single backing store object being
	 * released.
	 */
	if (dp_pages_free > maximum_pages_free) {
		dp_pages_free_low_count++;
	} else {
		dp_pages_free_low_count = 0;
	}

	/* decide whether to send notification */
	trigger = IP_NULL;
	if (max_pages_trigger_port &&
	    (backing_store_release_trigger_disable == 0) &&
	    (dp_pages_free_low_count > PF_LATENCY)) {
		trigger = max_pages_trigger_port;
		max_pages_trigger_port = NULL;
	}

	/* send notification */
	if (trigger != IP_NULL) {
		VSL_LOCK();
		if (backing_store_release_trigger_disable != 0) {
			assert_wait((event_t)
				    &backing_store_release_trigger_disable,
				    THREAD_UNINT);
			VSL_UNLOCK();
			thread_block(THREAD_CONTINUE_NULL);
		} else {
			VSL_UNLOCK();
		}
		dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n"));

		default_pager_space_alert(trigger, LO_WAT_ALERT);
		ipc_port_release_send(trigger);
		dp_pages_free_low_count = 0;
	}

	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
	thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline);
}

#if CONFIG_FREEZE
unsigned int
default_pager_swap_pages_free(void)
{
	return dp_pages_free;
}
#endif
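
/*
 * Illustrative sketch (not compiled): how the monitor callout above
 * would be created and armed for its first PF_INTERVAL tick.  The real
 * initialization lives in the pager start-up path, not in this excerpt.
 */
#if 0
	uint64_t deadline;

	default_pager_backing_store_monitor_callout =
		thread_call_allocate(default_pager_backing_store_monitor, NULL);
	clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline);
	thread_call_enter_delayed(default_pager_backing_store_monitor_callout,
				  deadline);
#endif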