/*
 * linux/mm/page_alloc.c
 *
 * Manages the free list, the system allocates free pages here.
 * Note that kmalloc() lives in slab.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>

int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
// Joey: make min threshold higher
//static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 128 , 128, 128, };

static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };

/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page) \
( \
	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
	|| (((page) - mem_map) < (zone)->zone_start_mapnr) \
	|| ((zone) != page_zone(page)) \
)

/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
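
/*
 * A concrete walk-through of the coalescing loop in __free_pages_ok()
 * below (the numbers are chosen for illustration only):
 *
 *	Free a single page (order 0) at page_idx 5.  Its bit in the
 *	order-0 map is index = 5 >> 1 = 2, and -mask = 1.
 *	__test_and_change_bit() returns the bit's old value: if it was
 *	0 the buddy (page_idx 5 ^ 1 = 4) is still allocated, the loop
 *	stops, and the page goes on the order-0 free list.  If it was
 *	1 the buddy is free, so it is unlinked from the order-0 list
 *	and the pair is retried as the order-1 block at page_idx 4
 *	(index 1), and so on up the orders until a buddy is found busy.
 */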

static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
static void __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Yes, think what happens when other parts of the kernel take
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt()))
			BUG();
		lru_cache_del(page);
	}

	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();
	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 * -mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}

#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)

static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		list_add(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
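
/*
 * How rmqueue() below and expand() above cooperate, with illustrative
 * numbers: rmqueue() takes the first block off the lowest-order
 * non-empty free list at or above the requested order and hands the
 * surplus back through expand().  For example, an order-0 request
 * satisfied from a free order-2 block at page_idx P puts pages
 * P..P+1 back on the order-1 free list and page P+2 back on the
 * order-0 free list, then returns page P+3 to the caller.
 */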

static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area = zone->free_area + order;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageLRU(page))
				BUG();
			if (PageActive(page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}

#ifndef CONFIG_DISCONTIGMEM
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif

static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed = 0;

	if (!(gfp_mask & __GFP_WAIT))
		goto out;
	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					if (page->buffers)
						BUG();
					if (page->mapping)
						BUG();
					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageLRU(page))
						BUG();
					if (PageActive(page))
						BUG();
					if (PageDirty(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}
 out:
	*freed = __freed;
	return page;
}
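
/*
 * Note on the fast-path watermark checks in __alloc_pages() below:
 * "min" accumulates across the zonelist, so a fallback zone is only
 * used once it has more free pages than its own watermark plus the
 * watermarks of every zone tried before it.  For example, if both
 * ZONE_NORMAL and ZONE_DMA had pages_low == 256, an order-0 request
 * preferring ZONE_NORMAL would fall back to ZONE_DMA only while
 * ZONE_DMA held more than 1 + 256 + 256 = 513 free pages.  This is
 * what helps keep ordinary allocations from draining the small DMA
 * zone.
 */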

/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	unsigned long min;
	zone_t **zone, * classzone;
	struct page * page;
	int freed;

	zone = zonelist->zones;
	classzone = *zone;
	if (classzone == NULL)
		return NULL;
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_low;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		unsigned long local_min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		local_min = z->pages_min;
		if (!(gfp_mask & __GFP_WAIT))
			local_min >>= 2;
		min += local_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low on memory slow path */

rebalance:
	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
		return NULL;
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		return NULL;

	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* Don't let big-order allocations loop */
	if (order > 3)
		return NULL;

	/* Yield for kswapd, and try again */
	yield();
	goto rebalance;
}
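
/*
 * Example pairing of the helpers below (illustrative only):
 *
 *	unsigned long addr = __get_free_pages(GFP_KERNEL, 2);
 *	if (addr) {
 *		... use the four pages at addr ...
 *		free_pages(addr, 2);
 *	}
 *
 * get_zeroed_page(GFP_KERNEL) is the single-page variant that also
 * clears the page before returning its virtual address.
 */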

/*
 * Common helper functions.
 */
unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page * page;

	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0)
		__free_pages(virt_to_page(addr), order);
}

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;

	for_each_pgdat(pgdat) {
		zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zone_t **zonep = zonelist->zones;
		zone_t *zone;

		for (zone = *zonep++; zone; zone = *zonep++) {
			unsigned long size = zone->size;
			unsigned long high = zone->pages_high;
			if (size > high)
				sum += size - high;
		}
	}

	return sum;
}

#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
			zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB "
				"high:%6lukB\n",
				zone->name,
				K(zone->free_pages),
				K(zone->pages_min),
				K(zone->pages_low),
				K(zone->pages_high));

		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
		nr_active_pages,
		nr_inactive_pages,
		nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}

/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256
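
/*
 * Worked example (assuming 4 KiB pages): a 512 MB zone has 131072
 * pages, so wait_table_size() computes 131072 / 256 = 512, which is
 * already a power of two and well under the 4096 cap, giving 512
 * hashed waitqueues.  wait_table_bits(512) is then 9, the log2 that
 * free_area_init_core() later turns into zone->wait_table_shift via
 * BITS_PER_LONG - 9.
 */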

static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return size;
}

/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 * - mark all pages reserved
 * - mark all memory queues empty
 * - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	/*
	 * Some architectures (with lots of mem and discontinuous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;

	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;
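
		/*
		 * The zone watermarks are derived from the zone size
		 * below: mask = realsize / zone_balance_ratio, clamped
		 * to [zone_balance_min, zone_balance_max], and
		 * pages_min/low/high are 1x/2x/3x that value.  For
		 * example (with 4 KiB pages), a 256 MB zone of 65536
		 * pages with the default ratio of 128 yields 512,
		 * clamped to 255, so pages_min = 255, pages_low = 510,
		 * pages_high = 765; a 16 MB DMA zone of 4096 pages
		 * yields 32, which the raised zone_balance_min of 128
		 * above bumps up to pages_min = 128.
		 */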

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;

		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			zone_start_paddr += PAGE_SIZE;
		}

		offset += size;
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map =
				(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}

void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
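
/*
 * The "memfrac=" boot parameter above overrides zone_balance_ratio
 * per zone with a comma-separated list.  For example, booting with
 * memfrac=32,128,128 would make the DMA zone's watermarks four times
 * larger (realsize/32 instead of the default realsize/128), still
 * subject to the zone_balance_min and zone_balance_max clamps applied
 * in free_area_init_core().
 */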