/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>

int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
/* Joey: make the min threshold higher. */
/* static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, }; */
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };

/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page)						\
(									\
	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
	|| ((zone) != page_zone(page))					\
)

/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
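
/*
 * Worked example (illustrative numbers only): freeing an order-1 block
 * at page_idx 4.
 *
 *	mask   = ~0UL << 1, so -mask == 2
 *	index  = 4 >> (1 + 1) == 1	(bit shared with the buddy)
 *	buddy1 = base + (4 ^ 2) == base + 6
 *
 * If __test_and_change_bit() returns 1, the block being freed was the
 * only allocated half of the pair, so its buddy at page_idx 6 is
 * already free and the two merge into an order-2 block at page_idx 4
 * (page_idx &= mask); the walk then repeats one level up.  A return of
 * 0 means the buddy is still allocated, so the loop stops.
 */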

static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
static void __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Yes, think what happens when other parts of the kernel take
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt()))
			BUG();
		lru_cache_del(page);
	}

	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();
	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 * 	-mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}

#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)

static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		list_add(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
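
/*
 * Illustration of expand() (example values only): if an order-0 page is
 * requested but the first non-empty free list is at order 3 (low == 0,
 * high == 3), each pass gives the lower half of the block back one
 * order down and keeps splitting the upper half:
 *
 *	pages 0-7	order-3 block taken off its free list by rmqueue()
 *	pages 0-3	put back on free_area[2]
 *	pages 4-5	put back on free_area[1]
 *	page  6		put back on free_area[0]
 *	page  7		returned to the caller
 */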

static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area = zone->free_area + order;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageLRU(page))
				BUG();
			if (PageActive(page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}

#ifndef CONFIG_DISCONTIGMEM
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif
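
/*
 * Note: the zone-modifier bits of gfp_mask (gfp_mask & GFP_ZONEMASK)
 * select which fallback list built by build_zonelists() is used; a mask
 * with no zone bits gets the ZONE_NORMAL-first list, __GFP_DMA gets the
 * DMA-only list.  Rough usage sketch (not taken from this file):
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	four contiguous pages
 *	if (p)
 *		__free_pages(p, 2);
 */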

static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed = 0;

	if (!(gfp_mask & __GFP_WAIT))
		goto out;
	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					if (page->buffers)
						BUG();
					if (page->mapping)
						BUG();
					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageLRU(page))
						BUG();
					if (PageActive(page))
						BUG();
					if (PageDirty(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}
 out:
	*freed = __freed;
	return page;
}

/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	unsigned long min;
	zone_t **zone, * classzone;
	struct page * page;
	int freed;

	zone = zonelist->zones;
	classzone = *zone;
	if (classzone == NULL)
		return NULL;
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_low;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		unsigned long local_min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		local_min = z->pages_min;
		if (!(gfp_mask & __GFP_WAIT))
			local_min >>= 2;
		min += local_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low-on-memory slow path */

rebalance:
	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
		return NULL;
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		return NULL;

	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* Don't let big-order allocations loop */
	if (order > 3)
		return NULL;

	/* Yield for kswapd, and try again */
	yield();
	goto rebalance;
}
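
/*
 * Note on the watermark passes above (numbers purely illustrative):
 * "min" accumulates across the zonelist, so with three zones whose
 * pages_low is 100 each, an order-0 request must find more than 101
 * free pages in the first zone, more than 201 in the second and more
 * than 301 in the third before the fast path will take from it.  Lower
 * (more precious) zones are thus shielded from allocations that could
 * have been satisfied further up the list.
 */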

/*
 * Common helper functions.
 */
unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page * page;

	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0)
		__free_pages(virt_to_page(addr), order);
}
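
/*
 * Typical use of the helpers above (sketch only):
 *
 *	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);	two pages
 *	if (buf) {
 *		...
 *		free_pages(buf, 1);
 *	}
 *
 * get_zeroed_page() is the order-0 variant that also clears the page
 * before returning its virtual address.
 */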

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;

	for_each_pgdat(pgdat) {
		zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zone_t **zonep = zonelist->zones;
		zone_t *zone;

		for (zone = *zonep++; zone; zone = *zonep++) {
			unsigned long size = zone->size;
			unsigned long high = zone->pages_high;
			if (size > high)
				sum += size - high;
		}
	}

	return sum;
}

#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
				zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB "
					"high:%6lukB\n",
					zone->name,
					K(zone->free_pages),
					K(zone->pages_min),
					K(zone->pages_low),
					K(zone->pages_high));

		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}

/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
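
/*
 * Resulting fallback order, assuming all three zones are populated on
 * the node: a __GFP_HIGHMEM list is HighMem -> Normal -> DMA, the
 * default list is Normal -> DMA, and a __GFP_DMA list contains only
 * ZONE_DMA.  The switch above relies on falling through the case
 * labels to append the lower zones.
 */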

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256

static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return size;
}
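
/*
 * Example (illustrative): a 512MB zone of 4KB pages has 131072 pages;
 * 131072 / PAGES_PER_WAITQUEUE == 512, already a power of two, so the
 * zone gets 512 hashed waitqueues.  A tiny zone still gets at least
 * one, and anything beyond 4096 entries is clamped.
 */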

/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}
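
/*
 * e.g. wait_table_bits(512) == ffz(~512) == 9, i.e. log2 of the table
 * size, which free_area_init_core() turns into a shift count for the
 * hash.
 */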

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;

	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for (i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
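
		/*
		 * Example with the defaults above (4KB pages, purely
		 * illustrative): a 256MB zone has realsize == 65536, so
		 * mask = 65536 / 128 = 512, clamped to zone_balance_max
		 * (255), giving pages_min = 255, pages_low = 510 and
		 * pages_high = 765.
		 */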

		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			zone_start_paddr += PAGE_SIZE;
		}

		offset += size;
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
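			/*
			 * e.g. for a 65536-page zone at order 0 this gives
			 * bitmap_size = 65535 >> 4 = 4095, rounded up and
			 * long-aligned to 4096 bytes: one bit per buddy
			 * pair of single pages.
			 */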
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map =
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}

void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++)
		printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);