1/*
2 *  linux/mm/vmstat.c
3 *
4 *  Manages VM statistics
5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6 *
7 *  zoned VM statistics
8 *  Copyright (C) 2006 Silicon Graphics, Inc.,
9 *		Christoph Lameter <christoph@lameter.com>
10 */
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/err.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16#include <linux/cpu.h>
17#include <linux/vmstat.h>
18#include <linux/sched.h>
19#include <linux/math64.h>
20
21#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Per-cpu vm event counters. Each cpu has its own instance;
 * sum_vm_events() adds the instances of all online cpus together.
 */
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
24
25static void sum_vm_events(unsigned long *ret)
26{
27	int cpu;
28	int i;
29
30	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31
32	for_each_online_cpu(cpu) {
33		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34
35		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
36			ret[i] += this->event[i];
37	}
38}
39
/*
 * Accumulate the vm event counters across all CPUs.
 *
 * @ret: caller-supplied array of NR_VM_EVENT_ITEMS totals.
 *
 * The summation runs between get_online_cpus() and put_online_cpus().
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
52
53#ifdef CONFIG_HOTPLUG
54/*
55 * Fold the foreign cpu events into our own.
56 *
57 * This is adding to the events on one processor
58 * but keeps the global counts constant.
59 */
60void vm_events_fold_cpu(int cpu)
61{
62	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
63	int i;
64
65	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
66		count_vm_events(i, fold_state->event[i]);
67		fold_state->event[i] = 0;
68	}
69}
70#endif /* CONFIG_HOTPLUG */
71
72#endif /* CONFIG_VM_EVENT_COUNTERS */
73
/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters, one atomic_long_t per
 * zone stat item (NR_VM_ZONE_STAT_ITEMS entries).
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);
81
82#ifdef CONFIG_SMP
83
/*
 * Work out the reduced per-cpu threshold for a zone.
 *
 * As vmstats are not up to date, the estimated values drift away from the
 * real ones. With high thresholds and many CPUs the min watermark can be
 * breached while the estimate still looks fine. The value computed here
 * is the distance between the low and the min watermark divided by the
 * number of online cpus, so that even the maximum amount of drift will not
 * accidentally breach the min watermark.
 *
 * Returns a threshold in the range 1..125.
 */
static int calculate_pressure_threshold(struct zone *zone)
{
	int gap = low_wmark_pages(zone) - min_wmark_pages(zone);
	int threshold = (int)(gap / num_online_cpus());

	/* Never go below 1 ... */
	if (threshold < 1)
		threshold = 1;

	/* ... and never above the maximum threshold of 125 */
	if (threshold > 125)
		threshold = 125;

	return threshold;
}
107
108static int calculate_threshold(struct zone *zone)
109{
110	int threshold;
111	int mem;	/* memory in 128 MB units */
112
113	/*
114	 * The threshold scales with the number of processors and the amount
115	 * of memory per zone. More memory means that we can defer updates for
116	 * longer, more processors could lead to more contention.
117 	 * fls() is used to have a cheap way of logarithmic scaling.
118	 *
119	 * Some sample thresholds:
120	 *
121	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
122	 * ------------------------------------------------------------------
123	 * 8		1		1	0.9-1 GB	4
124	 * 16		2		2	0.9-1 GB	4
125	 * 20 		2		2	1-2 GB		5
126	 * 24		2		2	2-4 GB		6
127	 * 28		2		2	4-8 GB		7
128	 * 32		2		2	8-16 GB		8
129	 * 4		2		2	<128M		1
130	 * 30		4		3	2-4 GB		5
131	 * 48		4		3	8-16 GB		8
132	 * 32		8		4	1-2 GB		4
133	 * 32		8		4	0.9-1GB		4
134	 * 10		16		5	<128M		1
135	 * 40		16		5	900M		4
136	 * 70		64		7	2-4 GB		5
137	 * 84		64		7	4-8 GB		6
138	 * 108		512		9	4-8 GB		6
139	 * 125		1024		10	8-16 GB		8
140	 * 125		1024		10	16-32 GB	9
141	 */
142
143	mem = zone->present_pages >> (27 - PAGE_SHIFT);
144
145	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
146
147	/*
148	 * Maximum threshold is 125
149	 */
150	threshold = min(125, threshold);
151
152	return threshold;
153}
154
155/*
156 * Refresh the thresholds for each zone.
157 */
158static void refresh_zone_stat_thresholds(void)
159{
160	struct zone *zone;
161	int cpu;
162	int threshold;
163
164	for_each_populated_zone(zone) {
165		unsigned long max_drift, tolerate_drift;
166
167		threshold = calculate_threshold(zone);
168
169		for_each_online_cpu(cpu)
170			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
171							= threshold;
172
173		/*
174		 * Only set percpu_drift_mark if there is a danger that
175		 * NR_FREE_PAGES reports the low watermark is ok when in fact
176		 * the min watermark could be breached by an allocation
177		 */
178		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
179		max_drift = num_online_cpus() * threshold;
180		if (max_drift > tolerate_drift)
181			zone->percpu_drift_mark = high_wmark_pages(zone) +
182					max_drift;
183	}
184}
185
186void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
187{
188	struct zone *zone;
189	int cpu;
190	int threshold;
191	int i;
192
193	get_online_cpus();
194	for (i = 0; i < pgdat->nr_zones; i++) {
195		zone = &pgdat->node_zones[i];
196		if (!zone->percpu_drift_mark)
197			continue;
198
199		threshold = calculate_pressure_threshold(zone);
200		for_each_online_cpu(cpu)
201			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
202							= threshold;
203	}
204	put_online_cpus();
205}
206
207void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
208{
209	struct zone *zone;
210	int cpu;
211	int threshold;
212	int i;
213
214	get_online_cpus();
215	for (i = 0; i < pgdat->nr_zones; i++) {
216		zone = &pgdat->node_zones[i];
217		if (!zone->percpu_drift_mark)
218			continue;
219
220		threshold = calculate_threshold(zone);
221		for_each_online_cpu(cpu)
222			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
223							= threshold;
224	}
225	put_online_cpus();
226}
227
/*
 * Apply @delta to the calling cpu's differential for @item in @zone.
 *
 * For use when we know that interrupts are disabled.
 *
 * The per-cpu differential is only an s8. Once the new value would leave
 * the range [-stat_threshold, stat_threshold] the whole amount is handed
 * to zone_page_state_add() and the differential restarts at zero.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);

	s8 *p = pcp->vm_stat_diff + item;
	long x;

	/* Prospective differential, computed in a long before narrowing */
	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
248
/*
 * For an unknown interrupt state: disables local interrupts around
 * __mod_zone_page_state() and restores the previous state afterwards.
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
262
263/*
264 * Optimized increment and decrement functions.
265 *
266 * These are only for a single page and therefore can take a struct page *
267 * argument instead of struct zone *. This allows the inclusion of the code
268 * generated for page_zone(page) into the optimized functions.
269 *
270 * No overflow check is necessary and therefore the differential can be
271 * incremented or decremented in place which may allow the compilers to
272 * generate better code.
273 * The increment or decrement is known and therefore one boundary check can
274 * be omitted.
275 *
276 * NOTE: These functions are very performance sensitive. Change only
277 * with care.
278 *
279 * Some processors have inc/dec instructions that are atomic vs an interrupt.
280 * However, the code must first determine the differential location in a zone
281 * based on the processor number and then inc/dec the counter. There is no
282 * guarantee without disabling preemption that the processor will not change
283 * in between and therefore the atomicity vs. interrupt cannot be exploited
284 * in a useful way here.
285 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		/*
		 * Fold more than the accumulated differential: hand over an
		 * extra half threshold and leave the differential at
		 * -overstep, so more increments fit before the next fold.
		 */
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}
300
/* Increment @item on the zone that @page belongs to (see __inc_zone_state) */
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
306
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < - pcp->stat_threshold)) {
		/*
		 * Mirror image of the increment case: fold an extra half
		 * threshold and leave the differential at +overstep, so more
		 * decrements fit before the next fold.
		 */
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}
321
/* Decrement @item on the zone that @page belongs to (see __dec_zone_state) */
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
327
/* __inc_zone_state() wrapped in local_irq_save()/local_irq_restore() */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
336
/*
 * Increment @item on the zone of @page with local interrupts disabled
 * for the duration of the update.
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	/* The zone lookup is done before interrupts are switched off */
	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);
348
/*
 * Decrement @item on the zone of @page with local interrupts disabled
 * for the duration of the update.
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
358
359/*
360 * Update the zone counters for one cpu.
361 *
362 * The cpu specified must be either the current cpu or a processor that
363 * is not online. If it is the current cpu then the execution thread must
364 * be pinned to the current cpu.
365 *
366 * Note that refresh_cpu_vm_stats strives to only access
367 * node local memory. The per cpu pagesets on remote zones are placed
368 * in the memory local to the processor using that pageset. So the
369 * loop over all zones will access a series of cachelines local to
370 * the processor.
371 *
372 * The call to zone_page_state_add updates the cachelines with the
373 * statistics in the remote zone struct as well as the global cachelines
374 * with the global counters. These could cause remote node cache line
375 * bouncing and will have to be only done when necessary.
376 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	/* Per-item sums over all zones; applied to vm_stat once at the end */
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				/*
				 * Read and clear the differential with
				 * interrupts off; presumably so that an update
				 * from interrupt context on this cpu cannot
				 * land between the read and the clear.
				 */
				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				/* Zone counter now, global counter later */
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		/* Count down; the pageset is drained when this reaches zero */
		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	/* Apply the collected per-item sums to the global counters */
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}
437
438#endif
439
440#ifdef CONFIG_NUMA
441/*
442 * zonelist = the list of zones passed to the allocator
443 * z 	    = the zone from which the allocation occurred.
444 *
445 * Must be called with interrupts disabled.
446 */
447void zone_statistics(struct zone *preferred_zone, struct zone *z)
448{
449	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
450		__inc_zone_state(z, NUMA_HIT);
451	} else {
452		__inc_zone_state(z, NUMA_MISS);
453		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
454	}
455	if (z->node == numa_node_id())
456		__inc_zone_state(z, NUMA_LOCAL);
457	else
458		__inc_zone_state(z, NUMA_OTHER);
459}
460#endif
461
462#ifdef CONFIG_COMPACTION
/* Free-memory summary of one zone, filled in by fill_contig_page_info() */
struct contig_page_info {
	/* Free base pages: sum of nr_free << order over all orders */
	unsigned long free_pages;
	/* Number of free blocks, counted across all orders */
	unsigned long free_blocks_total;
	/* Number of blocks of the target order available from free blocks */
	unsigned long free_blocks_suitable;
};
468
469/*
470 * Calculate the number of free pages in a zone, how many contiguous
471 * pages are free and how many are large enough to satisfy an allocation of
472 * the target size. Note that this function makes no attempt to estimate
473 * how many suitable free blocks there *might* be if MOVABLE pages were
474 * migrated. Calculating that is possible, but expensive and can be
475 * figured out from userspace
476 */
477static void fill_contig_page_info(struct zone *zone,
478				unsigned int suitable_order,
479				struct contig_page_info *info)
480{
481	unsigned int order;
482
483	info->free_pages = 0;
484	info->free_blocks_total = 0;
485	info->free_blocks_suitable = 0;
486
487	for (order = 0; order < MAX_ORDER; order++) {
488		unsigned long blocks;
489
490		/* Count number of free blocks */
491		blocks = zone->free_area[order].nr_free;
492		info->free_blocks_total += blocks;
493
494		/* Count free base pages */
495		info->free_pages += blocks << order;
496
497		/* Count the suitable free blocks */
498		if (order >= suitable_order)
499			info->free_blocks_suitable += blocks <<
500						(order - suitable_order);
501	}
502}
503
504/*
505 * A fragmentation index only makes sense if an allocation of a requested
506 * size would fail. If that is true, the fragmentation index indicates
507 * whether external fragmentation or a lack of memory was the problem.
508 * The value can be used to determine if page reclaim or compaction
509 * should be used
510 */
511static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
512{
513	unsigned long requested = 1UL << order;
514
515	if (!info->free_blocks_total)
516		return 0;
517
518	/* Fragmentation index only makes sense when a request would fail */
519	if (info->free_blocks_suitable)
520		return -1000;
521
522	/*
523	 * Index is between 0 and 1 so return within 3 decimal places
524	 *
525	 * 0 => allocation would fail due to lack of memory
526	 * 1 => allocation would fail due to fragmentation
527	 */
528	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
529}
530
/*
 * Same as __fragmentation_index() but gathers the contig_page_info for
 * @zone itself, using a structure on the stack.
 */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
539#endif
540
541#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
542#include <linux/proc_fs.h>
543#include <linux/seq_file.h>
544
/*
 * Printable names of the migrate types, indexed by migrate type number
 * in the pagetypeinfo output below.
 */
static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};
552
553static void *frag_start(struct seq_file *m, loff_t *pos)
554{
555	pg_data_t *pgdat;
556	loff_t node = *pos;
557	for (pgdat = first_online_pgdat();
558	     pgdat && node;
559	     pgdat = next_online_pgdat(pgdat))
560		--node;
561
562	return pgdat;
563}
564
565static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
566{
567	pg_data_t *pgdat = (pg_data_t *)arg;
568
569	(*pos)++;
570	return next_online_pgdat(pgdat);
571}
572
/* seq_file stop: frag_start()/frag_next() acquire nothing to release */
static void frag_stop(struct seq_file *m, void *arg)
{
}
576
577/* Walk all the zones in a node and print using a callback */
578static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
579		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
580{
581	struct zone *zone;
582	struct zone *node_zones = pgdat->node_zones;
583	unsigned long flags;
584
585	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
586		if (!populated_zone(zone))
587			continue;
588
589		spin_lock_irqsave(&zone->lock, flags);
590		print(m, pgdat, zone);
591		spin_unlock_irqrestore(&zone->lock, flags);
592	}
593}
594#endif
595
596#ifdef CONFIG_PROC_FS
597static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
598						struct zone *zone)
599{
600	int order;
601
602	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
603	for (order = 0; order < MAX_ORDER; ++order)
604		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
605	seq_putc(m, '\n');
606}
607
/*
 * This walks the free areas for each zone.
 *
 * seq_file show callback: @arg is the pg_data_t of the node to print.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}
617
618static void pagetypeinfo_showfree_print(struct seq_file *m,
619					pg_data_t *pgdat, struct zone *zone)
620{
621	int order, mtype;
622
623	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
624		seq_printf(m, "Node %4d, zone %8s, type %12s ",
625					pgdat->node_id,
626					zone->name,
627					migratetype_names[mtype]);
628		for (order = 0; order < MAX_ORDER; ++order) {
629			unsigned long freecount = 0;
630			struct free_area *area;
631			struct list_head *curr;
632
633			area = &(zone->free_area[order]);
634
635			list_for_each(curr, &area->free_list[mtype])
636				freecount++;
637			seq_printf(m, "%6lu ", freecount);
638		}
639		seq_putc(m, '\n');
640	}
641}
642
/* Print out the free pages at each order for each migratetype */
644static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
645{
646	int order;
647	pg_data_t *pgdat = (pg_data_t *)arg;
648
649	/* Print header */
650	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
651	for (order = 0; order < MAX_ORDER; ++order)
652		seq_printf(m, "%6d ", order);
653	seq_putc(m, '\n');
654
655	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
656
657	return 0;
658}
659
660static void pagetypeinfo_showblockcount_print(struct seq_file *m,
661					pg_data_t *pgdat, struct zone *zone)
662{
663	int mtype;
664	unsigned long pfn;
665	unsigned long start_pfn = zone->zone_start_pfn;
666	unsigned long end_pfn = start_pfn + zone->spanned_pages;
667	unsigned long count[MIGRATE_TYPES] = { 0, };
668
669	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
670		struct page *page;
671
672		if (!pfn_valid(pfn))
673			continue;
674
675		page = pfn_to_page(pfn);
676
677		/* Watch for unexpected holes punched in the memmap */
678		if (!memmap_valid_within(pfn, page, zone))
679			continue;
680
681		mtype = get_pageblock_migratetype(page);
682
683		if (mtype < MIGRATE_TYPES)
684			count[mtype]++;
685	}
686
687	/* Print counts */
688	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
689	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
690		seq_printf(m, "%12lu ", count[mtype]);
691	seq_putc(m, '\n');
692}
693
/* Print out the number of pageblocks for each migratetype */
695static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
696{
697	int mtype;
698	pg_data_t *pgdat = (pg_data_t *)arg;
699
700	seq_printf(m, "\n%-23s", "Number of blocks type ");
701	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
702		seq_printf(m, "%12s ", migratetype_names[mtype]);
703	seq_putc(m, '\n');
704	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
705
706	return 0;
707}
708
709/*
710 * This prints out statistics in relation to grouping pages by mobility.
711 * It is expensive to collect so do not constantly read the file.
712 */
713static int pagetypeinfo_show(struct seq_file *m, void *arg)
714{
715	pg_data_t *pgdat = (pg_data_t *)arg;
716
717	/* check memoryless node */
718	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
719		return 0;
720
721	seq_printf(m, "Page block order: %d\n", pageblock_order);
722	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
723	seq_putc(m, '\n');
724	pagetypeinfo_showfree(m, pgdat);
725	pagetypeinfo_showblockcount(m, pgdat);
726
727	return 0;
728}
729
/* seq_file iterator behind "buddyinfo": one frag_show() record per node */
static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
736
/* open handler: attach the fragmentation_op iterator to @file */
static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}
741
/* File operations registered for the "buddyinfo" proc entry */
static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
748
/*
 * seq_file iterator behind "pagetypeinfo"; shares the per-node walk of
 * fragmentation_op and only differs in the show callback.
 */
static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};
755
/* open handler: attach the pagetypeinfo_op iterator to @file */
static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}
760
/* File operations registered for the "pagetypeinfo" proc entry */
static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
767
/*
 * Helpers that build the per-zone counter names of vmstat_text[].
 * Each TEXT_FOR_<ZONE>(xx) expands to the string xx "_<zone>" followed by
 * a comma when that zone type is configured, and to nothing otherwise.
 */
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

/*
 * Expands to one string per zone type for prefix xx: the optional dma,
 * dma32 and high entries plus the unconditional "_normal" and "_movable".
 * The expansion ends with a trailing comma, so uses take no separator.
 */
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
788
/*
 * Names printed next to the counters in the vmstat and zoneinfo output.
 *
 * The array is indexed by position: the first NR_VM_ZONE_STAT_ITEMS
 * entries name the zone counters (zoneinfo_show_print() and
 * vmstat_start() index them by stat item number), and the remaining
 * entries name the vm event counters in the order all_vm_events() stores
 * them. Keep the order in sync with those enumerations.
 */
static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	/* VM event counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"kswapd_skip_congestion_wait",
	"pageoutrun",
	"allocstall",

	"pgrotated",

#ifdef CONFIG_COMPACTION
	"compact_blocks_moved",
	"compact_pages_moved",
	"compact_pagemigrate_failed",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",
	"unevictable_pgs_mlockfreed",
#endif
};
881
/*
 * Print the zoneinfo record of one zone: watermarks and size figures,
 * every zone stat counter, the lowmem_reserve ("protection") array, the
 * per-cpu pageset settings of each online cpu and a few zone fields.
 */
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages);

	/* The first NR_VM_ZONE_STAT_ITEMS names of vmstat_text[] apply */
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	/* lowmem_reserve[] as a comma separated list in parentheses */
	seq_printf(m,
		   "\n        protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	/* i is reused here as the cpu number */
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  start_pfn:         %lu"
		   "\n  inactive_ratio:    %u",
		   zone->all_unreclaimable,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}
942
/*
 * Output information about zones in @pgdat.
 *
 * seq_file show callback: @arg is the pg_data_t of the node; each of its
 * populated zones is printed by zoneinfo_show_print().
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}
952
/* seq_file iterator behind "zoneinfo": one zoneinfo_show() call per node */
static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};
960
/* open handler: attach the zoneinfo_op iterator to @file */
static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}
965
/* File operations registered for the "zoneinfo" proc entry */
static const struct file_operations proc_zoneinfo_file_operations = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
972
973static void *vmstat_start(struct seq_file *m, loff_t *pos)
974{
975	unsigned long *v;
976#ifdef CONFIG_VM_EVENT_COUNTERS
977	unsigned long *e;
978#endif
979	int i;
980
981	if (*pos >= ARRAY_SIZE(vmstat_text))
982		return NULL;
983
984#ifdef CONFIG_VM_EVENT_COUNTERS
985	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
986			+ sizeof(struct vm_event_state), GFP_KERNEL);
987#else
988	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
989			GFP_KERNEL);
990#endif
991	m->private = v;
992	if (!v)
993		return ERR_PTR(-ENOMEM);
994	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
995		v[i] = global_page_state(i);
996#ifdef CONFIG_VM_EVENT_COUNTERS
997	e = v + NR_VM_ZONE_STAT_ITEMS;
998	all_vm_events(e);
999	e[PGPGIN] /= 2;		/* sectors -> kbytes */
1000	e[PGPGOUT] /= 2;
1001#endif
1002	return v + *pos;
1003}
1004
1005static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1006{
1007	(*pos)++;
1008	if (*pos >= ARRAY_SIZE(vmstat_text))
1009		return NULL;
1010	return (unsigned long *)m->private + *pos;
1011}
1012
1013static int vmstat_show(struct seq_file *m, void *arg)
1014{
1015	unsigned long *l = arg;
1016	unsigned long off = l - (unsigned long *)m->private;
1017
1018	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1019	return 0;
1020}
1021
/*
 * seq_file stop for vmstat: free the snapshot taken by vmstat_start() and
 * clear m->private so that it is not freed a second time.
 */
static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}
1027
/* seq_file iterator behind "vmstat": one record per vmstat_text[] entry */
static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
1034
/* open handler: attach the vmstat_op iterator to @file */
static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}
1039
/* File operations registered for the "vmstat" proc entry */
static const struct file_operations proc_vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1046#endif /* CONFIG_PROC_FS */
1047
1048#ifdef CONFIG_SMP
/* Per-cpu delayed work item that runs vmstat_update() */
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
/* Delay in jiffies between two vmstat_update() runs; initially HZ */
int sysctl_stat_interval __read_mostly = HZ;
1051
1052static void vmstat_update(struct work_struct *w)
1053{
1054	refresh_cpu_vm_stats(smp_processor_id());
1055	schedule_delayed_work(&__get_cpu_var(vmstat_work),
1056		round_jiffies_relative(sysctl_stat_interval));
1057}
1058
1059static void __cpuinit start_cpu_timer(int cpu)
1060{
1061	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1062
1063	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
1064	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1065}
1066
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	/* The cpu number is passed in the pointer-sized hcpu argument */
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Thresholds depend on the number of online cpus */
		refresh_zone_stat_thresholds();
		start_cpu_timer(cpu);
		node_set_state(cpu_to_node(cpu), N_CPU);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/* Stop the self-rearming work item of the departing cpu */
		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* The cpu stays online after all: start its work item again */
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* One cpu fewer online: recompute the thresholds */
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
1102
/* Notifier block that setup_vmstat() registers for cpu hotplug events */
static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
1105#endif
1106
/*
 * Init call: on SMP compute the initial thresholds, register the cpu
 * notifier and start the vmstat work item on every online cpu; with
 * procfs create the buddyinfo, pagetypeinfo, vmstat and zoneinfo entries.
 * Always returns 0.
 */
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	/* World-readable entries; the return values are not checked */
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)
1127
1128#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1129#include <linux/debugfs.h>
1130
/* The "extfrag" debugfs directory created by extfrag_debug_init() */
static struct dentry *extfrag_debug_root;
1132
1133/*
1134 * Return an index indicating how much of the available free memory is
1135 * unusable for an allocation of the requested size.
1136 */
1137static int unusable_free_index(unsigned int order,
1138				struct contig_page_info *info)
1139{
1140	/* No free memory is interpreted as all free memory is unusable */
1141	if (info->free_pages == 0)
1142		return 1000;
1143
1144	/*
1145	 * Index should be a value between 0 and 1. Return a value to 3
1146	 * decimal places.
1147	 *
1148	 * 0 => no fragmentation
1149	 * 1 => high fragmentation
1150	 */
1151	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1152
1153}
1154
1155static void unusable_show_print(struct seq_file *m,
1156					pg_data_t *pgdat, struct zone *zone)
1157{
1158	unsigned int order;
1159	int index;
1160	struct contig_page_info info;
1161
1162	seq_printf(m, "Node %d, zone %8s ",
1163				pgdat->node_id,
1164				zone->name);
1165	for (order = 0; order < MAX_ORDER; ++order) {
1166		fill_contig_page_info(zone, order, &info);
1167		index = unusable_free_index(order, &info);
1168		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1169	}
1170
1171	seq_putc(m, '\n');
1172}
1173
1174/*
1175 * Display unusable free space index
1176 *
1177 * The unusable free space index measures how much of the available free
1178 * memory cannot be used to satisfy an allocation of a given size and is a
1179 * value between 0 and 1. The higher the value, the more of free memory is
1180 * unusable and by implication, the worse the external fragmentation is. This
1181 * can be expressed as a percentage by multiplying by 100.
1182 */
1183static int unusable_show(struct seq_file *m, void *arg)
1184{
1185	pg_data_t *pgdat = (pg_data_t *)arg;
1186
1187	/* check memoryless node */
1188	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
1189		return 0;
1190
1191	walk_zones_in_node(m, pgdat, unusable_show_print);
1192
1193	return 0;
1194}
1195
/* seq_file iterator behind "unusable_index": one record per online node */
static const struct seq_operations unusable_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};
1202
/* open handler: attach the unusable_op iterator to @file */
static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_op);
}
1207
/* File operations of the debugfs file "extfrag/unusable_index" */
static const struct file_operations unusable_file_ops = {
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1214
1215static void extfrag_show_print(struct seq_file *m,
1216					pg_data_t *pgdat, struct zone *zone)
1217{
1218	unsigned int order;
1219	int index;
1220
1221	/* Alloc on stack as interrupts are disabled for zone walk */
1222	struct contig_page_info info;
1223
1224	seq_printf(m, "Node %d, zone %8s ",
1225				pgdat->node_id,
1226				zone->name);
1227	for (order = 0; order < MAX_ORDER; ++order) {
1228		fill_contig_page_info(zone, order, &info);
1229		index = __fragmentation_index(order, &info);
1230		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1231	}
1232
1233	seq_putc(m, '\n');
1234}
1235
/*
 * Display fragmentation index for orders that allocations would fail for
 *
 * seq_file show callback: @arg is the pg_data_t of the node; each of its
 * populated zones is printed by extfrag_show_print().
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, extfrag_show_print);

	return 0;
}
1247
/* seq_file iterator behind "extfrag_index": one record per online node */
static const struct seq_operations extfrag_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};
1254
/* open handler: attach the extfrag_op iterator to @file */
static int extfrag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &extfrag_op);
}
1259
/* File operations of the debugfs file "extfrag/extfrag_index" */
static const struct file_operations extfrag_file_ops = {
	.open		= extfrag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1266
1267static int __init extfrag_debug_init(void)
1268{
1269	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1270	if (!extfrag_debug_root)
1271		return -ENOMEM;
1272
1273	if (!debugfs_create_file("unusable_index", 0444,
1274			extfrag_debug_root, NULL, &unusable_file_ops))
1275		return -ENOMEM;
1276
1277	if (!debugfs_create_file("extfrag_index", 0444,
1278			extfrag_debug_root, NULL, &extfrag_file_ops))
1279		return -ENOMEM;
1280
1281	return 0;
1282}
1283
1284module_init(extfrag_debug_init);
1285#endif
1286