/*
 * linux/mm/vmstat.c
 *
 * Manages VM statistics
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * zoned VM statistics
 * Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
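/*
 * Illustration of the arithmetic above (the numbers are hypothetical):
 * with a low-to-min watermark gap of 512 pages and 8 online CPUs, the
 * pressure threshold works out to max(1, 512 / 8) = 64, which is below
 * the 125 ceiling and is therefore used as-is.
 */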
static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_populated_zone(zone) {
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	get_online_cpus();
	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = calculate_pressure_threshold(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
	put_online_cpus();
}

void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	get_online_cpus();
	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = calculate_threshold(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
	put_online_cpus();
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);

	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
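/*
 * Example of the folding behaviour above (the threshold value is
 * hypothetical): with stat_threshold == 32, repeated
 * __mod_zone_page_state(zone, item, 1) calls only touch the per-cpu
 * vm_stat_diff entry until it would exceed 32; at that point the whole
 * accumulated value is added to the zone and global atomics in one go
 * and the per-cpu differential restarts from zero.
 */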
/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
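/*
 * Worked example of the overstep logic above (the threshold is
 * hypothetical): with stat_threshold == 32, the increment that pushes the
 * per-cpu diff to 33 folds 33 + 16 = 49 into the zone counter and leaves
 * the diff at -16, so roughly half a threshold of headroom remains before
 * the next fold in either direction.
 */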
/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == numa_node_id())
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif
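/*
 * Reading the NUMA counters above, an illustrative scenario: a task on
 * node 0 prefers a node-0 zone but the page is taken from a node-1 zone,
 * so the node-1 zone gets NUMA_MISS and NUMA_OTHER while the preferred
 * node-0 zone gets NUMA_FOREIGN; had the preferred zone satisfied the
 * request, it would have counted NUMA_HIT and NUMA_LOCAL instead.
 */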
#ifdef CONFIG_COMPACTION
struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
}

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif
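/*
 * Worked example for __fragmentation_index() (the numbers are
 * hypothetical): a zone with 1000 free pages spread over 500 free blocks,
 * none large enough for an order-2 (4 page) request, yields
 * 1000 - (1000 + 1000 * 1000 / 4) / 500 = 498, i.e. an index of 0.498,
 * roughly half way between "out of memory" (0) and "fragmented" (1).
 */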
#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = start_pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}
/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
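/*
 * For reference, TEXTS_FOR_ZONES("pgalloc") expands to "pgalloc_dma",
 * "pgalloc_dma32", "pgalloc_normal", "pgalloc_high" and "pgalloc_movable",
 * with the dma, dma32 and high entries present only when the matching zone
 * type is configured in.
 */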
static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"kswapd_skip_congestion_wait",
	"pageoutrun",
	"allocstall",

	"pgrotated",

#ifdef CONFIG_COMPACTION
	"compact_blocks_moved",
	"compact_pages_moved",
	"compact_pagemigrate_failed",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",
	"unevictable_pgs_mlockfreed",
#endif
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n        protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  start_pfn:         %lu"
		   "\n  inactive_ratio:    %u",
		   zone->all_unreclaimable,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}
static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		round_jiffies_relative(sysctl_stat_interval));
}

static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(vmstat_work, cpu);

	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
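/*
 * Net effect of the two helpers above: every online CPU runs its own
 * deferrable delayed work that folds that CPU's vm_stat_diff deltas into
 * the zone and global counters roughly once per sysctl_stat_interval (one
 * second by default), with the initial expiry spread across CPUs by
 * __round_jiffies_relative() so they do not all fire on the same jiffy.
 */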
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		refresh_zone_stat_thresholds();
		start_cpu_timer(cpu);
		node_set_state(cpu_to_node(cpu), N_CPU);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
#include <linux/debugfs.h>

static struct dentry *extfrag_debug_root;

/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
				struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory is unusable */
	if (info->free_pages == 0)
		return 1000;

	/*
	 * Index should be a value between 0 and 1. Return a value to 3
	 * decimal places.
	 *
	 * 0 => no fragmentation
	 * 1 => high fragmentation
	 */
	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);

}

static void unusable_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = unusable_free_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}
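/*
 * Worked example for unusable_free_index() (the numbers are hypothetical):
 * a zone with 1000 free pages of which only 800 sit in blocks of at least
 * the requested order gives (1000 - 800) * 1000 / 1000 = 200, printed by
 * unusable_show_print() as 0.200, i.e. 20% of the free memory is unusable
 * for that allocation size.
 */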
/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of free memory is
 * unusable and by implication, the worse the external fragmentation is. This
 * can be expressed as a percentage by multiplying by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, unusable_show_print);

	return 0;
}

static const struct seq_operations unusable_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};

static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_op);
}

static const struct file_operations unusable_file_ops = {
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void extfrag_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;

	/* Alloc on stack as interrupts are disabled for zone walk */
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = __fragmentation_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display fragmentation index for orders that allocations would fail for
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, extfrag_show_print);

	return 0;
}

static const struct seq_operations extfrag_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};

static int extfrag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &extfrag_op);
}

static const struct file_operations extfrag_file_ops = {
	.open		= extfrag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init extfrag_debug_init(void)
{
	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
	if (!extfrag_debug_root)
		return -ENOMEM;

	if (!debugfs_create_file("unusable_index", 0444,
			extfrag_debug_root, NULL, &unusable_file_ops))
		return -ENOMEM;

	if (!debugfs_create_file("extfrag_index", 0444,
			extfrag_debug_root, NULL, &extfrag_file_ops))
		return -ENOMEM;

	return 0;
}

module_init(extfrag_debug_init);
#endif