/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
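
/*
 * Example (illustrative sketch, not used in this file): the usual get/put
 * pairing around a page looked up elsewhere.  find_get_page() returns the
 * page with its reference count already elevated, and the final put_page()
 * may end up freeing the page via __page_cache_release() above:
 *
 *	struct page *page = find_get_page(mapping, index);
 *	if (page) {
 *		(use the page)
 *		put_page(page);
 *	}
 */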

/**
 * put_pages_list - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
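
/*
 * Example (illustrative sketch): error-recovery code that has collected
 * pages on a private list, each holding one reference, can hand the whole
 * list back in a single call:
 *
 *	LIST_HEAD(pages_to_drop);
 *
 *	list_add(&page->lru, &pages_to_drop);	(once per page)
 *	put_pages_list(&pages_to_drop);
 */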

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.  The page still has PageWriteback set, which will pin it.
 *
 * We don't expect many pages to come through here, so don't bother batching
 * things up.
 *
 * To avoid placing the page at the tail of the LRU while PG_writeback is still
 * set, this function will clear PG_writeback before performing the page
 * motion.  Do that inside the lru lock because once PG_writeback is cleared
 * we may not touch the page.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct zone *zone;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	zone = page_zone(page);
	spin_lock_irqsave(&zone->lru_lock, flags);
	if (PageLRU(page) && !PageActive(page)) {
		list_move_tail(&page->lru, &zone->inactive_list);
		__count_vm_event(PGROTATED);
	}
	if (!test_clear_page_writeback(page))
		BUG();
	spin_unlock_irqrestore(&zone->lru_lock, flags);
	return 0;
}
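
/*
 * Example (illustrative sketch): a writeback-completion path could consume
 * the return value like this - when the rotate succeeded (returned 0),
 * PG_writeback has already been cleared under the lru lock, otherwise the
 * caller clears it itself:
 *
 *	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 *		if (!test_clear_page_writeback(page))
 *			BUG();
 *	}
 *	wake_up_page(page, PG_writeback);
 */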

void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);
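
/*
 * Example (illustrative): two touches of a cold page in the pagecache walk
 * it up the table above - the first call only sets PG_referenced, the
 * second promotes the page to the active list and accounts a PGACTIVATE
 * event:
 *
 *	mark_page_accessed(page);	inactive,unreferenced -> inactive,referenced
 *	mark_page_accessed(page);	inactive,referenced   -> active,unreferenced
 */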

/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}
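
/*
 * Example (illustrative sketch): code that instantiates a brand-new page
 * and wants it on the active list just calls lru_cache_add_active(); the
 * extra reference taken above keeps the page safe until the per-cpu pagevec
 * is flushed to the LRU in batches by __pagevec_lru_add_active():
 *
 *	struct page *page = alloc_page(GFP_HIGHUSER);
 *
 *	if (page)
 *		lru_cache_add_active(page);
 */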

static void __lru_add_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);

	/* CPU is dead, so no locking needed. */
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);
	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);
}

void lru_add_drain(void)
{
	__lru_add_drain(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irq(&zone->lru_lock);
				zone = pagezone;
				spin_lock_irq(&zone->lru_lock);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);

	pagevec_free(&pages_to_free);
}
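
/*
 * Example (illustrative sketch): callers holding a plain array of page
 * pointers, rather than a pagevec, can drop all their references in one
 * pass.  The @cold argument is only a hint telling the page allocator
 * whether the freed pages are likely to be cache-cold:
 *
 *	release_pages(pages, nr_pages, 0);	pages assumed cache-hot
 *	release_pages(pages, nr_pages, 1);	pages assumed cache-cold
 */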

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known not to be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}
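
/*
 * Example (illustrative sketch): reclaim-style code holding a populated
 * pagevec can try to shed buffer_heads before dropping its references.
 * The stripping is best-effort only - pages it cannot trylock are skipped:
 *
 *	pagevec_strip(&pvec);
 *	pagevec_release(&pvec);
 */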

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
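
/*
 * Example (illustrative sketch): the usual gang-lookup loop over a mapping.
 * @next tracks where the next batch should start; because there may be
 * holes in the index range it is advanced from the pages actually returned:
 *
 *	pgoff_t next = 0;
 *	struct pagevec pvec;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			next = page->index + 1;
 *			(examine the page here)
 *		}
 *		pagevec_release(&pvec);
 *	}
 */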

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);
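
/*
 * Example (illustrative sketch): walking only the dirty pages of a mapping,
 * writeback-style.  Unlike pagevec_lookup(), the index is passed by
 * reference and is advanced past the last page returned, so the loop needs
 * no manual bookkeeping:
 *
 *	pgoff_t index = 0;
 *	struct pagevec pvec;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_tag(&pvec, mapping, &index,
 *				PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
 *		(write out pvec.pages[0..pagevec_count(&pvec)-1])
 *		pagevec_release(&pvec);
 *	}
 */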

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}
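
/*
 * Example (illustrative sketch): charging and releasing an overcommit
 * reservation for a mapping of @len bytes.  A negative argument simply
 * undoes an earlier positive charge (callers usually reach this through a
 * vm_unacct_memory()-style wrapper):
 *
 *	vm_acct_memory(len >> PAGE_SHIFT);		charge
 *	vm_acct_memory(-(long)(len >> PAGE_SHIFT));	undo on failure
 */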

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		__lru_add_drain((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
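
/*
 * Note (illustrative): page_cluster is a power-of-two exponent, so the
 * values chosen above mean clusters of 1 << 2 = 4 pages on machines with
 * less than 16MB and 1 << 3 = 8 pages otherwise.  It remains tunable at
 * runtime via /proc/sys/vm/page-cluster (see Documentation/sysctl/vm.txt).
 */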