3/* Notebook:
4   fix mmap readahead to honour policy and enable policy for any page cache
5   object
6   statistics for bigpages
7   global policy for page cache? currently it uses process policy. Requires
8   first item above.
9   handle mremap for shared memory (currently ignored for the policy)
10   grows down?
   make bind policy root only? It can trigger OOM much faster and the
   kernel is not always graceful about that.
13   could replace all the switch()es with a mempolicy_ops structure.
14*/
15
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
42
43/* Internal flags */
44#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
45#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
46#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
47
48static struct kmem_cache *policy_cache;
49static struct kmem_cache *sn_cache;
50
51#define PDprintk(fmt...)
52
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
55enum zone_type policy_zone = 0;
56
57struct mempolicy default_policy = {
58	.refcnt = ATOMIC_INIT(1), /* never free it */
59	.policy = MPOL_DEFAULT,
60};
61
62/* Do sanity checking on a policy */
63static int mpol_check_policy(int mode, nodemask_t *nodes)
64{
65	int empty = nodes_empty(*nodes);
66
67	switch (mode) {
68	case MPOL_DEFAULT:
69		if (!empty)
70			return -EINVAL;
71		break;
72	case MPOL_BIND:
73	case MPOL_INTERLEAVE:
		/* MPOL_BIND and MPOL_INTERLEAVE need at least one node.
		   MPOL_PREFERRED is not checked here: it only uses the
		   first bit of the mask, and an empty mask is tolerated
		   (it falls back to local allocation). */
76		if (empty)
77			return -EINVAL;
78		break;
79	}
80	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
81}
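
/*
 * Illustrative examples of what mpol_check_policy() accepts (a sketch
 * derived from the checks above, not exercised anywhere in this file):
 *
 *	MPOL_DEFAULT    with an empty nodemask      -> 0
 *	MPOL_DEFAULT    with any node set           -> -EINVAL
 *	MPOL_BIND       with an empty nodemask      -> -EINVAL
 *	MPOL_INTERLEAVE with only offline nodes set -> -EINVAL
 *	MPOL_PREFERRED  skips the switch entirely; an empty mask passes
 *	                and later means "local allocation".
 */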
82
83/* Generate a custom zonelist for the BIND policy. */
84static struct zonelist *bind_zonelist(nodemask_t *nodes)
85{
86	struct zonelist *zl;
87	int num, max, nd;
88	enum zone_type k;
89
90	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
91	max++;			/* space for zlcache_ptr (see mmzone.h) */
92	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
93	if (!zl)
94		return ERR_PTR(-ENOMEM);
95	zl->zlcache_ptr = NULL;
96	num = 0;
97	/* First put in the highest zones from all nodes, then all the next
98	   lower zones etc. Avoid empty zones because the memory allocator
99	   doesn't like them. If you implement node hot removal you
100	   have to fix that. */
101	k = policy_zone;
102	while (1) {
103		for_each_node_mask(nd, *nodes) {
104			struct zone *z = &NODE_DATA(nd)->node_zones[k];
105			if (z->present_pages > 0)
106				zl->zones[num++] = z;
107		}
108		if (k == 0)
109			break;
110		k--;
111	}
112	if (num == 0) {
113		kfree(zl);
114		return ERR_PTR(-EINVAL);
115	}
116	zl->zones[num] = NULL;
117	return zl;
118}
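
/*
 * Example of the resulting ordering (illustrative only): with
 * policy_zone == ZONE_NORMAL and nodes = {0,1}, the zonelist built
 * above is roughly
 *
 *	node 0 NORMAL, node 1 NORMAL, node 0 DMA, node 1 DMA, NULL
 *
 * (with DMA32 in between where that zone exists), i.e. every node at
 * the highest zone first, then the next lower zone, with empty zones
 * skipped and a NULL terminator at the end.
 */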
119
120/* Create a new policy */
121static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
122{
123	struct mempolicy *policy;
124
125	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
126	if (mode == MPOL_DEFAULT)
127		return NULL;
128	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
129	if (!policy)
130		return ERR_PTR(-ENOMEM);
131	atomic_set(&policy->refcnt, 1);
132	switch (mode) {
133	case MPOL_INTERLEAVE:
134		policy->v.nodes = *nodes;
135		if (nodes_weight(*nodes) == 0) {
136			kmem_cache_free(policy_cache, policy);
137			return ERR_PTR(-EINVAL);
138		}
139		break;
140	case MPOL_PREFERRED:
141		policy->v.preferred_node = first_node(*nodes);
142		if (policy->v.preferred_node >= MAX_NUMNODES)
143			policy->v.preferred_node = -1;
144		break;
145	case MPOL_BIND:
146		policy->v.zonelist = bind_zonelist(nodes);
147		if (IS_ERR(policy->v.zonelist)) {
148			void *error_code = policy->v.zonelist;
149			kmem_cache_free(policy_cache, policy);
150			return error_code;
151		}
152		break;
153	}
154	policy->policy = mode;
155	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
156	return policy;
157}
158
159static void gather_stats(struct page *, void *, int pte_dirty);
160static void migrate_page_add(struct page *page, struct list_head *pagelist,
161				unsigned long flags);
162
163/* Scan through pages checking if pages follow certain conditions. */
164static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
165		unsigned long addr, unsigned long end,
166		const nodemask_t *nodes, unsigned long flags,
167		void *private)
168{
169	pte_t *orig_pte;
170	pte_t *pte;
171	spinlock_t *ptl;
172
173	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
174	do {
175		struct page *page;
176		int nid;
177
178		if (!pte_present(*pte))
179			continue;
180		page = vm_normal_page(vma, addr, *pte);
181		if (!page)
182			continue;
183		/*
184		 * The check for PageReserved here is important to avoid
185		 * handling zero pages and other pages that may have been
186		 * marked special by the system.
187		 *
		 * If PageReserved were not checked here then, for example,
		 * the location of the zero page could influence
		 * MPOL_MF_STRICT, zero pages would be counted in
		 * the per-node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
193		 */
194		if (PageReserved(page))
195			continue;
196		nid = page_to_nid(page);
197		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
198			continue;
199
200		if (flags & MPOL_MF_STATS)
201			gather_stats(page, private, pte_dirty(*pte));
202		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
203			migrate_page_add(page, private, flags);
204		else
205			break;
206	} while (pte++, addr += PAGE_SIZE, addr != end);
207	pte_unmap_unlock(orig_pte, ptl);
208	return addr != end;
209}
210
211static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
212		unsigned long addr, unsigned long end,
213		const nodemask_t *nodes, unsigned long flags,
214		void *private)
215{
216	pmd_t *pmd;
217	unsigned long next;
218
219	pmd = pmd_offset(pud, addr);
220	do {
221		next = pmd_addr_end(addr, end);
222		if (pmd_none_or_clear_bad(pmd))
223			continue;
224		if (check_pte_range(vma, pmd, addr, next, nodes,
225				    flags, private))
226			return -EIO;
227	} while (pmd++, addr = next, addr != end);
228	return 0;
229}
230
231static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
232		unsigned long addr, unsigned long end,
233		const nodemask_t *nodes, unsigned long flags,
234		void *private)
235{
236	pud_t *pud;
237	unsigned long next;
238
239	pud = pud_offset(pgd, addr);
240	do {
241		next = pud_addr_end(addr, end);
242		if (pud_none_or_clear_bad(pud))
243			continue;
244		if (check_pmd_range(vma, pud, addr, next, nodes,
245				    flags, private))
246			return -EIO;
247	} while (pud++, addr = next, addr != end);
248	return 0;
249}
250
251static inline int check_pgd_range(struct vm_area_struct *vma,
252		unsigned long addr, unsigned long end,
253		const nodemask_t *nodes, unsigned long flags,
254		void *private)
255{
256	pgd_t *pgd;
257	unsigned long next;
258
259	pgd = pgd_offset(vma->vm_mm, addr);
260	do {
261		next = pgd_addr_end(addr, end);
262		if (pgd_none_or_clear_bad(pgd))
263			continue;
264		if (check_pud_range(vma, pgd, addr, next, nodes,
265				    flags, private))
266			return -EIO;
267	} while (pgd++, addr = next, addr != end);
268	return 0;
269}
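
/*
 * The three wrappers above simply mirror the page table hierarchy:
 * check_pgd_range() -> check_pud_range() -> check_pmd_range() ->
 * check_pte_range().  The first PTE that maps a page outside the
 * requested nodemask (subject to MPOL_MF_INVERT) and is neither being
 * counted (MPOL_MF_STATS) nor queued for migration (MPOL_MF_MOVE*)
 * stops the PTE walk, and the whole chain then returns -EIO.
 */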
270
271/*
272 * Check if all pages in a range are on a set of nodes.
273 * If pagelist != NULL then isolate pages from the LRU and
274 * put them on the pagelist.
275 */
276static struct vm_area_struct *
277check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
278		const nodemask_t *nodes, unsigned long flags, void *private)
279{
280	int err;
281	struct vm_area_struct *first, *vma, *prev;
282
283	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
284
285		err = migrate_prep();
286		if (err)
287			return ERR_PTR(err);
288	}
289
290	first = find_vma(mm, start);
291	if (!first)
292		return ERR_PTR(-EFAULT);
293	prev = NULL;
294	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
295		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
296			if (!vma->vm_next && vma->vm_end < end)
297				return ERR_PTR(-EFAULT);
298			if (prev && prev->vm_end < vma->vm_start)
299				return ERR_PTR(-EFAULT);
300		}
301		if (!is_vm_hugetlb_page(vma) &&
302		    ((flags & MPOL_MF_STRICT) ||
303		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
304				vma_migratable(vma)))) {
305			unsigned long endvma = vma->vm_end;
306
307			if (endvma > end)
308				endvma = end;
309			if (vma->vm_start > start)
310				start = vma->vm_start;
311			err = check_pgd_range(vma, start, endvma, nodes,
312						flags, private);
313			if (err) {
314				first = ERR_PTR(err);
315				break;
316			}
317		}
318		prev = vma;
319	}
320	return first;
321}
322
323/* Apply policy to a single VMA */
324static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
325{
326	int err = 0;
327	struct mempolicy *old = vma->vm_policy;
328
329	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
330		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
331		 vma->vm_ops, vma->vm_file,
332		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
333
334	if (vma->vm_ops && vma->vm_ops->set_policy)
335		err = vma->vm_ops->set_policy(vma, new);
336	if (!err) {
337		mpol_get(new);
338		vma->vm_policy = new;
339		mpol_free(old);
340	}
341	return err;
342}
343
344/* Step 2: apply policy to a range and do splits. */
345static int mbind_range(struct vm_area_struct *vma, unsigned long start,
346		       unsigned long end, struct mempolicy *new)
347{
348	struct vm_area_struct *next;
349	int err;
350
351	err = 0;
352	for (; vma && vma->vm_start < end; vma = next) {
353		next = vma->vm_next;
354		if (vma->vm_start < start)
355			err = split_vma(vma->vm_mm, vma, start, 1);
356		if (!err && vma->vm_end > end)
357			err = split_vma(vma->vm_mm, vma, end, 0);
358		if (!err)
359			err = policy_vma(vma, new);
360		if (err)
361			break;
362	}
363	return err;
364}
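
/*
 * Example (illustrative): if an existing VMA covers 0x1000-0x5000 and
 * mbind_range() is asked to apply a policy to 0x2000-0x4000, the loop
 * above splits the VMA twice, leaving three VMAs
 *
 *	0x1000-0x2000 (old policy) | 0x2000-0x4000 (new) | 0x4000-0x5000 (old)
 *
 * and only the middle one gets the new policy via policy_vma().
 */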
365
366static int contextualize_policy(int mode, nodemask_t *nodes)
367{
368	if (!nodes)
369		return 0;
370
371	cpuset_update_task_memory_state();
372	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
373		return -EINVAL;
374	return mpol_check_policy(mode, nodes);
375}
376
377
378/*
379 * Update task->flags PF_MEMPOLICY bit: set iff non-default
380 * mempolicy.  Allows more rapid checking of this (combined perhaps
381 * with other PF_* flag bits) on memory allocation hot code paths.
382 *
383 * If called from outside this file, the task 'p' should -only- be
384 * a newly forked child not yet visible on the task list, because
385 * manipulating the task flags of a visible task is not safe.
386 *
387 * The above limitation is why this routine has the funny name
388 * mpol_fix_fork_child_flag().
389 *
390 * It is also safe to call this with a task pointer of current,
391 * which the static wrapper mpol_set_task_struct_flag() does,
392 * for use within this file.
393 */
394
395void mpol_fix_fork_child_flag(struct task_struct *p)
396{
397	if (p->mempolicy)
398		p->flags |= PF_MEMPOLICY;
399	else
400		p->flags &= ~PF_MEMPOLICY;
401}
402
403static void mpol_set_task_struct_flag(void)
404{
405	mpol_fix_fork_child_flag(current);
406}
407
408/* Set the process memory policy */
409long do_set_mempolicy(int mode, nodemask_t *nodes)
410{
411	struct mempolicy *new;
412
413	if (contextualize_policy(mode, nodes))
414		return -EINVAL;
415	new = mpol_new(mode, nodes);
416	if (IS_ERR(new))
417		return PTR_ERR(new);
418	mpol_free(current->mempolicy);
419	current->mempolicy = new;
420	mpol_set_task_struct_flag();
421	if (new && new->policy == MPOL_INTERLEAVE)
422		current->il_next = first_node(new->v.nodes);
423	return 0;
424}
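
/*
 * do_set_mempolicy() backs the set_mempolicy(2) system call.  A minimal
 * userspace sketch (illustrative; set_mempolicy() and MPOL_INTERLEAVE
 * are assumed to come from userspace NUMA headers, not from this file):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 * After this, interleaved allocations for the task start at
 * current->il_next, which the code above seeds with the first node of
 * the mask.
 */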
425
426/* Fill a zone bitmap for a policy */
427static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
428{
429	int i;
430
431	nodes_clear(*nodes);
432	switch (p->policy) {
433	case MPOL_BIND:
434		for (i = 0; p->v.zonelist->zones[i]; i++)
435			node_set(zone_to_nid(p->v.zonelist->zones[i]),
436				*nodes);
437		break;
438	case MPOL_DEFAULT:
439		break;
440	case MPOL_INTERLEAVE:
441		*nodes = p->v.nodes;
442		break;
443	case MPOL_PREFERRED:
444		/* or use current node instead of online map? */
445		if (p->v.preferred_node < 0)
446			*nodes = node_online_map;
447		else
448			node_set(p->v.preferred_node, *nodes);
449		break;
450	default:
451		BUG();
452	}
453}
454
455static int lookup_node(struct mm_struct *mm, unsigned long addr)
456{
457	struct page *p;
458	int err;
459
460	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
461	if (err >= 0) {
462		err = page_to_nid(p);
463		put_page(p);
464	}
465	return err;
466}
467
468/* Retrieve NUMA policy */
469long do_get_mempolicy(int *policy, nodemask_t *nmask,
470			unsigned long addr, unsigned long flags)
471{
472	int err;
473	struct mm_struct *mm = current->mm;
474	struct vm_area_struct *vma = NULL;
475	struct mempolicy *pol = current->mempolicy;
476
477	cpuset_update_task_memory_state();
478	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
479		return -EINVAL;
480	if (flags & MPOL_F_ADDR) {
481		down_read(&mm->mmap_sem);
482		vma = find_vma_intersection(mm, addr, addr+1);
483		if (!vma) {
484			up_read(&mm->mmap_sem);
485			return -EFAULT;
486		}
487		if (vma->vm_ops && vma->vm_ops->get_policy)
488			pol = vma->vm_ops->get_policy(vma, addr);
489		else
490			pol = vma->vm_policy;
491	} else if (addr)
492		return -EINVAL;
493
494	if (!pol)
495		pol = &default_policy;
496
497	if (flags & MPOL_F_NODE) {
498		if (flags & MPOL_F_ADDR) {
499			err = lookup_node(mm, addr);
500			if (err < 0)
501				goto out;
502			*policy = err;
503		} else if (pol == current->mempolicy &&
504				pol->policy == MPOL_INTERLEAVE) {
505			*policy = current->il_next;
506		} else {
507			err = -EINVAL;
508			goto out;
509		}
510	} else
511		*policy = pol->policy;
512
513	if (vma) {
514		up_read(&current->mm->mmap_sem);
515		vma = NULL;
516	}
517
518	err = 0;
519	if (nmask)
520		get_zonemask(pol, nmask);
521
522 out:
523	if (vma)
524		up_read(&current->mm->mmap_sem);
525	return err;
526}
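
/*
 * Flag semantics of do_get_mempolicy(), summarized from the code above:
 *
 *	flags == 0                   *policy = the task policy mode
 *	MPOL_F_ADDR                  *policy = mode of the policy covering addr
 *	MPOL_F_NODE                  *policy = next interleave node (only valid
 *	                             for an MPOL_INTERLEAVE task policy)
 *	MPOL_F_NODE | MPOL_F_ADDR    *policy = node the page at addr sits on
 *	                             (the page is faulted in by lookup_node())
 *
 * On success, nmask (when non-NULL) receives the nodemask of the
 * selected policy via get_zonemask().
 */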
527
528#ifdef CONFIG_MIGRATION
529/*
530 * page migration
531 */
532static void migrate_page_add(struct page *page, struct list_head *pagelist,
533				unsigned long flags)
534{
535	/*
536	 * Avoid migrating a page that is shared with others.
537	 */
538	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
539		isolate_lru_page(page, pagelist);
540}
541
542static struct page *new_node_page(struct page *page, unsigned long node, int **x)
543{
544	return alloc_pages_node(node, GFP_HIGHUSER, 0);
545}
546
547/*
548 * Migrate pages from one node to a target node.
549 * Returns error or the number of pages not migrated.
550 */
551int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
552{
553	nodemask_t nmask;
554	LIST_HEAD(pagelist);
555	int err = 0;
556
557	nodes_clear(nmask);
558	node_set(source, nmask);
559
560	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
561			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
562
563	if (!list_empty(&pagelist))
564		err = migrate_pages(&pagelist, new_node_page, dest);
565
566	return err;
567}
568
569/*
570 * Move pages between the two nodesets so as to preserve the physical
571 * layout as much as possible.
572 *
 * Returns the number of pages that could not be moved.
574 */
575int do_migrate_pages(struct mm_struct *mm,
576	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
577{
578	LIST_HEAD(pagelist);
579	int busy = 0;
580	int err = 0;
581	nodemask_t tmp;
582
	down_read(&mm->mmap_sem);
584
585	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
586	if (err)
587		goto out;
588
589/*
590 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
591 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
592 * bit in 'tmp', and return that <source, dest> pair for migration.
593 * The pair of nodemasks 'to' and 'from' define the map.
594 *
595 * If no pair of bits is found that way, fallback to picking some
596 * pair of 'source' and 'dest' bits that are not the same.  If the
597 * 'source' and 'dest' bits are the same, this represents a node
598 * that will be migrating to itself, so no pages need move.
599 *
600 * If no bits are left in 'tmp', or if all remaining bits left
601 * in 'tmp' correspond to the same bit in 'to', return false
602 * (nothing left to migrate).
603 *
604 * This lets us pick a pair of nodes to migrate between, such that
605 * if possible the dest node is not already occupied by some other
606 * source node, minimizing the risk of overloading the memory on a
607 * node that would happen if we migrated incoming memory to a node
608 * before migrating outgoing memory source that same node.
609 *
610 * A single scan of tmp is sufficient.  As we go, we remember the
611 * most recent <s, d> pair that moved (s != d).  If we find a pair
612 * that not only moved, but what's better, moved to an empty slot
613 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
615 * most recent <s, d> pair that moved.  If we get all the way through
616 * the scan of tmp without finding any node that moved, much less
617 * moved to an empty node, then there is nothing left worth migrating.
618 */
619
620	tmp = *from_nodes;
621	while (!nodes_empty(tmp)) {
622		int s,d;
623		int source = -1;
624		int dest = 0;
625
626		for_each_node_mask(s, tmp) {
627			d = node_remap(s, *from_nodes, *to_nodes);
628			if (s == d)
629				continue;
630
631			source = s;	/* Node moved. Memorize */
632			dest = d;
633
634			/* dest not in remaining from nodes? */
635			if (!node_isset(dest, tmp))
636				break;
637		}
638		if (source == -1)
639			break;
640
641		node_clear(source, tmp);
642		err = migrate_to_node(mm, source, dest, flags);
643		if (err > 0)
644			busy += err;
645		if (err < 0)
646			break;
647	}
648out:
649	up_read(&mm->mmap_sem);
650	if (err < 0)
651		return err;
652	return busy;
653
654}
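
/*
 * Worked example of the pairing loop above (illustrative): with
 * from_nodes = {0,1} and to_nodes = {2,3}, node_remap() maps 0->2 and
 * 1->3.  Neither destination is in the remaining source set, so the
 * first pass breaks out immediately with <0,2>, migrates node 0 to
 * node 2, clears bit 0 from tmp, and the next pass picks <1,3>.  With
 * from = {0,1} and to = {1,2}, the scan prefers <1,2> (an "empty"
 * destination) over <0,1>, so node 1 is drained before any memory is
 * moved onto it.
 */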
655
656static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
657{
658	struct vm_area_struct *vma = (struct vm_area_struct *)private;
659
660	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
661}
662#else
663
664static void migrate_page_add(struct page *page, struct list_head *pagelist,
665				unsigned long flags)
666{
667}
668
669int do_migrate_pages(struct mm_struct *mm,
670	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
671{
672	return -ENOSYS;
673}
674
675static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
676{
677	return NULL;
678}
679#endif
680
681long do_mbind(unsigned long start, unsigned long len,
682		unsigned long mode, nodemask_t *nmask, unsigned long flags)
683{
684	struct vm_area_struct *vma;
685	struct mm_struct *mm = current->mm;
686	struct mempolicy *new;
687	unsigned long end;
688	int err;
689	LIST_HEAD(pagelist);
690
691	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
692				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
693	    || mode > MPOL_MAX)
694		return -EINVAL;
695	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
696		return -EPERM;
697
698	if (start & ~PAGE_MASK)
699		return -EINVAL;
700
701	if (mode == MPOL_DEFAULT)
702		flags &= ~MPOL_MF_STRICT;
703
704	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
705	end = start + len;
706
707	if (end < start)
708		return -EINVAL;
709	if (end == start)
710		return 0;
711
712	if (mpol_check_policy(mode, nmask))
713		return -EINVAL;
714
715	new = mpol_new(mode, nmask);
716	if (IS_ERR(new))
717		return PTR_ERR(new);
718
719	/*
720	 * If we are using the default policy then operation
721	 * on discontinuous address spaces is okay after all
722	 */
723	if (!new)
724		flags |= MPOL_MF_DISCONTIG_OK;
725
	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
			mode, nodes_addr(*nmask)[0]);
728
729	down_write(&mm->mmap_sem);
730	vma = check_range(mm, start, end, nmask,
731			  flags | MPOL_MF_INVERT, &pagelist);
732
733	err = PTR_ERR(vma);
734	if (!IS_ERR(vma)) {
735		int nr_failed = 0;
736
737		err = mbind_range(vma, start, end, new);
738
739		if (!list_empty(&pagelist))
740			nr_failed = migrate_pages(&pagelist, new_vma_page,
741						(unsigned long)vma);
742
743		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
744			err = -EIO;
745	}
746
747	up_write(&mm->mmap_sem);
748	mpol_free(new);
749	return err;
750}
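
/*
 * do_mbind() backs the mbind(2) system call.  A minimal userspace
 * sketch (illustrative; mbind(), mmap() and the MPOL_* constants are
 * assumed to come from userspace headers, not from this file):
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 2;			// node 2 only
 *	mbind(p, len, MPOL_BIND, &mask, sizeof(mask) * 8,
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 *
 * With MPOL_MF_MOVE, pages already faulted in on other nodes are
 * migrated; with MPOL_MF_STRICT, pages that could not be migrated make
 * the call fail with -EIO, matching the nr_failed handling above.
 */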
751
752/*
753 * User space interface with variable sized bitmaps for nodelists.
754 */
755
756/* Copy a node mask from user space. */
757static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
758		     unsigned long maxnode)
759{
760	unsigned long k;
761	unsigned long nlongs;
762	unsigned long endmask;
763
764	--maxnode;
765	nodes_clear(*nodes);
766	if (maxnode == 0 || !nmask)
767		return 0;
768	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
769		return -EINVAL;
770
771	nlongs = BITS_TO_LONGS(maxnode);
772	if ((maxnode % BITS_PER_LONG) == 0)
773		endmask = ~0UL;
774	else
775		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
776
	/* When the user specified more nodes than supported, just check
	   that the unsupported part is all zero. */
779	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
780		if (nlongs > PAGE_SIZE/sizeof(long))
781			return -EINVAL;
782		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
783			unsigned long t;
784			if (get_user(t, nmask + k))
785				return -EFAULT;
786			if (k == nlongs - 1) {
787				if (t & endmask)
788					return -EINVAL;
789			} else if (t)
790				return -EINVAL;
791		}
792		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
793		endmask = ~0UL;
794	}
795
796	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
797		return -EFAULT;
798	nodes_addr(*nodes)[nlongs-1] &= endmask;
799	return 0;
800}
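
/*
 * Worked example of the endmask computation above (illustrative): for
 * maxnode = 65, after --maxnode we have maxnode = 64, so nlongs = 1 on
 * a 64-bit kernel and endmask = ~0UL, i.e. the whole first long is
 * used.  For maxnode = 17, maxnode becomes 16 and endmask =
 * (1UL << 16) - 1, so only the low 16 bits of the copied word survive
 * the final "&= endmask".
 */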
801
802/* Copy a kernel node mask to user space */
803static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
804			      nodemask_t *nodes)
805{
806	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
807	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
808
809	if (copy > nbytes) {
810		if (copy > PAGE_SIZE)
811			return -EINVAL;
812		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
813			return -EFAULT;
814		copy = nbytes;
815	}
816	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
817}
818
819asmlinkage long sys_mbind(unsigned long start, unsigned long len,
820			unsigned long mode,
821			unsigned long __user *nmask, unsigned long maxnode,
822			unsigned flags)
823{
824	nodemask_t nodes;
825	int err;
826
827	err = get_nodes(&nodes, nmask, maxnode);
828	if (err)
829		return err;
830#ifdef CONFIG_CPUSETS
831	/* Restrict the nodes to the allowed nodes in the cpuset */
832	nodes_and(nodes, nodes, current->mems_allowed);
833#endif
834	return do_mbind(start, len, mode, &nodes, flags);
835}
836
837/* Set the process memory policy */
838asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
839		unsigned long maxnode)
840{
841	int err;
842	nodemask_t nodes;
843
844	if (mode < 0 || mode > MPOL_MAX)
845		return -EINVAL;
846	err = get_nodes(&nodes, nmask, maxnode);
847	if (err)
848		return err;
849	return do_set_mempolicy(mode, &nodes);
850}
851
852asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
853		const unsigned long __user *old_nodes,
854		const unsigned long __user *new_nodes)
855{
856	struct mm_struct *mm;
857	struct task_struct *task;
858	nodemask_t old;
859	nodemask_t new;
860	nodemask_t task_nodes;
861	int err;
862
863	err = get_nodes(&old, old_nodes, maxnode);
864	if (err)
865		return err;
866
867	err = get_nodes(&new, new_nodes, maxnode);
868	if (err)
869		return err;
870
871	/* Find the mm_struct */
872	read_lock(&tasklist_lock);
873	task = pid ? find_task_by_pid(pid) : current;
874	if (!task) {
875		read_unlock(&tasklist_lock);
876		return -ESRCH;
877	}
878	mm = get_task_mm(task);
879	read_unlock(&tasklist_lock);
880
881	if (!mm)
882		return -EINVAL;
883
884	/*
885	 * Check if this process has the right to modify the specified
886	 * process. The right exists if the process has administrative
887	 * capabilities, superuser privileges or the same
888	 * userid as the target process.
889	 */
890	if ((current->euid != task->suid) && (current->euid != task->uid) &&
891	    (current->uid != task->suid) && (current->uid != task->uid) &&
892	    !capable(CAP_SYS_NICE)) {
893		err = -EPERM;
894		goto out;
895	}
896
897	task_nodes = cpuset_mems_allowed(task);
898	/* Is the user allowed to access the target nodes? */
899	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
900		err = -EPERM;
901		goto out;
902	}
903
904	err = security_task_movememory(task);
905	if (err)
906		goto out;
907
908	err = do_migrate_pages(mm, &old, &new,
909		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
910out:
911	mmput(mm);
912	return err;
913}
914
915
916/* Retrieve NUMA policy */
917asmlinkage long sys_get_mempolicy(int __user *policy,
918				unsigned long __user *nmask,
919				unsigned long maxnode,
920				unsigned long addr, unsigned long flags)
921{
922	int err, pval;
923	nodemask_t nodes;
924
925	if (nmask != NULL && maxnode < MAX_NUMNODES)
926		return -EINVAL;
927
928	err = do_get_mempolicy(&pval, &nodes, addr, flags);
929
930	if (err)
931		return err;
932
933	if (policy && put_user(pval, policy))
934		return -EFAULT;
935
936	if (nmask)
937		err = copy_nodes_to_user(nmask, maxnode, &nodes);
938
939	return err;
940}
941
942#ifdef CONFIG_COMPAT
943
944asmlinkage long compat_sys_get_mempolicy(int __user *policy,
945				     compat_ulong_t __user *nmask,
946				     compat_ulong_t maxnode,
947				     compat_ulong_t addr, compat_ulong_t flags)
948{
949	long err;
950	unsigned long __user *nm = NULL;
951	unsigned long nr_bits, alloc_size;
952	DECLARE_BITMAP(bm, MAX_NUMNODES);
953
954	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
955	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
956
957	if (nmask)
958		nm = compat_alloc_user_space(alloc_size);
959
960	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
961
962	if (!err && nmask) {
963		err = copy_from_user(bm, nm, alloc_size);
964		/* ensure entire bitmap is zeroed */
965		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
966		err |= compat_put_bitmap(nmask, bm, nr_bits);
967	}
968
969	return err;
970}
971
972asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
973				     compat_ulong_t maxnode)
974{
975	long err = 0;
976	unsigned long __user *nm = NULL;
977	unsigned long nr_bits, alloc_size;
978	DECLARE_BITMAP(bm, MAX_NUMNODES);
979
980	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
981	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
982
983	if (nmask) {
984		err = compat_get_bitmap(bm, nmask, nr_bits);
985		nm = compat_alloc_user_space(alloc_size);
986		err |= copy_to_user(nm, bm, alloc_size);
987	}
988
989	if (err)
990		return -EFAULT;
991
992	return sys_set_mempolicy(mode, nm, nr_bits+1);
993}
994
995asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
996			     compat_ulong_t mode, compat_ulong_t __user *nmask,
997			     compat_ulong_t maxnode, compat_ulong_t flags)
998{
999	long err = 0;
1000	unsigned long __user *nm = NULL;
1001	unsigned long nr_bits, alloc_size;
1002	nodemask_t bm;
1003
1004	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1005	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1006
1007	if (nmask) {
1008		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1009		nm = compat_alloc_user_space(alloc_size);
1010		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1011	}
1012
1013	if (err)
1014		return -EFAULT;
1015
1016	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1017}
1018
1019#endif
1020
1021/* Return effective policy for a VMA */
1022static struct mempolicy * get_vma_policy(struct task_struct *task,
1023		struct vm_area_struct *vma, unsigned long addr)
1024{
1025	struct mempolicy *pol = task->mempolicy;
1026
1027	if (vma) {
1028		if (vma->vm_ops && vma->vm_ops->get_policy)
1029			pol = vma->vm_ops->get_policy(vma, addr);
1030		else if (vma->vm_policy &&
1031				vma->vm_policy->policy != MPOL_DEFAULT)
1032			pol = vma->vm_policy;
1033	}
1034	if (!pol)
1035		pol = &default_policy;
1036	return pol;
1037}
1038
1039/* Return a zonelist representing a mempolicy */
1040static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1041{
1042	int nd;
1043
1044	switch (policy->policy) {
1045	case MPOL_PREFERRED:
1046		nd = policy->v.preferred_node;
1047		if (nd < 0)
1048			nd = numa_node_id();
1049		break;
1050	case MPOL_BIND:
1051		/* Lower zones don't get a policy applied */
1052		/* Careful: current->mems_allowed might have moved */
1053		if (gfp_zone(gfp) >= policy_zone)
1054			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1055				return policy->v.zonelist;
1056		/*FALL THROUGH*/
1057	case MPOL_INTERLEAVE: /* should not happen */
1058	case MPOL_DEFAULT:
1059		nd = numa_node_id();
1060		break;
1061	default:
1062		nd = 0;
1063		BUG();
1064	}
1065	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1066}
1067
1068/* Do dynamic interleaving for a process */
1069static unsigned interleave_nodes(struct mempolicy *policy)
1070{
1071	unsigned nid, next;
1072	struct task_struct *me = current;
1073
1074	nid = me->il_next;
1075	next = next_node(nid, policy->v.nodes);
1076	if (next >= MAX_NUMNODES)
1077		next = first_node(policy->v.nodes);
1078	me->il_next = next;
1079	return nid;
1080}
1081
1082/*
1083 * Depending on the memory policy provide a node from which to allocate the
1084 * next slab entry.
1085 */
1086unsigned slab_node(struct mempolicy *policy)
1087{
1088	int pol = policy ? policy->policy : MPOL_DEFAULT;
1089
1090	switch (pol) {
1091	case MPOL_INTERLEAVE:
1092		return interleave_nodes(policy);
1093
1094	case MPOL_BIND:
1095		/*
1096		 * Follow bind policy behavior and start allocation at the
1097		 * first node.
1098		 */
1099		return zone_to_nid(policy->v.zonelist->zones[0]);
1100
1101	case MPOL_PREFERRED:
1102		if (policy->v.preferred_node >= 0)
1103			return policy->v.preferred_node;
1104		/* Fall through */
1105
1106	default:
1107		return numa_node_id();
1108	}
1109}
1110
1111/* Do static interleaving for a VMA with known offset. */
1112static unsigned offset_il_node(struct mempolicy *pol,
1113		struct vm_area_struct *vma, unsigned long off)
1114{
1115	unsigned nnodes = nodes_weight(pol->v.nodes);
1116	unsigned target = (unsigned)off % nnodes;
1117	int c;
1118	int nid = -1;
1119
1120	c = 0;
1121	do {
1122		nid = next_node(nid, pol->v.nodes);
1123		c++;
1124	} while (c <= target);
1125	return nid;
1126}
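
/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} and
 * off = 7, nnodes = 3 and target = 7 % 3 = 1, so the do/while calls
 * next_node() twice (to node 1, then node 3) and returns node 3, the
 * second node of the mask.
 */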
1127
1128/* Determine a node number for interleave */
1129static inline unsigned interleave_nid(struct mempolicy *pol,
1130		 struct vm_area_struct *vma, unsigned long addr, int shift)
1131{
1132	if (vma) {
1133		unsigned long off;
1134
1135		/*
1136		 * for small pages, there is no difference between
1137		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1138		 * for huge pages, since vm_pgoff is in units of small
1139		 * pages, we need to shift off the always 0 bits to get
1140		 * a useful offset.
1141		 */
1142		BUG_ON(shift < PAGE_SHIFT);
1143		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1144		off += (addr - vma->vm_start) >> shift;
1145		return offset_il_node(pol, vma, off);
1146	} else
1147		return interleave_nodes(pol);
1148}
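
/*
 * Example of the offset computation above (illustrative): for a huge
 * page VMA with shift = HPAGE_SHIFT (e.g. 21 for 2MB pages) and
 * PAGE_SHIFT = 12, vm_pgoff is first scaled from small-page to
 * huge-page units (>> 9), then the huge-page index of addr within the
 * VMA is added, so two faults on the same huge page always interleave
 * onto the same node.
 */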
1149
1150#ifdef CONFIG_HUGETLBFS
1151/* Return a zonelist suitable for a huge page allocation. */
1152struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1153{
1154	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1155
1156	if (pol->policy == MPOL_INTERLEAVE) {
1157		unsigned nid;
1158
1159		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1160		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1161	}
1162	return zonelist_policy(GFP_HIGHUSER, pol);
1163}
1164#endif
1165
/* Allocate a page under the interleave policy.
   Separate path because it needs to do special accounting. */
1168static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1169					unsigned nid)
1170{
1171	struct zonelist *zl;
1172	struct page *page;
1173
1174	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1175	page = __alloc_pages(gfp, order, zl);
1176	if (page && page_zone(page) == zl->zones[0])
1177		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1178	return page;
1179}
1180
1181/**
1182 * 	alloc_page_vma	- Allocate a page for a VMA.
1183 *
1184 * 	@gfp:
1185 *      %GFP_USER    user allocation.
1186 *      %GFP_KERNEL  kernel allocations,
1187 *      %GFP_HIGHMEM highmem/user allocations,
1188 *      %GFP_FS      allocation should not call back into a file system.
1189 *      %GFP_ATOMIC  don't sleep.
1190 *
1191 * 	@vma:  Pointer to VMA or NULL if not available.
1192 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1193 *
1194 * 	This function allocates a page from the kernel page pool and applies
1195 *	a NUMA policy associated with the VMA or the current process.
1196 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1197 *	mm_struct of the VMA to prevent it from going away. Should be used for
1198 *	all allocations for pages that will be mapped into
1199 * 	user space. Returns NULL when no page can be allocated.
1200 *
 *	Should be called with the mmap_sem of the vma's mm_struct held.
1202 */
1203struct page *
1204alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1205{
1206	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1207
1208	cpuset_update_task_memory_state();
1209
1210	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1211		unsigned nid;
1212
1213		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1214		return alloc_page_interleave(gfp, 0, nid);
1215	}
1216	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1217}
1218
1219/**
1220 * 	alloc_pages_current - Allocate pages.
1221 *
1222 *	@gfp:
1223 *		%GFP_USER   user allocation,
1224 *      	%GFP_KERNEL kernel allocation,
1225 *      	%GFP_HIGHMEM highmem allocation,
1226 *      	%GFP_FS     don't call back into a file system.
1227 *      	%GFP_ATOMIC don't sleep.
1228 *	@order: Power of two of allocation size in pages. 0 is a single page.
1229 *
 *	Allocate pages from the kernel page pool.  When not in
 *	interrupt context, apply the current process's NUMA policy.
1232 *	Returns NULL when no page can be allocated.
1233 *
1234 *	Don't call cpuset_update_task_memory_state() unless
1235 *	1) it's ok to take cpuset_sem (can WAIT), and
1236 *	2) allocating for current task (not interrupt).
1237 */
1238struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1239{
1240	struct mempolicy *pol = current->mempolicy;
1241
1242	if ((gfp & __GFP_WAIT) && !in_interrupt())
1243		cpuset_update_task_memory_state();
1244	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1245		pol = &default_policy;
1246	if (pol->policy == MPOL_INTERLEAVE)
1247		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1248	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1249}
1250EXPORT_SYMBOL(alloc_pages_current);
1251
1252/*
1253 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * update_nodemask() in kernel/cpuset.c for details.
1258 */
1259void *cpuset_being_rebound;
1260
1261/* Slow path of a mempolicy copy */
1262struct mempolicy *__mpol_copy(struct mempolicy *old)
1263{
1264	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1265
1266	if (!new)
1267		return ERR_PTR(-ENOMEM);
1268	if (current_cpuset_is_being_rebound()) {
1269		nodemask_t mems = cpuset_mems_allowed(current);
1270		mpol_rebind_policy(old, &mems);
1271	}
1272	*new = *old;
1273	atomic_set(&new->refcnt, 1);
1274	if (new->policy == MPOL_BIND) {
1275		int sz = ksize(old->v.zonelist);
1276		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1277		if (!new->v.zonelist) {
1278			kmem_cache_free(policy_cache, new);
1279			return ERR_PTR(-ENOMEM);
1280		}
1281	}
1282	return new;
1283}
1284
1285/* Slow path of a mempolicy comparison */
1286int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1287{
1288	if (!a || !b)
1289		return 0;
1290	if (a->policy != b->policy)
1291		return 0;
1292	switch (a->policy) {
1293	case MPOL_DEFAULT:
1294		return 1;
1295	case MPOL_INTERLEAVE:
1296		return nodes_equal(a->v.nodes, b->v.nodes);
1297	case MPOL_PREFERRED:
1298		return a->v.preferred_node == b->v.preferred_node;
1299	case MPOL_BIND: {
1300		int i;
1301		for (i = 0; a->v.zonelist->zones[i]; i++)
1302			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1303				return 0;
1304		return b->v.zonelist->zones[i] == NULL;
1305	}
1306	default:
1307		BUG();
1308		return 0;
1309	}
1310}
1311
1312/* Slow path of a mpol destructor. */
1313void __mpol_free(struct mempolicy *p)
1314{
1315	if (!atomic_dec_and_test(&p->refcnt))
1316		return;
1317	if (p->policy == MPOL_BIND)
1318		kfree(p->v.zonelist);
1319	p->policy = MPOL_DEFAULT;
1320	kmem_cache_free(policy_cache, p);
1321}
1322
1323/*
1324 * Shared memory backing store policy support.
1325 *
1326 * Remember policies even when nobody has shared memory mapped.
1327 * The policies are kept in Red-Black tree linked from the inode.
1328 * They are protected by the sp->lock spinlock, which should be held
1329 * for any accesses to the tree.
1330 */
1331
1332/* lookup first element intersecting start-end */
1333/* Caller holds sp->lock */
1334static struct sp_node *
1335sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1336{
1337	struct rb_node *n = sp->root.rb_node;
1338
1339	while (n) {
1340		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1341
1342		if (start >= p->end)
1343			n = n->rb_right;
1344		else if (end <= p->start)
1345			n = n->rb_left;
1346		else
1347			break;
1348	}
1349	if (!n)
1350		return NULL;
1351	for (;;) {
1352		struct sp_node *w = NULL;
1353		struct rb_node *prev = rb_prev(n);
1354		if (!prev)
1355			break;
1356		w = rb_entry(prev, struct sp_node, nd);
1357		if (w->end <= start)
1358			break;
1359		n = prev;
1360	}
1361	return rb_entry(n, struct sp_node, nd);
1362}
1363
1364/* Insert a new shared policy into the list. */
1365/* Caller holds sp->lock */
1366static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1367{
1368	struct rb_node **p = &sp->root.rb_node;
1369	struct rb_node *parent = NULL;
1370	struct sp_node *nd;
1371
1372	while (*p) {
1373		parent = *p;
1374		nd = rb_entry(parent, struct sp_node, nd);
1375		if (new->start < nd->start)
1376			p = &(*p)->rb_left;
1377		else if (new->end > nd->end)
1378			p = &(*p)->rb_right;
1379		else
1380			BUG();
1381	}
1382	rb_link_node(&new->nd, parent, p);
1383	rb_insert_color(&new->nd, &sp->root);
1384	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1385		 new->policy ? new->policy->policy : 0);
1386}
1387
1388/* Find shared policy intersecting idx */
1389struct mempolicy *
1390mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1391{
1392	struct mempolicy *pol = NULL;
1393	struct sp_node *sn;
1394
1395	if (!sp->root.rb_node)
1396		return NULL;
1397	spin_lock(&sp->lock);
1398	sn = sp_lookup(sp, idx, idx+1);
1399	if (sn) {
1400		mpol_get(sn->policy);
1401		pol = sn->policy;
1402	}
1403	spin_unlock(&sp->lock);
1404	return pol;
1405}
1406
1407static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1408{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1410	rb_erase(&n->nd, &sp->root);
1411	mpol_free(n->policy);
1412	kmem_cache_free(sn_cache, n);
1413}
1414
1415struct sp_node *
1416sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1417{
1418	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1419
1420	if (!n)
1421		return NULL;
1422	n->start = start;
1423	n->end = end;
1424	mpol_get(pol);
1425	n->policy = pol;
1426	return n;
1427}
1428
1429/* Replace a policy range. */
1430static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1431				 unsigned long end, struct sp_node *new)
1432{
1433	struct sp_node *n, *new2 = NULL;
1434
1435restart:
1436	spin_lock(&sp->lock);
1437	n = sp_lookup(sp, start, end);
1438	/* Take care of old policies in the same range. */
1439	while (n && n->start < end) {
1440		struct rb_node *next = rb_next(&n->nd);
1441		if (n->start >= start) {
1442			if (n->end <= end)
1443				sp_delete(sp, n);
1444			else
1445				n->start = end;
1446		} else {
1447			/* Old policy spanning whole new range. */
1448			if (n->end > end) {
1449				if (!new2) {
1450					spin_unlock(&sp->lock);
1451					new2 = sp_alloc(end, n->end, n->policy);
1452					if (!new2)
1453						return -ENOMEM;
1454					goto restart;
1455				}
1456				n->end = start;
1457				sp_insert(sp, new2);
1458				new2 = NULL;
1459				break;
1460			} else
1461				n->end = start;
1462		}
1463		if (!next)
1464			break;
1465		n = rb_entry(next, struct sp_node, nd);
1466	}
1467	if (new)
1468		sp_insert(sp, new);
1469	spin_unlock(&sp->lock);
1470	if (new2) {
1471		mpol_free(new2->policy);
1472		kmem_cache_free(sn_cache, new2);
1473	}
1474	return 0;
1475}
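
/*
 * Example of the range surgery above (illustrative): if the tree holds
 * a single node covering pgoff 0-100 and a new policy is installed for
 * 40-60, the old node is clipped to 0-40, a copy of it (new2) is
 * inserted for 60-100, and the new node fills 40-60, so the tree ends
 * up with three non-overlapping ranges.
 */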
1476
1477void mpol_shared_policy_init(struct shared_policy *info, int policy,
1478				nodemask_t *policy_nodes)
1479{
1480	info->root = RB_ROOT;
1481	spin_lock_init(&info->lock);
1482
1483	if (policy != MPOL_DEFAULT) {
1484		struct mempolicy *newpol;
1485
1486		/* Falls back to MPOL_DEFAULT on any error */
1487		newpol = mpol_new(policy, policy_nodes);
1488		if (!IS_ERR(newpol)) {
1489			/* Create pseudo-vma that contains just the policy */
1490			struct vm_area_struct pvma;
1491
1492			memset(&pvma, 0, sizeof(struct vm_area_struct));
1493			/* Policy covers entire file */
1494			pvma.vm_end = TASK_SIZE;
1495			mpol_set_shared_policy(info, &pvma, newpol);
1496			mpol_free(newpol);
1497		}
1498	}
1499}
1500
1501int mpol_set_shared_policy(struct shared_policy *info,
1502			struct vm_area_struct *vma, struct mempolicy *npol)
1503{
1504	int err;
1505	struct sp_node *new = NULL;
1506	unsigned long sz = vma_pages(vma);
1507
1508	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1509		 vma->vm_pgoff,
1510		 sz, npol? npol->policy : -1,
1511		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1512
1513	if (npol) {
1514		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1515		if (!new)
1516			return -ENOMEM;
1517	}
1518	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1519	if (err && new)
1520		kmem_cache_free(sn_cache, new);
1521	return err;
1522}
1523
1524/* Free a backing policy store on inode delete. */
1525void mpol_free_shared_policy(struct shared_policy *p)
1526{
1527	struct sp_node *n;
1528	struct rb_node *next;
1529
1530	if (!p->root.rb_node)
1531		return;
1532	spin_lock(&p->lock);
1533	next = rb_first(&p->root);
1534	while (next) {
1535		n = rb_entry(next, struct sp_node, nd);
1536		next = rb_next(&n->nd);
1537		rb_erase(&n->nd, &p->root);
1538		mpol_free(n->policy);
1539		kmem_cache_free(sn_cache, n);
1540	}
1541	spin_unlock(&p->lock);
1542}
1543
1544/* assumes fs == KERNEL_DS */
1545void __init numa_policy_init(void)
1546{
1547	policy_cache = kmem_cache_create("numa_policy",
1548					 sizeof(struct mempolicy),
1549					 0, SLAB_PANIC, NULL, NULL);
1550
1551	sn_cache = kmem_cache_create("shared_policy_node",
1552				     sizeof(struct sp_node),
1553				     0, SLAB_PANIC, NULL, NULL);
1554
1555	/* Set interleaving policy for system init. This way not all
1556	   the data structures allocated at system boot end up in node zero. */
1557
1558	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1559		printk("numa_policy_init: interleaving failed\n");
1560}
1561
1562/* Reset policy of current process to default */
1563void numa_default_policy(void)
1564{
1565	do_set_mempolicy(MPOL_DEFAULT, NULL);
1566}
1567
1568/* Migrate a policy to a different set of nodes */
1569void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1570{
1571	nodemask_t *mpolmask;
1572	nodemask_t tmp;
1573
1574	if (!pol)
1575		return;
1576	mpolmask = &pol->cpuset_mems_allowed;
1577	if (nodes_equal(*mpolmask, *newmask))
1578		return;
1579
1580	switch (pol->policy) {
1581	case MPOL_DEFAULT:
1582		break;
1583	case MPOL_INTERLEAVE:
1584		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1585		pol->v.nodes = tmp;
1586		*mpolmask = *newmask;
1587		current->il_next = node_remap(current->il_next,
1588						*mpolmask, *newmask);
1589		break;
1590	case MPOL_PREFERRED:
1591		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1592						*mpolmask, *newmask);
1593		*mpolmask = *newmask;
1594		break;
1595	case MPOL_BIND: {
1596		nodemask_t nodes;
1597		struct zone **z;
1598		struct zonelist *zonelist;
1599
1600		nodes_clear(nodes);
1601		for (z = pol->v.zonelist->zones; *z; z++)
1602			node_set(zone_to_nid(*z), nodes);
1603		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1604		nodes = tmp;
1605
1606		zonelist = bind_zonelist(&nodes);
1607
		/* If the new nodes have no memory, bind_zonelist() returns
		 * an error and we keep the old zonelist.  If that old
		 * zonelist has no remaining mems_allowed nodes, then
		 * zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */
1612
1613		if (!IS_ERR(zonelist)) {
1614			/* Good - got mem - substitute new zonelist */
1615			kfree(pol->v.zonelist);
1616			pol->v.zonelist = zonelist;
1617		}
1618		*mpolmask = *newmask;
1619		break;
1620	}
1621	default:
1622		BUG();
1623		break;
1624	}
1625}
1626
1627/*
1628 * Wrapper for mpol_rebind_policy() that just requires task
1629 * pointer, and updates task mempolicy.
1630 */
1631
1632void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1633{
1634	mpol_rebind_policy(tsk->mempolicy, new);
1635}
1636
1637/*
1638 * Rebind each vma in mm to new nodemask.
1639 *
1640 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1641 */
1642
1643void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1644{
1645	struct vm_area_struct *vma;
1646
1647	down_write(&mm->mmap_sem);
1648	for (vma = mm->mmap; vma; vma = vma->vm_next)
1649		mpol_rebind_policy(vma->vm_policy, new);
1650	up_write(&mm->mmap_sem);
1651}
1652
1653/*
1654 * Display pages allocated per node and memory policy via /proc.
1655 */
1656
1657static const char * const policy_types[] =
1658	{ "default", "prefer", "bind", "interleave" };
1659
1660/*
1661 * Convert a mempolicy into a string.
1662 * Returns the number of characters in buffer (if positive)
1663 * or an error (negative)
1664 */
1665static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1666{
1667	char *p = buffer;
1668	int l;
1669	nodemask_t nodes;
1670	int mode = pol ? pol->policy : MPOL_DEFAULT;
1671
1672	switch (mode) {
1673	case MPOL_DEFAULT:
1674		nodes_clear(nodes);
1675		break;
1676
1677	case MPOL_PREFERRED:
1678		nodes_clear(nodes);
1679		node_set(pol->v.preferred_node, nodes);
1680		break;
1681
1682	case MPOL_BIND:
1683		get_zonemask(pol, &nodes);
1684		break;
1685
1686	case MPOL_INTERLEAVE:
1687		nodes = pol->v.nodes;
1688		break;
1689
1690	default:
1691		BUG();
1692		return -EFAULT;
1693	}
1694
1695	l = strlen(policy_types[mode]);
1696 	if (buffer + maxlen < p + l + 1)
1697 		return -ENOSPC;
1698
1699	strcpy(p, policy_types[mode]);
1700	p += l;
1701
1702	if (!nodes_empty(nodes)) {
1703		if (buffer + maxlen < p + 2)
1704			return -ENOSPC;
1705		*p++ = '=';
1706	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1707	}
1708	return p - buffer;
1709}
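
/*
 * Example output strings (illustrative): "default" for no policy,
 * "prefer=3" for MPOL_PREFERRED on node 3, "interleave=0-3" for
 * MPOL_INTERLEAVE over nodes 0-3, "bind=1,3" for MPOL_BIND on nodes 1
 * and 3.  The nodelist part comes from nodelist_scnprintf() and uses
 * the usual nodelist range syntax.
 */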
1710
1711struct numa_maps {
1712	unsigned long pages;
1713	unsigned long anon;
1714	unsigned long active;
1715	unsigned long writeback;
1716	unsigned long mapcount_max;
1717	unsigned long dirty;
1718	unsigned long swapcache;
1719	unsigned long node[MAX_NUMNODES];
1720};
1721
1722static void gather_stats(struct page *page, void *private, int pte_dirty)
1723{
1724	struct numa_maps *md = private;
1725	int count = page_mapcount(page);
1726
1727	md->pages++;
1728	if (pte_dirty || PageDirty(page))
1729		md->dirty++;
1730
1731	if (PageSwapCache(page))
1732		md->swapcache++;
1733
1734	if (PageActive(page))
1735		md->active++;
1736
1737	if (PageWriteback(page))
1738		md->writeback++;
1739
1740	if (PageAnon(page))
1741		md->anon++;
1742
1743	if (count > md->mapcount_max)
1744		md->mapcount_max = count;
1745
1746	md->node[page_to_nid(page)]++;
1747}
1748
1749#ifdef CONFIG_HUGETLB_PAGE
1750static void check_huge_range(struct vm_area_struct *vma,
1751		unsigned long start, unsigned long end,
1752		struct numa_maps *md)
1753{
1754	unsigned long addr;
1755	struct page *page;
1756
1757	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1758		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1759		pte_t pte;
1760
1761		if (!ptep)
1762			continue;
1763
1764		pte = *ptep;
1765		if (pte_none(pte))
1766			continue;
1767
1768		page = pte_page(pte);
1769		if (!page)
1770			continue;
1771
1772		gather_stats(page, md, pte_dirty(*ptep));
1773	}
1774}
1775#else
1776static inline void check_huge_range(struct vm_area_struct *vma,
1777		unsigned long start, unsigned long end,
1778		struct numa_maps *md)
1779{
1780}
1781#endif
1782
1783int show_numa_map(struct seq_file *m, void *v)
1784{
1785	struct proc_maps_private *priv = m->private;
1786	struct vm_area_struct *vma = v;
1787	struct numa_maps *md;
1788	struct file *file = vma->vm_file;
1789	struct mm_struct *mm = vma->vm_mm;
1790	int n;
1791	char buffer[50];
1792
1793	if (!mm)
1794		return 0;
1795
1796	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1797	if (!md)
1798		return 0;
1799
1800	mpol_to_str(buffer, sizeof(buffer),
1801			    get_vma_policy(priv->task, vma, vma->vm_start));
1802
1803	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1804
1805	if (file) {
1806		seq_printf(m, " file=");
1807		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1808	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1809		seq_printf(m, " heap");
1810	} else if (vma->vm_start <= mm->start_stack &&
1811			vma->vm_end >= mm->start_stack) {
1812		seq_printf(m, " stack");
1813	}
1814
1815	if (is_vm_hugetlb_page(vma)) {
1816		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1817		seq_printf(m, " huge");
1818	} else {
1819		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1820				&node_online_map, MPOL_MF_STATS, md);
1821	}
1822
1823	if (!md->pages)
1824		goto out;
1825
1826	if (md->anon)
1827		seq_printf(m," anon=%lu",md->anon);
1828
1829	if (md->dirty)
1830		seq_printf(m," dirty=%lu",md->dirty);
1831
1832	if (md->pages != md->anon && md->pages != md->dirty)
1833		seq_printf(m, " mapped=%lu", md->pages);
1834
1835	if (md->mapcount_max > 1)
1836		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1837
1838	if (md->swapcache)
1839		seq_printf(m," swapcache=%lu", md->swapcache);
1840
1841	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1842		seq_printf(m," active=%lu", md->active);
1843
1844	if (md->writeback)
1845		seq_printf(m," writeback=%lu", md->writeback);
1846
1847	for_each_online_node(n)
1848		if (md->node[n])
1849			seq_printf(m, " N%d=%lu", n, md->node[n]);
1850out:
1851	seq_putc(m, '\n');
1852	kfree(md);
1853
1854	if (m->count < m->size)
1855		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1856	return 0;
1857}
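
/*
 * A /proc/<pid>/numa_maps line produced above looks roughly like
 * (illustrative values):
 *
 *	2aaaaac00000 interleave=0-3 anon=512 dirty=512 mapmax=2 N0=128 N1=128 N2=128 N3=128
 *
 * i.e. start address, the policy string from mpol_to_str(), then the
 * counters that apply to this VMA, and finally the per-node page
 * counts gathered by check_pgd_range()/check_huge_range().
 */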
1858