/******************************************************************************
 * balloon.c
 *
 * Xen balloon driver - enables returning/claiming memory to/from Xen.
 *
 * Copyright (c) 2003, B Dragovic
 * Copyright (c) 2003-2004, M Williamson, K Fraser
 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 *
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/9.3/sys/dev/xen/balloon/balloon.c 249132 2013-04-05 08:22:11Z mav $");

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>

#include <machine/xen/xen-os.h>
#include <machine/xen/xenvar.h>
#include <machine/xen/xenfunc.h>
#include <xen/hypervisor.h>
#include <xen/xenstore/xenstorevar.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver");

struct mtx balloon_mutex;

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and driver_pages, and
 * balloon lists.
 */
struct mtx balloon_lock;

/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
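/*
 * With 4 KiB pages and 8-byte longs (e.g. amd64), frame_list holds
 * 4096 / 8 == 512 frame numbers, so each batch covers at most 2 MiB.
 */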
#define ARRAY_SIZE(A)	(sizeof(A) / sizeof(A[0]))

struct balloon_stats {
	/* We aim for 'current allocation' == 'target allocation'. */
	unsigned long current_pages;
	unsigned long target_pages;
	/* We may hit the hard limit in Xen. If we do then we remember it. */
	unsigned long hard_limit;
	/*
	 * Drivers may alter the memory reservation independently, but they
	 * must inform the balloon driver so we avoid hitting the hard limit.
	 */
	unsigned long driver_pages;
	/* Number of pages in high- and low-memory balloons. */
	unsigned long balloon_low;
	unsigned long balloon_high;
};

static struct balloon_stats balloon_stats;
#define bs balloon_stats

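/*
 * The counters are exported under the dev.xen.balloon sysctl tree; for
 * instance "sysctl dev.xen.balloon.current" reports the current
 * allocation in pages.
 */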
SYSCTL_DECL(_dev_xen);
static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD,
    &bs.current_pages, 0, "Current allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD,
    &bs.target_pages, 0, "Target allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD,
    &bs.driver_pages, 0, "Driver pages");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD,
    &bs.hard_limit, 0, "Xen hard limit");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD,
    &bs.balloon_low, 0, "Low-mem balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
    &bs.balloon_high, 0, "High-mem balloon");

struct balloon_entry {
	vm_page_t page;
	STAILQ_ENTRY(balloon_entry) list;
};

/* List of ballooned pages, kept as a STAILQ of balloon_entry structures. */
static STAILQ_HEAD(,balloon_entry) ballooned_pages;

/* Main work function, always executed in process context. */
static void balloon_process(void *unused);

#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)

/* balloon_append: add the given page to the balloon. */
static void
balloon_append(vm_page_t page)
{
	struct balloon_entry *entry;

	entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_WAITOK);
	entry->page = page;
	STAILQ_INSERT_HEAD(&ballooned_pages, entry, list);
	bs.balloon_low++;
}

/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
static vm_page_t
balloon_retrieve(void)
{
	vm_page_t page;
	struct balloon_entry *entry;

	if (STAILQ_EMPTY(&ballooned_pages))
		return NULL;

	entry = STAILQ_FIRST(&ballooned_pages);
	STAILQ_REMOVE_HEAD(&ballooned_pages, list);

	page = entry->page;
	free(entry, M_BALLOON);

	bs.balloon_low--;

	return page;
}

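/*
 * The target we actually chase: the administrator's request clamped to the
 * Xen hard limit and to the pages this domain can reach (current allocation
 * plus everything already sitting in the balloon).
 */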
static unsigned long
current_target(void)
{
	unsigned long target = min(bs.target_pages, bs.hard_limit);
	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
	return target;
}

static unsigned long
minimum_target(void)
{
#ifdef XENHVM
#define max_pfn physmem
#else
#define max_pfn HYPERVISOR_shared_info->arch.max_pfn
#endif
	unsigned long min_pages, curr_pages = current_target();

#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB	gradient
	 *       0	   0
	 *      16	  16
	 *      32	  24
	 *     128	  72	(1/2)
	 *     512	 168	(1/4)
	 *    2048	 360	(1/8)
	 *    8192	 552	(1/32)
	 *   32768	1320
	 *  131072	4392
	 */
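	/*
	 * Worked example: a 1 GiB domain has max_pfn == MB2PAGES(1024), which
	 * falls in the 512..2048 MiB band above, so min_pages ends up as
	 * MB2PAGES(104) + MB2PAGES(1024) / 8 == MB2PAGES(232), i.e. 232 MiB.
	 */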
	if (max_pfn < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (max_pfn >> 1);
	else if (max_pfn < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (max_pfn >> 2);
	else if (max_pfn < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (max_pfn >> 3);
	else
		min_pages = MB2PAGES(296) + (max_pfn >> 5);
#undef MB2PAGES

	/* Don't enforce growth */
	return min(min_pages, curr_pages);
#undef max_pfn
}

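/*
 * Ask Xen to populate up to nr_pages of the ballooned frames, then hand the
 * corresponding vm_page_t structures back to the FreeBSD page allocator.
 * Handles at most one frame_list batch per call.
 */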
static int
increase_reservation(unsigned long nr_pages)
{
	unsigned long  pfn, i;
	struct balloon_entry *entry;
	vm_page_t      page;
	long           rc;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	mtx_lock(&balloon_lock);

	for (entry = STAILQ_FIRST(&ballooned_pages), i = 0;
	     i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) {
		KASSERT(entry, ("ballooned_pages list corrupt"));
		page = entry->page;
		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents   = nr_pages;
	rc = HYPERVISOR_memory_op(
		XENMEM_populate_physmap, &reservation);
	if (rc < nr_pages) {
		if (rc > 0) {
			int ret;

			/* We hit the Xen hard limit: reprobe. */
			reservation.nr_extents = rc;
			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					&reservation);
			KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
		}
		if (rc >= 0)
			bs.hard_limit = (bs.current_pages + rc -
					 bs.driver_pages);
		goto out;
	}

	for (i = 0; i < nr_pages; i++) {
		page = balloon_retrieve();
		KASSERT(page, ("balloon_retrieve failed"));

		pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
		KASSERT((xen_feature(XENFEAT_auto_translated_physmap) ||
			!phys_to_machine_mapping_valid(pfn)),
		    ("auto translated physmap but mapping is valid"));

		set_phys_to_machine(pfn, frame_list[i]);

#if 0
#ifndef XENHVM
		/* Link back into the page tables if not highmem. */
		if (pfn < max_low_pfn) {
			int ret;
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)__va(pfn << PAGE_SHIFT),
				pfn_pte_ma(frame_list[i], PAGE_KERNEL),
				0);
			KASSERT(ret == 0,
			    ("HYPERVISOR_update_va_mapping failed"));
		}
#endif
#endif

		/* Relinquish the page back to the allocator. */
		vm_page_unwire(page, 0);
		vm_page_free(page);
	}

	bs.current_pages += nr_pages;
	//totalram_pages = bs.current_pages;

 out:
	mtx_unlock(&balloon_lock);

	return 0;
}

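/*
 * Allocate and wire up to nr_pages from the FreeBSD page allocator, hand the
 * backing machine frames back to Xen via XENMEM_decrease_reservation, and
 * park the vm_page_t structures in the balloon list.  Returns non-zero if
 * the allocator ran dry and the caller should retry later.
 */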
static int
decrease_reservation(unsigned long nr_pages)
{
	unsigned long  pfn, i;
	vm_page_t      page;
	int            need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		if ((page = vm_page_alloc(NULL, 0,
			    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
		frame_list[i] = PFNTOMFN(pfn);

#if 0
		if (!PageHighMem(page)) {
			v = phys_to_virt(pfn << PAGE_SHIFT);
			scrub_pages(v, 1);
#ifdef CONFIG_XEN
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)v, __pte_ma(0), 0);
			BUG_ON(ret);
#endif
		}
#ifdef CONFIG_XEN_SCRUB_PAGES
		else {
			v = kmap(page);
			scrub_pages(v, 1);
			kunmap(page);
		}
#endif
#endif
	}

#ifdef CONFIG_XEN
	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();
#endif

	mtx_lock(&balloon_lock);

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = MFNTOPFN(frame_list[i]);
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents   = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed"));

	bs.current_pages -= nr_pages;
	//totalram_pages = bs.current_pages;

	mtx_unlock(&balloon_lock);

	return (need_sleep);
}

/*
 * We avoid multiple worker processes conflicting via the balloon mutex.
 * We may of course race updates of the target counts (which are protected
 * by the balloon lock), or with changes to the Xen hard limit, but we will
 * recover from these in time.
 */
static void
balloon_process(void *unused)
{
	int need_sleep = 0;
	long credit;

	mtx_lock(&balloon_mutex);
	for (;;) {
		int sleep_time;

		do {
			credit = current_target() - bs.current_pages;
			if (credit > 0)
				need_sleep = (increase_reservation(credit) != 0);
			if (credit < 0)
				need_sleep = (decrease_reservation(-credit) != 0);

		} while ((credit != 0) && !need_sleep);

		/* Schedule more work if there is some still to be done. */
		if (current_target() != bs.current_pages)
			sleep_time = hz;
		else
			sleep_time = 0;

		msleep(balloon_process, &balloon_mutex, 0, "balloon",
		       sleep_time);
	}
	mtx_unlock(&balloon_mutex);
}

/* Resets the Xen limit, sets new target, and kicks off processing. */
static void
set_new_target(unsigned long target)
{
	/* No need for lock. These are not read-modify-write updates. */
	bs.hard_limit   = ~0UL;
	bs.target_pages = max(target, minimum_target());
	wakeup(balloon_process);
}

static struct xs_watch target_watch =
{
	.node = "memory/target"
};

/* React to a change in the target key */
static void
watch_target(struct xs_watch *watch,
	     const char **vec, unsigned int len)
{
	unsigned long long new_target;
	int err;

	err = xs_scanf(XST_NIL, "memory", "target", NULL,
	    "%llu", &new_target);
	if (err) {
		/* This is ok (for domain0 at least) - so just return */
		return;
	}

	/*
	 * The given memory/target value is in KiB, so it needs converting to
	 * pages.  PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
	 */
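	/*
	 * For example, with 4 KiB pages (PAGE_SHIFT == 12) a target of
	 * 524288 KiB shifts right by 2 and becomes 131072 pages, i.e. 512 MiB.
	 */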
	set_new_target(new_target >> (PAGE_SHIFT - 10));
}

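/*
 * Register the xenstore watch on the memory/target node; once registered,
 * watch_target() runs whenever the toolstack rewrites that value.
 */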
static void
balloon_init_watcher(void *arg)
{
	int err;

	if (!is_running_on_xen())
		return;

	err = xs_register_watch(&target_watch);
	if (err)
		printf("Failed to set balloon watcher\n");
}
SYSINIT(balloon_init_watcher, SI_SUB_PSEUDO, SI_ORDER_ANY,
    balloon_init_watcher, NULL);

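/*
 * Driver setup: initialize the locks, seed the counters from the domain's
 * current reservation, start the balloon_process worker, and (for PV guests)
 * pre-load the balloon with any pages between nr_pages and max_pfn.
 */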
static void
balloon_init(void *arg)
{
#ifndef XENHVM
	vm_page_t page;
	unsigned long pfn;

#define max_pfn HYPERVISOR_shared_info->arch.max_pfn
#endif

	if (!is_running_on_xen())
		return;

	mtx_init(&balloon_lock, "balloon_lock", NULL, MTX_DEF);
	mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF);

#ifndef XENHVM
	bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
#else
	bs.current_pages = physmem;
#endif
	bs.target_pages  = bs.current_pages;
	bs.balloon_low   = 0;
	bs.balloon_high  = 0;
	bs.driver_pages  = 0UL;
	bs.hard_limit    = ~0UL;

	kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon");

#ifndef XENHVM
	/* Initialise the balloon with excess memory space. */
	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
		page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT);
		balloon_append(page);
	}
#undef max_pfn
#endif

	target_watch.callback = watch_target;

	return;
}
SYSINIT(balloon_init, SI_SUB_PSEUDO, SI_ORDER_ANY, balloon_init, NULL);

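/*
 * Exported hook for other Xen drivers that grow or shrink the reservation on
 * their own (delta is in pages); keeping driver_pages current lets the
 * balloon driver account for them against the Xen hard limit.
 */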
void balloon_update_driver_allowance(long delta);

void
balloon_update_driver_allowance(long delta)
{
	mtx_lock(&balloon_lock);
	bs.driver_pages += delta;
	mtx_unlock(&balloon_lock);
}

#if 0
static int dealloc_pte_fn(
	pte_t *pte, struct page *pte_page, unsigned long addr, void *data)
{
	unsigned long mfn = pte_mfn(*pte);
	int ret;
	struct xen_memory_reservation reservation = {
		.extent_start = &mfn,
		.nr_extents   = 1,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
	set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	KASSERT(ret == 1, ("HYPERVISOR_memory_op failed"));
	return 0;
}

#endif

#if 0
vm_page_t
balloon_alloc_empty_page_range(unsigned long nr_pages)
{
	vm_page_t pages;
	int i, rc;
	unsigned long *mfn_list;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	pages = vm_page_alloc_contig(nr_pages, 0, -1, 4, 4);
	if (pages == NULL)
		return NULL;

	mfn_list = malloc(nr_pages*sizeof(unsigned long), M_DEVBUF, M_WAITOK);

	for (i = 0; i < nr_pages; i++) {
		mfn_list[i] = PFNTOMFN(VM_PAGE_TO_PHYS(pages[i]) >> PAGE_SHIFT);
		PFNTOMFN(i) = INVALID_P2M_ENTRY;
		reservation.extent_start = mfn_list;
		reservation.nr_extents = nr_pages;
		rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
		    &reservation);
		KASSERT(rc == nr_pages, ("HYPERVISOR_memory_op failed"));
	}

	bs.current_pages -= nr_pages;

	wakeup(balloon_process);

	return pages;
}

void
balloon_dealloc_empty_page_range(vm_page_t page, unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++)
		balloon_append(page + i);

	wakeup(balloon_process);
}
#endif