1/******************************************************************************
2 * balloon.c
3 *
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
5 *
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
9 *
10 * This file may be distributed separately from the Linux kernel, or
11 * incorporated into other software packages, subject to the following license:
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this source file (the "Software"), to deal in the Software without
15 * restriction, including without limitation the rights to use, copy, modify,
16 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17 * and to permit persons to whom the Software is furnished to do so, subject to
18 * the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29 * IN THE SOFTWARE.
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD$");
34
35#include <sys/param.h>
36#include <sys/lock.h>
37#include <sys/kernel.h>
38#include <sys/kthread.h>
39#include <sys/malloc.h>
40#include <sys/mutex.h>
41#include <sys/sysctl.h>
42#include <sys/module.h>
43
44#include <vm/vm.h>
45#include <vm/vm_page.h>
46
47#include <xen/xen-os.h>
48#include <xen/hypervisor.h>
49#include <xen/features.h>
50#include <xen/xenstore/xenstorevar.h>
51
/*
 * malloc(9) type for this driver.
 * NOTE(review): M_BALLOON is not referenced in the visible part of this file.
 */
static MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver");

/*
 * Convert from KB (as fetched from xenstore) to number of PAGES: shifting a
 * KiB count right by this amount divides it by the page size in KiB.
 */
#define KB_TO_PAGE_SHIFT	(PAGE_SHIFT - 10)

/*
 * Serialises balloon work.  increase_reservation() and
 * decrease_reservation() assert that it is owned, and the worker sleeps on
 * it in msleep().
 */
struct mtx balloon_mutex;

/*
 * We increase/decrease in batches which fit in a page.  This is the scratch
 * array of frame numbers handed to the hypervisor; both reservation routines
 * clamp their request to nitems(frame_list).
 */
static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];
61
/*
 * Accounting state for the balloon driver.  All counters are in pages.
 */
struct balloon_stats {
	/* We aim for 'current allocation' == 'target allocation'. */
	unsigned long current_pages;
	unsigned long target_pages;
	/*
	 * We may hit the hard limit in Xen. If we do then we remember it.
	 * The value ~0UL is used in this file to mean "no limit known".
	 */
	unsigned long hard_limit;
	/*
	 * Drivers may alter the memory reservation independently, but they
	 * must inform the balloon driver so we avoid hitting the hard limit.
	 * In the visible code this is only initialised to 0 and read when
	 * the hard limit is recomputed.
	 */
	unsigned long driver_pages;
	/*
	 * Number of pages in high- and low-memory balloons.  In the visible
	 * code only balloon_low is ever incremented or decremented;
	 * balloon_high is initialised to 0 and only read.
	 */
	unsigned long balloon_low;
	unsigned long balloon_high;
};
77
/* The single instance of the balloon accounting state. */
static struct balloon_stats balloon_stats;
/* Shorthand used throughout this file. */
#define bs balloon_stats

/*
 * Read-only sysctl nodes under dev.xen.balloon, one per balloon_stats
 * counter.  Each exports the raw unsigned long field (a page count).
 */
SYSCTL_DECL(_dev_xen);
static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD,
    &bs.current_pages, 0, "Current allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD,
    &bs.target_pages, 0, "Target allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD,
    &bs.driver_pages, 0, "Driver pages");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD,
    &bs.hard_limit, 0, "Xen hard limit");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD,
    &bs.balloon_low, 0, "Low-mem balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
    &bs.balloon_high, 0, "High-mem balloon");
95
/*
 * List of ballooned pages, linked through each vm_page's plinks.q entry.
 *
 * The head is initialised statically: queue(3) requires a TAILQ head to be
 * set up with TAILQ_INIT() or TAILQ_HEAD_INITIALIZER() before use, and a
 * merely zero-filled head has a NULL tqh_last pointer.
 */
static TAILQ_HEAD(,vm_page) ballooned_pages =
    TAILQ_HEAD_INITIALIZER(ballooned_pages);
98
/*
 * Main work function, always executed in process context: it is started as
 * its own kernel process by xenballoon_attach() through kproc_create().
 */
static void balloon_process(void *unused);

/*
 * Logging helpers that prefix each message with "xen_mem: ".
 * NOTE(review): neither macro is referenced in the visible part of this
 * file, and printk(), KERN_INFO and KERN_WARNING are not defined here --
 * confirm a header provides them before using these macros.
 */
#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)
106
107static unsigned long
108current_target(void)
109{
110	unsigned long target = min(bs.target_pages, bs.hard_limit);
111	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
112		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
113	return (target);
114}
115
116static unsigned long
117minimum_target(void)
118{
119	unsigned long min_pages, curr_pages = current_target();
120
121#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
122	/*
123	 * Simple continuous piecewiese linear function:
124	 *  max MiB -> min MiB	gradient
125	 *       0	   0
126	 *      16	  16
127	 *      32	  24
128	 *     128	  72	(1/2)
129	 *     512 	 168	(1/4)
130	 *    2048	 360	(1/8)
131	 *    8192	 552	(1/32)
132	 *   32768	1320
133	 *  131072	4392
134	 */
135	if (realmem < MB2PAGES(128))
136		min_pages = MB2PAGES(8) + (realmem >> 1);
137	else if (realmem < MB2PAGES(512))
138		min_pages = MB2PAGES(40) + (realmem >> 2);
139	else if (realmem < MB2PAGES(2048))
140		min_pages = MB2PAGES(104) + (realmem >> 3);
141	else
142		min_pages = MB2PAGES(296) + (realmem >> 5);
143#undef MB2PAGES
144
145	/* Don't enforce growth */
146	return (min(min_pages, curr_pages));
147}
148
149static int
150increase_reservation(unsigned long nr_pages)
151{
152	unsigned long  i;
153	vm_page_t      page;
154	long           rc;
155	struct xen_memory_reservation reservation = {
156		.address_bits = 0,
157		.extent_order = 0,
158		.domid        = DOMID_SELF
159	};
160
161	mtx_assert(&balloon_mutex, MA_OWNED);
162
163	if (nr_pages > nitems(frame_list))
164		nr_pages = nitems(frame_list);
165
166	for (page = TAILQ_FIRST(&ballooned_pages), i = 0;
167	    i < nr_pages; i++, page = TAILQ_NEXT(page, plinks.q)) {
168		KASSERT(page != NULL, ("ballooned_pages list corrupt"));
169		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
170	}
171
172	set_xen_guest_handle(reservation.extent_start, frame_list);
173	reservation.nr_extents   = nr_pages;
174	rc = HYPERVISOR_memory_op(
175		XENMEM_populate_physmap, &reservation);
176	if (rc < nr_pages) {
177		if (rc > 0) {
178			int ret;
179
180			/* We hit the Xen hard limit: reprobe. */
181			reservation.nr_extents = rc;
182			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
183					&reservation);
184			KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
185		}
186		if (rc >= 0)
187			bs.hard_limit = (bs.current_pages + rc -
188					 bs.driver_pages);
189		goto out;
190	}
191
192	for (i = 0; i < nr_pages; i++) {
193		page = TAILQ_FIRST(&ballooned_pages);
194		KASSERT(page != NULL, ("Unable to get ballooned page"));
195		TAILQ_REMOVE(&ballooned_pages, page, plinks.q);
196		bs.balloon_low--;
197
198		KASSERT(xen_feature(XENFEAT_auto_translated_physmap),
199		    ("auto translated physmap but mapping is valid"));
200
201		vm_page_free(page);
202	}
203
204	bs.current_pages += nr_pages;
205
206 out:
207	return (0);
208}
209
210static int
211decrease_reservation(unsigned long nr_pages)
212{
213	unsigned long  i;
214	vm_page_t      page;
215	int            need_sleep = 0;
216	int ret;
217	struct xen_memory_reservation reservation = {
218		.address_bits = 0,
219		.extent_order = 0,
220		.domid        = DOMID_SELF
221	};
222
223	mtx_assert(&balloon_mutex, MA_OWNED);
224
225	if (nr_pages > nitems(frame_list))
226		nr_pages = nitems(frame_list);
227
228	for (i = 0; i < nr_pages; i++) {
229		if ((page = vm_page_alloc(NULL, 0,
230			    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
231			    VM_ALLOC_ZERO)) == NULL) {
232			nr_pages = i;
233			need_sleep = 1;
234			break;
235		}
236
237		if ((page->flags & PG_ZERO) == 0) {
238			/*
239			 * Zero the page, or else we might be leaking
240			 * important data to other domains on the same
241			 * host. Xen doesn't scrub ballooned out memory
242			 * pages, the guest is in charge of making
243			 * sure that no information is leaked.
244			 */
245			pmap_zero_page(page);
246		}
247
248		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
249
250		TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);
251		bs.balloon_low++;
252	}
253
254	set_xen_guest_handle(reservation.extent_start, frame_list);
255	reservation.nr_extents   = nr_pages;
256	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
257	KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed"));
258
259	bs.current_pages -= nr_pages;
260
261	return (need_sleep);
262}
263
264/*
265 * We avoid multiple worker processes conflicting via the balloon mutex.
266 * We may of course race updates of the target counts (which are protected
267 * by the balloon lock), or with changes to the Xen hard limit, but we will
268 * recover from these in time.
269 */
270static void
271balloon_process(void *unused)
272{
273	int need_sleep = 0;
274	long credit;
275
276	mtx_lock(&balloon_mutex);
277	for (;;) {
278		int sleep_time;
279
280		do {
281			credit = current_target() - bs.current_pages;
282			if (credit > 0)
283				need_sleep = (increase_reservation(credit) != 0);
284			if (credit < 0)
285				need_sleep = (decrease_reservation(-credit) != 0);
286
287		} while ((credit != 0) && !need_sleep);
288
289		/* Schedule more work if there is some still to be done. */
290		if (current_target() != bs.current_pages)
291			sleep_time = hz;
292		else
293			sleep_time = 0;
294
295		msleep(balloon_process, &balloon_mutex, 0, "balloon",
296		       sleep_time);
297	}
298	mtx_unlock(&balloon_mutex);
299}
300
301/* Resets the Xen limit, sets new target, and kicks off processing. */
302static void
303set_new_target(unsigned long target)
304{
305	/* No need for lock. Not read-modify-write updates. */
306	bs.hard_limit   = ~0UL;
307	bs.target_pages = max(target, minimum_target());
308	wakeup(balloon_process);
309}
310
/*
 * XenStore watch on the "memory/target" node.  The callback member is
 * filled in by xenballoon_attach() just before the watch is registered.
 */
static struct xs_watch target_watch =
{
	.node = "memory/target",
	.max_pending = 1,
};
316
317/* React to a change in the target key */
318static void
319watch_target(struct xs_watch *watch,
320	     const char **vec, unsigned int len)
321{
322	unsigned long long new_target;
323	int err;
324
325	err = xs_scanf(XST_NIL, "memory", "target", NULL,
326	    "%llu", &new_target);
327	if (err) {
328		/* This is ok (for domain0 at least) - so just return */
329		return;
330	}
331
332	/*
333	 * The given memory/target value is in KiB, so it needs converting to
334	 * pages.  PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
335	 */
336	set_new_target(new_target >> KB_TO_PAGE_SHIFT);
337}
338
339/*------------------ Private Device Attachment Functions  --------------------*/
340/**
341 * \brief Identify instances of this device type in the system.
342 *
343 * \param driver  The driver performing this identify action.
344 * \param parent  The NewBus parent device for any devices this method adds.
345 */
346static void
347xenballoon_identify(driver_t *driver __unused, device_t parent)
348{
349	/*
350	 * A single device instance for our driver is always present
351	 * in a system operating under Xen.
352	 */
353	BUS_ADD_CHILD(parent, 0, driver->name, 0);
354}
355
356/**
357 * \brief Probe for the existence of the Xen Balloon device
358 *
359 * \param dev  NewBus device_t for this Xen control instance.
360 *
361 * \return  Always returns 0 indicating success.
362 */
363static int
364xenballoon_probe(device_t dev)
365{
366
367	device_set_desc(dev, "Xen Balloon Device");
368	return (0);
369}
370
371/**
372 * \brief Attach the Xen Balloon device.
373 *
374 * \param dev  NewBus device_t for this Xen control instance.
375 *
376 * \return  On success, 0. Otherwise an errno value indicating the
377 *          type of failure.
378 */
379static int
380xenballoon_attach(device_t dev)
381{
382	int err;
383
384	mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF);
385
386	bs.current_pages = realmem;
387	bs.target_pages  = bs.current_pages;
388	bs.balloon_low   = 0;
389	bs.balloon_high  = 0;
390	bs.driver_pages  = 0UL;
391	bs.hard_limit    = ~0UL;
392
393	kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon");
394
395	target_watch.callback = watch_target;
396
397	err = xs_register_watch(&target_watch);
398	if (err)
399		device_printf(dev,
400		    "xenballon: failed to set balloon watcher\n");
401
402	return (err);
403}
404
/*-------------------- Private Device Attachment Data  -----------------------*/
/*
 * NewBus method table: only identify, probe and attach are implemented.
 * No detach method is provided.
 */
static device_method_t xenballoon_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify,	xenballoon_identify),
	DEVMETHOD(device_probe,         xenballoon_probe),
	DEVMETHOD(device_attach,        xenballoon_attach),

	DEVMETHOD_END
};

/* Driver class built from the method table above; the size argument is 0. */
DEFINE_CLASS_0(xenballoon, xenballoon_driver, xenballoon_methods, 0);
devclass_t xenballoon_devclass;

/* Register the driver with xenstore named as its parent bus. */
DRIVER_MODULE(xenballoon, xenstore, xenballoon_driver, xenballoon_devclass,
    NULL, NULL);
420