xref: /netbsd-src/sys/arch/xen/xen/balloon.c (revision f2da08845bdbeb494a73bb078760bf7739f9f239)
1 /* $NetBSD: balloon.c,v 1.24 2024/09/24 20:54:53 andvar Exp $ */
2 
3 /*-
4  * Copyright (c) 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Cherry G. Mathew <cherry@zyx.in> and
9  * Jean-Yves Migeon <jym@NetBSD.org>
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * The Xen balloon driver enables growing and shrinking a PV domain's
35  * memory on the fly, by allocating and freeing memory pages directly.
36  * This management needs domain cooperation to work properly, especially
37  * during the balloon_inflate() operation, where a domain gives back memory
38  * to the hypervisor.
39  *
40  * Shrinking memory on a live system is a difficult task, and may render
41  * it unstable or lead to a crash. The driver therefore takes a conservative
42  * approach, performing memory operations in small steps of a few MiB at a
43  * time. It will also refuse to decrease the reservation below a certain
44  * threshold (XEN_RESERVATION_MIN), to avoid complete kernel memory exhaustion.
45  *
46  * The user can intervene at two different levels to manage the ballooning
47  * of a domain:
48  * - directly within the domain using a sysctl(9) interface.
49  * - through the Xen tools, by modifying the memory/target entry associated
50  *   with a domain. This is usually done in dom0.
51  *
52  * Modification of the reservation is signaled by writing inside the
53  * memory/target node in Xenstore. Writing a new value will fire the xenbus
54  * watcher and wake up the balloon thread to inflate or deflate the balloon.
55  *
56  * Both sysctl(9) nodes and memory/target entry assume that the values passed
57  * to them are in KiB. Internally, the driver will convert this value into
58  * pages (assuming a page is PAGE_SIZE bytes), and issue the correct hypercalls
59  * to decrease/increase domain's reservation accordingly.
60  *
61  * XXX Pages used by the balloon are tracked through entries stored in a SLIST.
62  * This allows the driver to conveniently add/remove wired pages from memory
63  * without the need to support these "memory gaps" inside uvm(9). Still, the
64  * driver does not currently "plug" new pages into uvm(9) when more memory
65  * is available than originally managed by the balloon. For example, deflating
66  * the balloon to a total number of pages above physmem is not supported for
67  * now. See balloon_deflate() for more details.
68  *
69  */
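/*
 * Illustrative usage sketch (not part of the driver; command names other
 * than the sysctl(9) nodes created by sysctl_kern_xen_balloon_setup() below
 * are assumptions about a typical Xen/NetBSD setup):
 *
 * From inside the domain, the reservation is driven through the sysctl(9)
 * nodes, all expressed in KiB:
 *
 *	# sysctl machdep.xen.balloon.current
 *	# sysctl -w machdep.xen.balloon.target=524288
 *
 * From the control domain (usually dom0), the same memory/target XenStore
 * node can be set with the Xen tools, e.g. something like
 * "xl mem-set <domain> 512m", which fires balloon_xenbus_watcher() in the
 * target domain.
 */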
70 
71 #define BALLOONDEBUG 0
72 
73 #if defined(_KERNEL_OPT)
74 #include "opt_uvm_hotplug.h"
75 #endif
76 
77 #include <sys/cdefs.h>
78 __KERNEL_RCSID(0, "$NetBSD: balloon.c,v 1.24 2024/09/24 20:54:53 andvar Exp $");
79 
80 #include <sys/inttypes.h>
81 #include <sys/device.h>
82 #include <sys/param.h>
83 
84 #include <sys/atomic.h>
85 #include <sys/condvar.h>
86 #include <sys/kernel.h>
87 #include <sys/kmem.h>
88 #include <sys/kthread.h>
89 #include <sys/mutex.h>
90 #include <sys/pool.h>
91 #include <sys/queue.h>
92 #include <sys/sysctl.h>
93 
94 #include <xen/xen.h>
95 #include <xen/xenbus.h>
96 #include <xen/balloon.h>
97 
98 #include <uvm/uvm.h>
100 #include <uvm/uvm_physseg.h>
101 #include <xen/xenpmap.h>
102 
103 #include "locators.h"
104 
105 /*
106  * Number of MFNs stored in the array passed back and forth between the
107  * domain and the hypervisor during balloon_inflate() / balloon_deflate().
108  * The array should fit in a page, for performance reasons.
109  */
110 #define BALLOON_DELTA (PAGE_SIZE / sizeof(xen_pfn_t))
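/*
 * Worked example (assuming a 4 KiB PAGE_SIZE and an 8-byte xen_pfn_t, as on
 * amd64): BALLOON_DELTA = 4096 / 8 = 512 extents, so each inflate/deflate
 * step moves at most 512 pages, i.e. 2 MiB of memory.
 */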
111 
112 /*
113  * Safeguard value. Refuse to go below this threshold, so that the domain
114  * can keep some free pages for its own use. The value is arbitrary and may
115  * evolve over time.
116  */
117 #define BALLOON_BALLAST 256 /* In pages - 1MiB */
118 #define XEN_RESERVATION_MIN (uvmexp.freemin + BALLOON_BALLAST) /* In pages */
119 
120 /* KB <-> PAGEs */
121 #define PAGE_SIZE_KB (PAGE_SIZE >> 10) /* page size in KB */
122 #define BALLOON_PAGES_TO_KB(_pg) ((uint64_t)_pg * PAGE_SIZE_KB)
123 #define BALLOON_KB_TO_PAGES(_kb) (roundup(_kb, PAGE_SIZE_KB) / PAGE_SIZE_KB)
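/*
 * Worked example for the conversion macros, assuming 4 KiB pages
 * (PAGE_SIZE_KB == 4):
 *	BALLOON_PAGES_TO_KB(256)  -> 1024 KiB (the 1 MiB BALLOON_BALLAST)
 *	BALLOON_KB_TO_PAGES(1026) -> roundup(1026, 4) / 4 = 1028 / 4 = 257
 * i.e. a target that is not page-aligned is rounded up to the next page.
 */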
124 
125 /*
126  * A balloon page entry, used to track pages put into/reclaimed from the balloon.
127  */
128 struct balloon_page_entry {
129 	struct vm_page *pg;
130 	SLIST_ENTRY(balloon_page_entry) entry;
131 };
132 
133 struct balloon_xenbus_softc {
134 	device_t sc_dev;
135 	struct sysctllog *sc_log;
136 
137 	kmutex_t balloon_mtx;   /* Protects condvar, target and res_min (below) */
138 	kcondvar_t balloon_cv;  /* Condvar variable for target (below) */
139 	size_t balloon_target;  /* Target domain reservation size in pages. */
140 	/* Minimum amount of memory reserved by domain, in KiB */
141 	uint64_t balloon_res_min;
142 
143 	xen_pfn_t *sc_mfn_list; /* List of MFNs passed from/to balloon */
144 	pool_cache_t bpge_pool; /* pool cache for balloon page entries */
145 	/* linked list for tracking pages used by balloon */
146 	SLIST_HEAD(, balloon_page_entry) balloon_page_entries;
147 	size_t balloon_num_page_entries;
148 };
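/*
 * Bookkeeping sketch (derived from the code below): balloon_inflate() takes
 * a balloon_page_entry from bpge_pool (whose ctor allocates a zeroed page
 * via uvm_pagealloc()) and pushes it on balloon_page_entries;
 * balloon_deflate() pops entries, rebinds their pages to the MFNs returned
 * by the hypervisor and returns the entries to the pool.
 * balloon_num_page_entries therefore counts the pages currently held by the
 * balloon.
 */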
149 
150 static size_t xenmem_get_currentreservation(void);
151 static size_t xenmem_get_maxreservation(void);
152 
153 static int  bpge_ctor(void *, void *, int);
154 static void bpge_dtor(void *, void *);
155 
156 static void   balloon_thread(void *);
157 static size_t balloon_deflate(struct balloon_xenbus_softc*, size_t);
158 static size_t balloon_inflate(struct balloon_xenbus_softc*, size_t);
159 
160 static void sysctl_kern_xen_balloon_setup(struct balloon_xenbus_softc *);
161 static void balloon_xenbus_watcher(struct xenbus_watch *, const char **,
162 				   unsigned int);
163 
164 static int  balloon_xenbus_match(device_t, cfdata_t, void *);
165 static void balloon_xenbus_attach(device_t, device_t, void *);
166 
167 CFATTACH_DECL_NEW(balloon, sizeof(struct balloon_xenbus_softc),
168     balloon_xenbus_match, balloon_xenbus_attach, NULL, NULL);
169 
170 static struct xenbus_watch balloon_xenbus_watch = {
171 	.node = __UNCONST("memory/target"),
172 	.xbw_callback = balloon_xenbus_watcher,
173 };
174 
175 static struct balloon_xenbus_softc *balloon_sc;
176 
177 static int
178 balloon_xenbus_match(device_t parent, cfdata_t match, void *aux)
179 {
180 	struct xenbusdev_attach_args *xa = aux;
181 
182 	if (strcmp(xa->xa_type, "balloon") != 0)
183 		return 0;
184 
185 	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
186 	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
187 		return 0;
188 
189 	return 1;
190 }
191 
192 static void
193 balloon_xenbus_attach(device_t parent, device_t self, void *aux)
194 {
195 	xen_pfn_t *mfn_list;
196 	size_t currentpages;
197 	struct balloon_xenbus_softc *sc = balloon_sc = device_private(self);
198 
199 	aprint_normal(": Xen Balloon driver\n");
200 	sc->sc_dev = self;
201 
202 	/* Initialize target mutex and condvar */
203 	mutex_init(&sc->balloon_mtx, MUTEX_DEFAULT, IPL_NONE);
204 	cv_init(&sc->balloon_cv, "xen_balloon");
205 
206 	SLIST_INIT(&sc->balloon_page_entries);
207 	sc->balloon_num_page_entries = 0;
208 
209 	/* Get current number of pages */
210 	currentpages = xenmem_get_currentreservation();
211 
212 	KASSERT(currentpages > 0);
213 
214 	/* Update initial target value - no need to lock for initialization */
215 	sc->balloon_target = currentpages;
216 
217 	/* Set the values used by sysctl */
218 	sc->balloon_res_min =
219 	    BALLOON_PAGES_TO_KB(XEN_RESERVATION_MIN);
220 
221 	aprint_normal_dev(self, "current reservation: %"PRIu64" KiB\n",
222 	    BALLOON_PAGES_TO_KB(currentpages));
223 #if BALLOONDEBUG
224 	aprint_normal_dev(self, "min reservation: %"PRIu64" KiB\n",
225 	    sc->balloon_res_min);
226 	aprint_normal_dev(self, "max reservation: %"PRIu64" KiB\n",
227 	    BALLOON_PAGES_TO_KB(xenmem_get_maxreservation()));
228 #endif
229 
230 	sc->bpge_pool = pool_cache_init(sizeof(struct balloon_page_entry),
231 	    0, 0, 0, "xen_bpge", NULL, IPL_NONE, bpge_ctor, bpge_dtor, NULL);
232 
233 	sysctl_kern_xen_balloon_setup(sc);
234 
235 	/* List of MFNs passed from/to balloon for inflating/deflating */
236 	mfn_list = kmem_alloc(BALLOON_DELTA * sizeof(*mfn_list), KM_SLEEP);
237 	sc->sc_mfn_list = mfn_list;
238 
239 	/* Setup xenbus node watch callback */
240 	if (register_xenbus_watch(&balloon_xenbus_watch)) {
241 		aprint_error_dev(self, "unable to watch memory/target\n");
242 		goto error;
243 	}
244 
245 	/* Setup kernel thread to asynchronously (in/de)-flate the balloon */
246 	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, balloon_thread,
247 	    sc, NULL, "xen_balloon")) {
248 		aprint_error_dev(self, "unable to create balloon thread\n");
249 		unregister_xenbus_watch(&balloon_xenbus_watch);
250 		goto error;
251 	}
252 
253 	if (!pmf_device_register(self, NULL, NULL))
254 		aprint_error_dev(self, "couldn't establish power handler\n");
255 
256 	return;
257 
258 error:
259 	sysctl_teardown(&sc->sc_log);
260 	cv_destroy(&sc->balloon_cv);
261 	mutex_destroy(&sc->balloon_mtx);
262 	return;
263 
264 }
265 
266 /*
267  * Returns the maximum memory reservation available to the current domain. In
268  * Xen with DOMID_SELF, this hypercall never fails: the return value should be
269  * interpreted as unsigned.
270  *
271  */
272 static size_t
273 xenmem_get_maxreservation(void)
274 {
275 	unsigned int ret;
276 
277 	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation,
278 	    & (domid_t) { DOMID_SELF });
279 
280 	if (ret == 0) {
281 		/* well, a maximum reservation of 0 is really bogus */
282 		panic("%s failed, maximum reservation returned 0", __func__);
283 	}
284 
285 	return ret;
286 }
287 
288 /* Returns current reservation, in pages */
289 static size_t
290 xenmem_get_currentreservation(void)
291 {
292 	int ret;
293 
294 	ret = HYPERVISOR_memory_op(XENMEM_current_reservation,
295 				   & (domid_t) { DOMID_SELF });
296 
297 	if (ret < 0) {
298 		panic("%s failed: %d", __func__, ret);
299 	}
300 
301 	return ret;
302 }
303 
304 /*
305  * Get the value (in KiB) of memory/target in XenStore for the current domain.
306  * A return value of 0 can be considered bogus or absent.
307  */
308 static unsigned long long
309 balloon_xenbus_read_target(void)
310 {
311 	unsigned long long new_target;
312 	int err = xenbus_read_ull(NULL, "memory", "target", &new_target, 0);
313 
314 	switch(err) {
315 	case 0:
316 		return new_target;
317 	case ENOENT:
318 		break;
319 	default:
320 		device_printf(balloon_sc->sc_dev,
321 		    "error %d, couldn't read xenbus target node\n", err);
322 		break;
323 	}
324 
325 	return 0;
326 }
327 
328 /* Set memory/target value (in KiB) in XenStore for current domain */
329 static void
330 balloon_xenbus_write_target(unsigned long long new_target)
331 {
332 	int err = xenbus_printf(NULL, "memory", "target", "%llu", new_target);
333 
334 	if (err != 0) {
335 		device_printf(balloon_sc->sc_dev,
336 		    "error %d, couldn't write xenbus target node\n", err);
337 	}
338 
339 	return;
340 }
341 
342 static int
343 bpge_ctor(void *arg, void *obj, int flags)
344 {
345 	struct balloon_page_entry *bpge = obj;
346 
347 	bpge->pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
348 	if (bpge->pg == NULL)
349 		return ENOMEM;
350 
351 	return 0;
352 
353 }
354 
355 static void
356 bpge_dtor(void *arg, void *obj)
357 {
358 	struct balloon_page_entry *bpge = obj;
359 
360 	uvm_pagefree(bpge->pg);
361 }
362 
363 /*
364  * Inflate balloon. Pages are moved out of domain's memory towards balloon.
365  */
366 static size_t
367 balloon_inflate(struct balloon_xenbus_softc *sc, size_t tpages)
368 {
369 	int rpages, ret;
370 	paddr_t pa;
371 	struct balloon_page_entry *bpg_entry;
372 	xen_pfn_t *mfn_list = sc->sc_mfn_list;
373 
374 	struct xen_memory_reservation reservation = {
375 		.mem_flags = 0,
376 		.extent_order = 0,
377 		.domid        = DOMID_SELF
378 	};
379 
380 	KASSERT(tpages > 0);
381 	KASSERT(tpages <= BALLOON_DELTA);
382 
383 	memset(mfn_list, 0, BALLOON_DELTA * sizeof(*mfn_list));
384 
385 	/* allocate pages that will be given to Hypervisor */
386 	for (rpages = 0; rpages < tpages; rpages++) {
387 
388 		bpg_entry = pool_cache_get(sc->bpge_pool, PR_WAITOK);
389 		if (bpg_entry == NULL) {
390 			/* failed reserving a page for balloon */
391 			break;
392 		}
393 
394 		pa = VM_PAGE_TO_PHYS(bpg_entry->pg);
395 
396 		mfn_list[rpages] = xpmap_ptom(pa) >> PAGE_SHIFT;
397 
398 		/* Invalidate pg */
399 		xpmap_ptom_unmap(pa);
400 
401 		SLIST_INSERT_HEAD(&balloon_sc->balloon_page_entries,
402 				  bpg_entry, entry);
403 		balloon_sc->balloon_num_page_entries++;
404 	}
405 
406 	/* Hand over pages to Hypervisor */
407 	set_xen_guest_handle(reservation.extent_start, mfn_list);
408 	reservation.nr_extents = rpages;
409 
410 	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
411 				   &reservation);
412 	if (ret != rpages) {
413 		/*
414 		 * we are in bad shape: the operation failed for certain
415 		 * MFNs. As the API does not allow us to know which frame
416 		 * numbers were erroneous, we cannot really recover safely.
417 		 */
418 		panic("%s: decrease reservation failed: was %d, "
419 		    "returned %d", device_xname(sc->sc_dev), rpages, ret);
420 	}
421 
422 #if BALLOONDEBUG
423 	device_printf(sc->sc_dev, "inflate %zu => inflated by %d\n",
424 	    tpages, rpages);
425 #endif
426 	return rpages;
427 }
428 
429 /*
430  * Deflate balloon. Pages are given back to domain's memory.
431  */
432 static size_t
433 balloon_deflate(struct balloon_xenbus_softc *sc, size_t tpages)
434 {
435 	int rpages, s, ret;
436 	paddr_t pa;
437 	struct balloon_page_entry *bpg_entry;
438 	xen_pfn_t *mfn_list = sc->sc_mfn_list;
439 
440 	struct xen_memory_reservation reservation = {
441 		.mem_flags = 0,
442 		.extent_order = 0,
443 		.domid        = DOMID_SELF
444 	};
445 
446 	KASSERT(tpages > 0);
447 	KASSERT(tpages <= BALLOON_DELTA);
448 
449 	memset(mfn_list, 0, BALLOON_DELTA * sizeof(*mfn_list));
450 
451 #ifndef UVM_HOTPLUG
452 	/*
453 	 * If the list is empty, we are deflating the balloon beyond empty. This
454 	 * is currently unsupported, as it would require dynamically adding
455 	 * new memory pages inside uvm(9) and instructing pmap(9) on how to
456 	 * handle them. For now, we clip the reservation to what we can
457 	 * manage, i.e. the remaining bpg entries in the SLIST.
458 	 * XXX find a way to hotplug memory through uvm(9)/pmap(9).
459 	 */
460 	if (tpages > sc->balloon_num_page_entries) {
461 		device_printf(sc->sc_dev,
462 		    "memory 'hot-plug' unsupported - clipping "
463 		    "reservation %zu => %zu pages.\n",
464 		    tpages, sc->balloon_num_page_entries);
465 		tpages = sc->balloon_num_page_entries;
466 	}
467 #endif
468 
469 	/* reclaim pages from balloon */
470 	set_xen_guest_handle(reservation.extent_start, mfn_list);
471 	reservation.nr_extents = tpages;
472 
473 	ret = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);
474 
475 	if (ret < 0) {
476 		panic("%s: increase reservation failed, ret %d",
477 		    device_xname(sc->sc_dev), ret);
478 	}
479 
480 	if (ret != tpages) {
481 		device_printf(sc->sc_dev,
482 		    "increase reservation incomplete: was %zu, "
483 		    "returned %d\n", tpages, ret);
484 	}
485 
486 	/* plug pages back into memory through bpge entries */
487 	for (rpages = 0; rpages < ret; rpages++) {
488 #ifdef UVM_HOTPLUG
489 		extern paddr_t pmap_pa_end;
490 		if (sc->balloon_num_page_entries == 0) { /*XXX: consolidate */
491 			/* "hot-plug": Stick it at the end of memory */
492 			pa = pmap_pa_end;
493 
494 			/* P2M update */
495 #if defined(_LP64) || defined(PAE)
496 			atomic_add_64(&pmap_pa_end, PAGE_SIZE);
497 #else
498 			atomic_add_32(&pmap_pa_end, PAGE_SIZE);
499 #endif
500 			s = splvm();
501 			xpmap_ptom_map(pa, ptoa(mfn_list[rpages]));
502 			xpq_queue_machphys_update(ptoa(mfn_list[rpages]), pa);
503 			xpq_flush_queue();
504 			splx(s);
505 
506 			if (uvm_physseg_plug(atop(pa), 1, NULL) == false) {
507 				/* Undo P2M */
508 				s = splvm();
509 				xpmap_ptom_unmap(pa);
510 				xpq_queue_machphys_update(ptoa(mfn_list[rpages]), 0);
511 				xpq_flush_queue();
512 				splx(s);
513 #if defined(_LP64) || defined(PAE)
514 				atomic_add_64(&pmap_pa_end, -PAGE_SIZE);
515 #else
516 				atomic_add_32(&pmap_pa_end, -PAGE_SIZE);
517 #endif
518 				break;
519 			}
520 			continue;
521 		}
522 #else
523 		if (sc->balloon_num_page_entries == 0) {
524 			/*
525 			 * XXX This is the case where extra "hot-plug"
526 			 * mem w.r.t boot comes in
527 			 */
528 			device_printf(sc->sc_dev,
529 			    "List empty. Cannot be collapsed further!\n");
530 			break;
531 		}
532 #endif
533 		bpg_entry = SLIST_FIRST(&balloon_sc->balloon_page_entries);
534 		SLIST_REMOVE_HEAD(&balloon_sc->balloon_page_entries, entry);
535 		balloon_sc->balloon_num_page_entries--;
536 
537 		/* Update P->M */
538 		pa = VM_PAGE_TO_PHYS(bpg_entry->pg);
539 
540 		s = splvm();
541 		xpmap_ptom_map(pa, ptoa(mfn_list[rpages]));
542 		xpq_queue_machphys_update(ptoa(mfn_list[rpages]), pa);
543 		xpq_flush_queue();
544 		splx(s);
545 
546 		pool_cache_put(sc->bpge_pool, bpg_entry);
547 	}
548 
549 #if BALLOONDEBUG
550 	device_printf(sc->sc_dev, "deflate %zu => deflated by %d\n",
551 	    tpages, rpages);
552 #endif
553 	return rpages;
554 }
555 
556 /*
557  * The balloon thread is responsible for handling inflate/deflate balloon
558  * requests for the current domain given the new "target" value.
559  */
560 static void
561 balloon_thread(void *cookie)
562 {
563 	int ret;
564 	size_t current, diff, target;
565 	struct balloon_xenbus_softc *sc = cookie;
566 
567 	for/*ever*/ (;;) {
568 		current = xenmem_get_currentreservation();
569 
570 		/*
571 		 * We assume that balloon_xenbus_watcher() and
572 		 * sysctl(9) handlers checked the sanity of the
573 		 * new target value.
574 		 */
575 		mutex_enter(&sc->balloon_mtx);
576 		target = sc->balloon_target;
577 		if (current != target) {
578 			/*
579 			 * There is work to do. Inflate/deflate in steps of
580 			 * at most BALLOON_DELTA pages per direction.
581 			 */
582 			mutex_exit(&sc->balloon_mtx);
583 			if (current < target) {
584 				diff = MIN(target - current, BALLOON_DELTA);
585 				ret = balloon_deflate(sc, diff);
586 			} else {
587 				diff = MIN(current - target, BALLOON_DELTA);
588 				ret = balloon_inflate(sc, diff);
589 			}
590 
591 			if (ret != diff) {
592 				/*
593 				 * Something went wrong during the operation.
594 				 * Log the error, then feed the current value back
595 				 * into target so that the thread goes back to
596 				 * waiting for the next change.
597 				 */
598 				device_printf(sc->sc_dev,
599 				    "WARNING: balloon could not reach target "
600 				    "%zu (current %zu)\n",
601 				    target, current);
602 				current = xenmem_get_currentreservation();
603 				mutex_enter(&sc->balloon_mtx);
604 				sc->balloon_target = current;
605 				mutex_exit(&sc->balloon_mtx);
606 			}
607 		} else {
608 			/* no need for change -- wait for a signal */
609 			cv_wait(&sc->balloon_cv, &sc->balloon_mtx);
610 			mutex_exit(&sc->balloon_mtx);
611 		}
612 	}
613 }
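/*
 * Worked example of the stepping logic above (4 KiB pages, so
 * BALLOON_DELTA == 512): with current == 262144 pages (1 GiB) and
 * balloon_target == 131072 pages (512 MiB), each loop iteration inflates
 * the balloon by MIN(262144 - 131072, 512) == 512 pages (2 MiB), and the
 * thread keeps iterating (re-reading the reservation each time) until
 * current reaches the target or an operation falls short.
 */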
614 
615 /*
616  * Handler called when the memory/target value changes inside Xenstore.
617  * All sanity checks must also happen in this handler, as it is the common
618  * entry point where the controller domain schedules balloon operations.
619  */
620 static void
621 balloon_xenbus_watcher(struct xenbus_watch *watch, const char **vec,
622 		       unsigned int len)
623 {
624 	size_t new_target;
625 	uint64_t target_kb, target_max, target_min;
626 
627 	target_kb = balloon_xenbus_read_target();
628 	if (target_kb == 0) {
629 		/* bogus -- just return */
630 		return;
631 	}
632 
633 	mutex_enter(&balloon_sc->balloon_mtx);
634 	target_min = balloon_sc->balloon_res_min;
635 	mutex_exit(&balloon_sc->balloon_mtx);
636 	if (target_kb < target_min) {
637 		device_printf(balloon_sc->sc_dev,
638 		    "new target %"PRIu64" is below min %"PRIu64"\n",
639 		    target_kb, target_min);
640 		return;
641 	}
642 
643 	target_max = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
644 	if (target_kb > target_max) {
645 		/*
646 		 * Should not happen. Hypervisor should block balloon
647 		 * requests above mem-max.
648 		 */
649 		device_printf(balloon_sc->sc_dev,
650 		    "new target %"PRIu64" is above max %"PRIu64"\n",
651 		    target_kb, target_max);
652 		return;
653 	}
654 
655 	new_target = BALLOON_KB_TO_PAGES(target_kb);
656 
657 	device_printf(balloon_sc->sc_dev,
658 	    "current reservation: %zu pages => target: %zu pages\n",
659 	    xenmem_get_currentreservation(), new_target);
660 
661 	/* Only update target if its value changes */
662 	mutex_enter(&balloon_sc->balloon_mtx);
663 	if (balloon_sc->balloon_target != new_target) {
664 		balloon_sc->balloon_target = new_target;
665 		cv_signal(&balloon_sc->balloon_cv);
666 	}
667 	mutex_exit(&balloon_sc->balloon_mtx);
668 
669 	return;
670 }
671 
672 /*
673  * sysctl(9) stuff
674  */
675 
676 /* routine to control the minimum memory reserved for the domain */
677 static int
678 sysctl_kern_xen_balloon_min(SYSCTLFN_ARGS)
679 {
680 	struct sysctlnode node;
681 	u_quad_t newval;
682 	int error;
683 
684 	node = *rnode;
685 	node.sysctl_data = &newval;
686 
687 	mutex_enter(&balloon_sc->balloon_mtx);
688 	newval = balloon_sc->balloon_res_min;
689 	mutex_exit(&balloon_sc->balloon_mtx);
690 
691 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
692 	if (error || newp == NULL)
693 		return error;
694 
695 	/* Safeguard value (in KiB): refuse to go below. */
696 	if (newval < BALLOON_PAGES_TO_KB(XEN_RESERVATION_MIN)) {
697 		device_printf(balloon_sc->sc_dev,
698 		    "cannot set min below minimum safe value (%"PRIu64" KiB)\n",
699 		    BALLOON_PAGES_TO_KB(XEN_RESERVATION_MIN));
700 		return EPERM;
701 	}
702 
703 	mutex_enter(&balloon_sc->balloon_mtx);
704 	if (balloon_sc->balloon_res_min != newval)
705 		balloon_sc->balloon_res_min = newval;
706 	mutex_exit(&balloon_sc->balloon_mtx);
707 
708 	return 0;
709 }
710 
711 /* Returns the maximum memory reservation of the domain */
712 static int
713 sysctl_kern_xen_balloon_max(SYSCTLFN_ARGS)
714 {
715 	struct sysctlnode node;
716 	u_quad_t node_val;
717 
718 	node = *rnode;
719 
720 	node_val = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
721 	node.sysctl_data = &node_val;
722 	return sysctl_lookup(SYSCTLFN_CALL(&node));
723 }
724 
725 /* Returns the current memory reservation of the domain */
726 static int
727 sysctl_kern_xen_balloon_current(SYSCTLFN_ARGS)
728 {
729 	struct sysctlnode node;
730 	u_quad_t node_val;
731 
732 	node = *rnode;
733 
734 	node_val = BALLOON_PAGES_TO_KB(xenmem_get_currentreservation());
735 	node.sysctl_data = &node_val;
736 	return sysctl_lookup(SYSCTLFN_CALL(&node));
737 }
738 
739 /*
740  * Returns the target memory reservation of the domain.
741  * When reading, this sysctl will return the value of the balloon_target
742  * variable, converted into KiB.
743  * When used for writing, it will write the new memory/target value
744  * in XenStore, but will not update the balloon_target variable directly.
745  * This will be done by the Xenbus watch handler, balloon_xenbus_watcher().
746  */
747 static int
748 sysctl_kern_xen_balloon_target(SYSCTLFN_ARGS)
749 {
750 	struct sysctlnode node;
751 	u_quad_t newval, res_min, res_max;
752 	int error;
753 
754 	node = *rnode;
755 	node.sysctl_data = &newval;
756 
757 	mutex_enter(&balloon_sc->balloon_mtx);
758 	newval = BALLOON_PAGES_TO_KB(balloon_sc->balloon_target);
759 	res_min = balloon_sc->balloon_res_min;
760 	mutex_exit(&balloon_sc->balloon_mtx);
761 
762 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
763 	if (newp == NULL || error != 0) {
764 		return error;
765 	}
766 
767 	/*
768 	 * Sanity check the new size.
769 	 * We should not balloon below the minimum reservation
770 	 * set by the domain, nor above the maximum reservation set
771 	 * by the domain controller.
772 	 * Note: the domain is not supposed to receive balloon requests
773 	 * when they are above the maximum reservation, but better be safe
774 	 * than sorry.
775 	 */
776 	res_max = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
777 	if (newval < res_min || newval > res_max) {
778 #if BALLOONDEBUG
779 		device_printf(balloon_sc->sc_dev,
780 		    "new value out of bounds: %"PRIu64"\n", newval);
781 		device_printf(balloon_sc->sc_dev,
782 		    "min %"PRIu64", max %"PRIu64"\n", res_min, res_max);
783 #endif
784 		return EPERM;
785 	}
786 
787 	/*
788 	 * Write new value inside Xenstore. This will fire the memory/target
789 	 * watch handler, balloon_xenbus_watcher().
790 	 */
791 	balloon_xenbus_write_target(newval);
792 
793 	return 0;
794 }
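/*
 * Illustrative round trip (values assumed): on a domain with
 * balloon_res_min == 65536 KiB and a 1 GiB maximum reservation, writing
 * 32768 to machdep.xen.balloon.target fails with EPERM (below res_min),
 * while writing 524288 is accepted, stored in memory/target, and picked up
 * by balloon_xenbus_watcher(), which updates balloon_target to
 * BALLOON_KB_TO_PAGES(524288) == 131072 pages (with 4 KiB pages) and wakes
 * the balloon thread.
 */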
795 
796 /* sysctl(9) nodes creation */
797 static void
798 sysctl_kern_xen_balloon_setup(struct balloon_xenbus_softc *sc)
799 {
800 	const struct sysctlnode *node = NULL;
801 	struct sysctllog **clog = &sc->sc_log;
802 
803 	sysctl_createv(clog, 0, NULL, &node,
804 	    CTLFLAG_PERMANENT,
805 	    CTLTYPE_NODE, "machdep", NULL,
806 	    NULL, 0, NULL, 0,
807 	    CTL_MACHDEP, CTL_EOL);
808 
809 	sysctl_createv(clog, 0, &node, &node,
810 	    CTLFLAG_PERMANENT,
811 	    CTLTYPE_NODE, "xen",
812 	    SYSCTL_DESCR("Xen top level node"),
813 	    NULL, 0, NULL, 0,
814 	    CTL_CREATE, CTL_EOL);
815 
816 	sysctl_createv(clog, 0, &node, &node,
817 	    CTLFLAG_PERMANENT,
818 	    CTLTYPE_NODE, "balloon",
819 	    SYSCTL_DESCR("Balloon details"),
820 	    NULL, 0, NULL, 0,
821 	    CTL_CREATE, CTL_EOL);
822 
823 	sysctl_createv(clog, 0, &node, NULL,
824 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
825 	    CTLTYPE_QUAD, "current",
826 	    SYSCTL_DESCR("Domain's current memory reservation from "
827 		"hypervisor, in KiB."),
828 	    sysctl_kern_xen_balloon_current, 0, NULL, 0,
829 	    CTL_CREATE, CTL_EOL);
830 
831 	sysctl_createv(clog, 0, &node, NULL,
832 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
833 	    CTLTYPE_QUAD, "target",
834 	    SYSCTL_DESCR("Target memory reservation for domain, in KiB."),
835 	    sysctl_kern_xen_balloon_target, 0, NULL, 0,
836 	    CTL_CREATE, CTL_EOL);
837 
838 	sysctl_createv(clog, 0, &node, NULL,
839 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
840 	    CTLTYPE_QUAD, "min",
841 	    SYSCTL_DESCR("Minimum amount of memory the domain "
842 		"reserves, in KiB."),
843 	    sysctl_kern_xen_balloon_min, 0, NULL, 0,
844 	    CTL_CREATE, CTL_EOL);
845 
846 	sysctl_createv(clog, 0, &node, NULL,
847 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
848 	    CTLTYPE_QUAD, "max",
849 	    SYSCTL_DESCR("Maximum amount of memory the domain "
850 		"can use, in KiB."),
851 	    sysctl_kern_xen_balloon_max, 0, NULL, 0,
852 	    CTL_CREATE, CTL_EOL);
853 }
854