1/*
 * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD$");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/uio.h>
33#include <sys/bus.h>
34#include <sys/malloc.h>
35#include <sys/kernel.h>
36#include <sys/lock.h>
37#include <sys/mutex.h>
38#include <sys/rwlock.h>
39#include <sys/selinfo.h>
40#include <sys/poll.h>
41#include <sys/conf.h>
42#include <sys/fcntl.h>
43#include <sys/ioccom.h>
44#include <sys/rman.h>
45#include <sys/tree.h>
46#include <sys/module.h>
47#include <sys/proc.h>
48#include <sys/bitset.h>
49
50#include <vm/vm.h>
51#include <vm/vm_param.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_kern.h>
54#include <vm/vm_page.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57#include <vm/vm_pager.h>
58
59#include <machine/md_var.h>
60
61#include <xen/xen-os.h>
62#include <xen/hypervisor.h>
63#include <xen/privcmd.h>
64#include <xen/error.h>
65
/* Malloc type used for every allocation made by the privcmd driver. */
MALLOC_DEFINE(M_PRIVCMD, "privcmd_dev", "Xen privcmd user-space device");
67
/*
 * Per-mmap state. One instance is allocated by privcmd_mmap_single() and is
 * used as the handle of the device pager object it creates; it is released
 * by privcmd_pg_dtor().
 */
struct privcmd_map {
	/* VM object returned by cdev_pager_allocate() for this mapping. */
	vm_object_t mem;
	/* Size of the mapping, in pages. */
	vm_size_t size;
	/* Resource obtained from xenmem_alloc() that backs the mapping. */
	struct resource *pseudo_phys_res;
	/* Resource id handed to xenmem_alloc()/xenmem_free(). */
	int pseudo_phys_res_id;
	/* Start address of pseudo_phys_res (rman_get_start()). */
	vm_paddr_t phys_base_addr;
	/* Set once IOCTL_PRIVCMD_MMAPBATCH has populated the range. */
	boolean_t mapped;
	/* One bit per page; a set bit marks a page whose mapping failed. */
	BITSET_DEFINE_VAR() *err;
};
77
78static d_ioctl_t     privcmd_ioctl;
79static d_mmap_single_t	privcmd_mmap_single;
80
81static struct cdevsw privcmd_devsw = {
82	.d_version = D_VERSION,
83	.d_ioctl = privcmd_ioctl,
84	.d_mmap_single = privcmd_mmap_single,
85	.d_name = "privcmd",
86};
87
88static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
89    vm_ooffset_t foff, struct ucred *cred, u_short *color);
90static void privcmd_pg_dtor(void *handle);
91static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
92    int prot, vm_page_t *mres);
93
94static struct cdev_pager_ops privcmd_pg_ops = {
95	.cdev_pg_fault = privcmd_pg_fault,
96	.cdev_pg_ctor =	privcmd_pg_ctor,
97	.cdev_pg_dtor =	privcmd_pg_dtor,
98};
99
/*
 * The privcmd newbus device; recorded by privcmd_probe() and passed to
 * xenmem_alloc()/xenmem_free() when handling mappings.
 */
static device_t privcmd_dev = NULL;
101
102/*------------------------- Privcmd Pager functions --------------------------*/
103static int
104privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
105    vm_ooffset_t foff, struct ucred *cred, u_short *color)
106{
107
108	return (0);
109}
110
111static void
112privcmd_pg_dtor(void *handle)
113{
114	struct xen_remove_from_physmap rm = { .domid = DOMID_SELF };
115	struct privcmd_map *map = handle;
116	int error;
117	vm_size_t i;
118	vm_page_t m;
119
120	/*
121	 * Remove the mappings from the used pages. This will remove the
122	 * underlying p2m bindings in Xen second stage translation.
123	 */
124	if (map->mapped == true) {
125		VM_OBJECT_WLOCK(map->mem);
126retry:
127		for (i = 0; i < map->size; i++) {
128			m = vm_page_lookup(map->mem, i);
129			if (m == NULL)
130				continue;
131			if (vm_page_sleep_if_busy(m, "pcmdum"))
132				goto retry;
133			cdev_pager_free_page(map->mem, m);
134		}
135		VM_OBJECT_WUNLOCK(map->mem);
136
137		for (i = 0; i < map->size; i++) {
138			rm.gpfn = atop(map->phys_base_addr) + i;
139			HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm);
140		}
141		free(map->err, M_PRIVCMD);
142	}
143
144	error = xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
145	    map->pseudo_phys_res);
146	KASSERT(error == 0, ("Unable to release memory resource: %d", error));
147
148	free(map, M_PRIVCMD);
149}
150
151static int
152privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
153    int prot, vm_page_t *mres)
154{
155	struct privcmd_map *map = object->handle;
156	vm_pindex_t pidx;
157	vm_page_t page, oldm;
158
159	if (map->mapped != true)
160		return (VM_PAGER_FAIL);
161
162	pidx = OFF_TO_IDX(offset);
163	if (pidx >= map->size || BIT_ISSET(map->size, pidx, map->err))
164		return (VM_PAGER_FAIL);
165
166	page = PHYS_TO_VM_PAGE(map->phys_base_addr + offset);
167	if (page == NULL)
168		return (VM_PAGER_FAIL);
169
170	KASSERT((page->flags & PG_FICTITIOUS) != 0,
171	    ("not fictitious %p", page));
172	KASSERT(page->wire_count == 1, ("wire_count not 1 %p", page));
173	KASSERT(vm_page_busied(page) == 0, ("page %p is busy", page));
174
175	if (*mres != NULL) {
176		oldm = *mres;
177		vm_page_lock(oldm);
178		vm_page_free(oldm);
179		vm_page_unlock(oldm);
180		*mres = NULL;
181	}
182
183	vm_page_insert(page, object, pidx);
184	page->valid = VM_PAGE_BITS_ALL;
185	vm_page_xbusy(page);
186	*mres = page;
187	return (VM_PAGER_OK);
188}
189
190/*----------------------- Privcmd char device methods ------------------------*/
191static int
192privcmd_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
193    vm_object_t *object, int nprot)
194{
195	struct privcmd_map *map;
196
197	map = malloc(sizeof(*map), M_PRIVCMD, M_WAITOK | M_ZERO);
198
199	map->size = OFF_TO_IDX(size);
200	map->pseudo_phys_res_id = 0;
201
202	map->pseudo_phys_res = xenmem_alloc(privcmd_dev,
203	    &map->pseudo_phys_res_id, size);
204	if (map->pseudo_phys_res == NULL) {
205		free(map, M_PRIVCMD);
206		return (ENOMEM);
207	}
208
209	map->phys_base_addr = rman_get_start(map->pseudo_phys_res);
210	map->mem = cdev_pager_allocate(map, OBJT_MGTDEVICE, &privcmd_pg_ops,
211	    size, nprot, *offset, NULL);
212	if (map->mem == NULL) {
213		xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
214		    map->pseudo_phys_res);
215		free(map, M_PRIVCMD);
216		return (ENOMEM);
217	}
218
219	*object = map->mem;
220
221	return (0);
222}
223
224static int
225privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg,
226	      int mode, struct thread *td)
227{
228	int error, i;
229
230	switch (cmd) {
231	case IOCTL_PRIVCMD_HYPERCALL: {
232		struct ioctl_privcmd_hypercall *hcall;
233
234		hcall = (struct ioctl_privcmd_hypercall *)arg;
235#ifdef __amd64__
236		/*
237		 * The hypervisor page table walker will refuse to access
238		 * user-space pages if SMAP is enabled, so temporary disable it
239		 * while performing the hypercall.
240		 */
241		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
242			stac();
243#endif
244		error = privcmd_hypercall(hcall->op, hcall->arg[0],
245		    hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]);
246#ifdef __amd64__
247		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
248			clac();
249#endif
250		if (error >= 0) {
251			hcall->retval = error;
252			error = 0;
253		} else {
254			error = xen_translate_error(error);
255			hcall->retval = 0;
256		}
257		break;
258	}
259	case IOCTL_PRIVCMD_MMAPBATCH: {
260		struct ioctl_privcmd_mmapbatch *mmap;
261		vm_map_t map;
262		vm_map_entry_t entry;
263		vm_object_t mem;
264		vm_pindex_t pindex;
265		vm_prot_t prot;
266		boolean_t wired;
267		struct xen_add_to_physmap_range add;
268		xen_ulong_t *idxs;
269		xen_pfn_t *gpfns;
270		int *errs, index;
271		struct privcmd_map *umap;
272		uint16_t num;
273
274		mmap = (struct ioctl_privcmd_mmapbatch *)arg;
275
276		if ((mmap->num == 0) ||
277		    ((mmap->addr & PAGE_MASK) != 0)) {
278			error = EINVAL;
279			break;
280		}
281
282		map = &td->td_proc->p_vmspace->vm_map;
283		error = vm_map_lookup(&map, mmap->addr, VM_PROT_NONE, &entry,
284		    &mem, &pindex, &prot, &wired);
285		if (error != KERN_SUCCESS) {
286			error = EINVAL;
287			break;
288		}
289		if ((entry->start != mmap->addr) ||
290		    (entry->end != mmap->addr + (mmap->num * PAGE_SIZE))) {
291			vm_map_lookup_done(map, entry);
292			error = EINVAL;
293			break;
294		}
295		vm_map_lookup_done(map, entry);
296		if ((mem->type != OBJT_MGTDEVICE) ||
297		    (mem->un_pager.devp.ops != &privcmd_pg_ops)) {
298			error = EINVAL;
299			break;
300		}
301		umap = mem->handle;
302
303		add.domid = DOMID_SELF;
304		add.space = XENMAPSPACE_gmfn_foreign;
305		add.foreign_domid = mmap->dom;
306
307		/*
308		 * The 'size' field in the xen_add_to_physmap_range only
309		 * allows for UINT16_MAX mappings in a single hypercall.
310		 */
311		num = MIN(mmap->num, UINT16_MAX);
312
313		idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK);
314		gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK);
315		errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK);
316
317		set_xen_guest_handle(add.idxs, idxs);
318		set_xen_guest_handle(add.gpfns, gpfns);
319		set_xen_guest_handle(add.errs, errs);
320
321		/* Allocate a bitset to store broken page mappings. */
322		umap->err = BITSET_ALLOC(mmap->num, M_PRIVCMD,
323		    M_WAITOK | M_ZERO);
324
325		for (index = 0; index < mmap->num; index += num) {
326			num = MIN(mmap->num - index, UINT16_MAX);
327			add.size = num;
328
329			error = copyin(&mmap->arr[index], idxs,
330			    sizeof(idxs[0]) * num);
331			if (error != 0)
332				goto mmap_out;
333
334			for (i = 0; i < num; i++)
335				gpfns[i] = atop(umap->phys_base_addr +
336				    (i + index) * PAGE_SIZE);
337
338			bzero(errs, sizeof(*errs) * num);
339
340			error = HYPERVISOR_memory_op(
341			    XENMEM_add_to_physmap_range, &add);
342			if (error != 0) {
343				error = xen_translate_error(error);
344				goto mmap_out;
345			}
346
347			for (i = 0; i < num; i++) {
348				if (errs[i] != 0) {
349					errs[i] = xen_translate_error(errs[i]);
350
351					/* Mark the page as invalid. */
352					BIT_SET(mmap->num, index + i,
353					    umap->err);
354				}
355			}
356
357			error = copyout(errs, &mmap->err[index],
358			    sizeof(errs[0]) * num);
359			if (error != 0)
360				goto mmap_out;
361		}
362
363		umap->mapped = true;
364
365mmap_out:
366		free(idxs, M_PRIVCMD);
367		free(gpfns, M_PRIVCMD);
368		free(errs, M_PRIVCMD);
369		if (!umap->mapped)
370			free(umap->err, M_PRIVCMD);
371
372		break;
373	}
374
375	default:
376		error = ENOSYS;
377		break;
378	}
379
380	return (error);
381}
382
383/*------------------ Private Device Attachment Functions  --------------------*/
384static void
385privcmd_identify(driver_t *driver, device_t parent)
386{
387
388	KASSERT(xen_domain(),
389	    ("Trying to attach privcmd device on non Xen domain"));
390
391	if (BUS_ADD_CHILD(parent, 0, "privcmd", 0) == NULL)
392		panic("unable to attach privcmd user-space device");
393}
394
395static int
396privcmd_probe(device_t dev)
397{
398
399	privcmd_dev = dev;
400	device_set_desc(dev, "Xen privileged interface user-space device");
401	return (BUS_PROBE_NOWILDCARD);
402}
403
404static int
405privcmd_attach(device_t dev)
406{
407
408	make_dev_credf(MAKEDEV_ETERNAL, &privcmd_devsw, 0, NULL, UID_ROOT,
409	    GID_WHEEL, 0600, "xen/privcmd");
410	return (0);
411}
412
413/*-------------------- Private Device Attachment Data  -----------------------*/
/*
 * Device methods implemented by the privcmd driver: identify, probe and
 * attach. No other device method is listed.
 */
static device_method_t privcmd_methods[] = {
	DEVMETHOD(device_identify,	privcmd_identify),
	DEVMETHOD(device_probe,		privcmd_probe),
	DEVMETHOD(device_attach,	privcmd_attach),

	DEVMETHOD_END
};
421
/*
 * Driver description handed to DRIVER_MODULE(): the driver name, its method
 * table and a zero third field (presumably the per-device softc size, which
 * this driver would not need; confirm against driver_t).
 */
static driver_t privcmd_driver = {
	"privcmd",
	privcmd_methods,
	0,
};
427
/* Device class passed to DRIVER_MODULE() below. */
devclass_t privcmd_devclass;

/*
 * Register the privcmd driver with "xenpv" named as its bus and declare a
 * module dependency on xenpv (version arguments 1, 1, 1).
 */
DRIVER_MODULE(privcmd, xenpv, privcmd_driver, privcmd_devclass, 0, 0);
MODULE_DEPEND(privcmd, xenpv, 1, 1, 1);
432