// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

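/*
 * Look up the current compatibility IOAS under the xarray lock and take a
 * reference on it. Returns ERR_PTR(-ENODEV) if no compat IOAS has been
 * selected or if it is in the middle of being destroyed.
 */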
static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
		goto out_unlock;
	ioas = ictx->vfio_ioas;
out_unlock:
	xa_unlock(&ictx->objects);
	return ioas;
}

/**
 * iommufd_vfio_compat_ioas_get_id - Get the ID of the compatibility IOAS
 * @ictx: Context to operate on
 * @out_ioas_id: The IOAS ID of the compatibility IOAS
 *
 * Return the ID of the current compatibility IOAS. The ID can be passed into
 * other functions that take an ioas_id.
 */
int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	struct iommufd_ioas *ioas;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	*out_ioas_id = ioas->obj.id;
	iommufd_put_object(ictx, &ioas->obj);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
 * @ictx: Context to operate on
 *
 * This allows selecting VFIO_NOIOMMU_IOMMU and blocks the normal IOMMU types.
 */
int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
{
	int ret;

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas) {
		ictx->no_iommu_mode = 1;
		ret = 0;
	} else {
		ret = -EINVAL;
	}
	xa_unlock(&ictx->objects);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal IOAS; this does nothing
 * if an existing IOAS has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = NULL;
	int ret;

	ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	xa_lock(&ictx->objects);
	/*
	 * VFIO does not allow a container to be used for both iommu and
	 * no-iommu operation.
	 */
	if (ictx->no_iommu_mode) {
		ret = -EINVAL;
		goto out_abort;
	}

	if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		ret = 0;
		iommufd_put_object(ictx, &ictx->vfio_ioas->obj);
		goto out_abort;
	}
	ictx->vfio_ioas = ioas;
	xa_unlock(&ictx->objects);

	/*
	 * An automatically created compat IOAS is treated as a userspace
	 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
	 * and if not manually destroyed it will be destroyed automatically
	 * at iommufd release.
	 */
	iommufd_object_finalize(ictx, &ioas->obj);
	return 0;

out_abort:
	xa_unlock(&ictx->objects);
	iommufd_object_abort(ictx, &ioas->obj);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);

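/*
 * Handler for the IOMMU_VFIO_IOAS ioctl: report the ID of the current
 * compatibility IOAS, replace it with an existing IOAS, or clear it so that
 * no compatibility IOAS is selected.
 */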
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
	struct iommu_vfio_ioas *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;

	if (cmd->__reserved)
		return -EOPNOTSUPP;
	switch (cmd->op) {
	case IOMMU_VFIO_IOAS_GET:
		ioas = get_compat_ioas(ucmd->ictx);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		cmd->ioas_id = ioas->obj.id;
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

	case IOMMU_VFIO_IOAS_SET:
		ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = ioas;
		xa_unlock(&ucmd->ictx->objects);
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return 0;

	case IOMMU_VFIO_IOAS_CLEAR:
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = NULL;
		xa_unlock(&ucmd->ictx->objects);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

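/*
 * Emulate VFIO_IOMMU_MAP_DMA on the compat IOAS. Only the READ/WRITE flags
 * are supported and the pages are mapped at the exact IOVA requested by
 * userspace.
 */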
static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map map;
	int iommu_prot = IOMMU_CACHE;
	struct iommufd_ioas *ioas;
	unsigned long iova;
	int rc;

	if (copy_from_user(&map, arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~supported_flags)
		return -EINVAL;

	if (map.flags & VFIO_DMA_MAP_FLAG_READ)
		iommu_prot |= IOMMU_READ;
	if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
		iommu_prot |= IOMMU_WRITE;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * Maps created through the legacy interface always use VFIO compatible
	 * rlimit accounting. If the user wishes to use the faster user based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = map.iova;
	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
				 map.size, iommu_prot, 0);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

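/*
 * Emulate VFIO_IOMMU_UNMAP_DMA on the compat IOAS, either for a single IOVA
 * range or, with VFIO_DMA_UNMAP_FLAG_ALL, for everything mapped. The number
 * of bytes unmapped is returned to userspace in the size field.
 */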
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and end of the requested
			 * range. If the start IOVA is 0 then no cut is needed
			 * at the start.
			 */
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

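/*
 * Answer VFIO_DMA_CC_IOMMU: return 1 only if every paging domain attached to
 * the compat IOAS enforces cache coherency, otherwise 0.
 */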
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommufd_ioas *ioas;
	int rc = 1;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
		if (!hwpt_paging->enforce_cache_coherency) {
			rc = 0;
			break;
		}
	}
	mutex_unlock(&ioas->mutex);

	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

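/*
 * Emulate VFIO_CHECK_EXTENSION. Only the type1/type1v2 family and the
 * extensions iommufd can honour report support; obsolete or unsupported
 * extensions report 0.
 */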
static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
					unsigned long type)
{
	switch (type) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;

	case VFIO_NOIOMMU_IOMMU:
		return IS_ENABLED(CONFIG_VFIO_NOIOMMU);

	case VFIO_DMA_CC_IOMMU:
		return iommufd_vfio_cc_iommu(ictx);

	/*
	 * This is obsolete, and to be removed from VFIO. It was an incomplete
	 * idea that got merged.
	 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
	 */
	case VFIO_TYPE1_NESTING_IOMMU:
		return 0;

	/*
	 * VFIO_DMA_MAP_FLAG_VADDR
	 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
	 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
	 *
	 * It is hard to see how this could be implemented safely.
	 */
	case VFIO_UPDATE_VADDR:
	default:
		return 0;
	}
}

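/*
 * Emulate VFIO_SET_IOMMU. Accepts VFIO_TYPE1_IOMMU and VFIO_TYPE1v2_IOMMU,
 * or VFIO_NOIOMMU_IOMMU when the context has been put into no-iommu mode.
 */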
static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
	bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
	struct iommufd_ioas *ioas = NULL;
	int rc = 0;

	/*
	 * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
	 * other ioctls. We let them keep working but they mostly fail since no
	 * IOAS should exist.
	 */
	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
	    no_iommu_mode) {
		if (!capable(CAP_SYS_RAWIO))
			return -EPERM;
		return 0;
	}

	if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
	    no_iommu_mode)
		return -EINVAL;

	/* VFIO fails the set_iommu if there is no group */
	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
	 * the middle of mapped ranges. This is complicated by huge page support
	 * which creates single large IOPTEs that cannot be split by the iommu
	 * driver. TYPE1 is very old at this point and likely nothing uses it,
	 * however it is simple enough to emulate by disabling the problematic
	 * large IOPTEs. Then we can safely unmap within any range.
	 */
	if (type == VFIO_TYPE1_IOMMU)
		rc = iopt_disable_large_pages(&ioas->iopt);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

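/*
 * Compute the supported IOVA page sizes as the intersection of the
 * pgsize_bitmap of every attached domain, matching what the VFIO type1
 * driver reports.
 */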
static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
	struct io_pagetable *iopt = &ioas->iopt;
	unsigned long pgsize_bitmap = ULONG_MAX;
	struct iommu_domain *domain;
	unsigned long index;

	down_read(&iopt->domains_rwsem);
	xa_for_each(&iopt->domains, index, domain)
		pgsize_bitmap &= domain->pgsize_bitmap;

	/* See vfio_update_pgsize_bitmap() */
	if (pgsize_bitmap & ~PAGE_MASK) {
		pgsize_bitmap &= PAGE_MASK;
		pgsize_bitmap |= PAGE_SIZE;
	}
	pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
	up_read(&iopt->domains_rwsem);
	return pgsize_bitmap;
}

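/*
 * Fill the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability with the usable
 * IOVA ranges, i.e. the holes between reserved regions. Returns the size the
 * capability needs; it is copied to userspace only if enough space is
 * available.
 */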
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

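/*
 * Fill the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability. iommufd has no fixed
 * mapping limit, so a large constant is advertised; see the comment on
 * .avail below.
 */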
static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here and provide a module
		 * parameter to adjust it. Since the s390 qemu userspace
		 * actually pays attention to this and needs a value bigger
		 * than U16_MAX, return U32_MAX.
		 */
		.avail = U32_MAX,
	};

	if (avail >= sizeof(cap_dma) &&
	    copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;
	return sizeof(cap_dma);
}

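/*
 * Emulate VFIO_IOMMU_GET_INFO, chaining the capability structures after the
 * fixed header in the same way the type1 driver does. If userspace did not
 * provide enough space, argsz is updated to the size needed to fetch all
 * capabilities on a retry.
 */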
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	down_read(&ioas->iopt.iova_rwsem);
	total_cap_size = sizeof(info);
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		cap_size = ALIGN(cap_size, sizeof(u64));

		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

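/*
 * Entry point for the VFIO container ioctls when the container is backed by
 * iommufd. Anything that is not emulated returns -ENOIOCTLCMD so the caller
 * can fall back or fail.
 *
 * For illustration only, a user of the compat path follows roughly the
 * classic type1 sequence (a sketch, not code from this file; buf/len are
 * placeholders):
 *
 *	ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
 *	ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (uintptr_t)buf,
 *		.iova = 0,
 *		.size = len,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */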
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
		       unsigned long arg)
{
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		return VFIO_API_VERSION;
	case VFIO_SET_IOMMU:
		return iommufd_vfio_set_iommu(ictx, arg);
	case VFIO_CHECK_EXTENSION:
		return iommufd_vfio_check_extension(ictx, arg);
	case VFIO_IOMMU_GET_INFO:
		return iommufd_vfio_iommu_get_info(ictx, uarg);
	case VFIO_IOMMU_MAP_DMA:
		return iommufd_vfio_map_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_UNMAP_DMA:
		return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_DIRTY_PAGES:
	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}