// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

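/*
 * Look up (or allocate) the folio backing @index in @inode's page cache.  On
 * first use the folio's pages are zeroed and the folio is marked up-to-date.
 * Returns the folio locked and with a reference held, or NULL on failure.
 */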
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/* TODO: Support huge pages. */
	folio = filemap_grab_folio(inode->i_mapping, index);
	if (IS_ERR_OR_NULL(folio))
		return NULL;

	/*
	 * Use the up-to-date flag to track whether or not the memory has been
	 * zeroed before being handed off to the guest.  There is no backing
	 * storage for the memory, so the folio will remain up-to-date until
	 * it's removed.
	 *
	 * TODO: Skip clearing pages when trusted firmware will do it when
	 * assigning memory to the guest.
	 */
	if (!folio_test_uptodate(folio)) {
		unsigned long nr_pages = folio_nr_pages(folio);
		unsigned long i;

		for (i = 0; i < nr_pages; i++)
			clear_highpage(folio_page(folio, i));

		folio_mark_uptodate(folio);
	}

	/*
	 * Ignore accessed, referenced, and dirty flags.  The memory is
	 * unevictable and there is no storage to write back to.
	 */
	return folio;
}

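/*
 * Open an MMU invalidation window and zap the SPTEs of every memslot bound to
 * [@start, @end) in this gmem instance.  mmu_lock is taken only if at least
 * one binding intersects the range, and remote TLBs are flushed only if a
 * mapping was actually zapped.  Must be paired with kvm_gmem_invalidate_end().
 */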
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

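/*
 * Close the invalidation window opened by kvm_gmem_invalidate_begin().  As on
 * the begin side, this is a no-op if no binding intersects [@start, @end).
 */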
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

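/*
 * Handle FALLOC_FL_PUNCH_HOLE: zap the SPTEs of every gmem instance that
 * shares the inode, drop the backing folios from the page cache, and then
 * close the invalidation window.
 */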
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

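/*
 * Handle fallocate() without FALLOC_FL_PUNCH_HOLE: populate the page cache
 * for [@offset, @offset + @len), one folio at a time.  Allocation cannot
 * extend the file and bails out early if a signal is pending.
 */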
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* The size of dedicated guest memory is immutable; allocation cannot extend the file. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (!folio) {
			r = -ENOMEM;
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

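/*
 * fallocate() handler.  Only plain allocation and FALLOC_FL_PUNCH_HOLE are
 * supported, FALLOC_FL_KEEP_SIZE is mandatory in both cases (the file size is
 * fixed at creation), and @offset and @len must be page-aligned.
 */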
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

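/*
 * Called when the last reference to the gmem file is put.  Severs all memslot
 * bindings, zaps the SPTEs installed through this file, frees the kvm_gmem
 * instance and drops the reference on the VM taken at creation.  The backing
 * memory itself lives with the inode and is freed when the inode is evicted.
 */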
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

static struct file_operations kvm_gmem_fops = {
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

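/*
 * guest_memfd mappings are marked unmovable (see __kvm_gmem_create()), so
 * folio migration should never be attempted; WARN and reject if it is.
 */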
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

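/*
 * Memory failure handler: zap the SPTEs covering the poisoned folio so the
 * guest can no longer map it, but leave the folio in the page cache.
 * kvm_gmem_get_pfn() reports -EHWPOISON when the page is next requested.
 */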
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.getattr	= kvm_gmem_getattr,
	.setattr	= kvm_gmem_setattr,
};

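/*
 * Allocate a kvm_gmem instance and an anonymous inode/file of @size bytes,
 * mark its page cache unmovable and unevictable, take a reference on @kvm,
 * and return the installed fd (or a negative errno on failure).
 */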
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_unmovable(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

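/*
 * Handle KVM_CREATE_GUEST_MEMFD: validate the arguments (no flags are
 * currently supported, the size must be positive and page-aligned) and create
 * the guest_memfd.
 *
 * Illustrative userspace sketch, not kernel code; ioctl, struct and flag
 * names are assumed to follow the guest_memfd UAPI in <linux/kvm.h>, and
 * vm_fd/gpa/mem_size/shared_mem are placeholder variables:
 *
 *	struct kvm_create_guest_memfd gmem = { .size = mem_size };
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot			= 0,
 *		.flags			= KVM_MEM_GUEST_MEMFD,
 *		.guest_phys_addr	= gpa,
 *		.memory_size		= mem_size,
 *		.userspace_addr		= (__u64)(unsigned long)shared_mem,
 *		.guest_memfd		= gmem_fd,
 *		.guest_memfd_offset	= 0,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 */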
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

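/*
 * Bind @slot to [@offset, @offset + slot size) of the guest_memfd referenced
 * by @fd.  Fails if the fd is not a gmem file, belongs to a different VM, the
 * range falls outside the file, or it overlaps an existing binding.  The file
 * reference is dropped even on success; see the comment at the end of the
 * function.
 */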
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file; there is no need for them
	 * to go away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

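/*
 * Drop @slot's binding from its guest_memfd, if the backing file is still
 * live.  If the file has already been (or is being) closed,
 * kvm_gmem_release() takes care of invalidating the binding.
 */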
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}

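/*
 * Resolve @gfn in @slot to a guest_memfd-backed pfn, allocating and zeroing
 * the folio on first use.  @max_order is currently always 0 as huge pages are
 * not yet supported.  Returns 0 on success, -EFAULT if the file is gone, -EIO
 * if the binding does not match the slot, -ENOMEM if the folio cannot be
 * allocated, or -EHWPOISON if the folio has been poisoned.
 */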
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem;
	struct folio *folio;
	struct page *page;
	struct file *file;
	int r;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	gmem = file->private_data;

	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
		r = -EIO;
		goto out_fput;
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (!folio) {
		r = -ENOMEM;
		goto out_fput;
	}

	if (folio_test_hwpoison(folio)) {
		r = -EHWPOISON;
		goto out_unlock;
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

	r = 0;

out_unlock:
	folio_unlock(folio);
out_fput:
	fput(file);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);