1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 *  fs/userfaultfd.c
4 *
5 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
6 *  Copyright (C) 2008-2009 Red Hat, Inc.
7 *  Copyright (C) 2015  Red Hat, Inc.
8 *
9 *  Some part derived from fs/eventfd.c (anon inode setup) and
10 *  mm/ksm.c (mm hashing).
11 */
12
13#include <linux/list.h>
14#include <linux/hashtable.h>
15#include <linux/sched/signal.h>
16#include <linux/sched/mm.h>
17#include <linux/mm.h>
18#include <linux/mm_inline.h>
19#include <linux/mmu_notifier.h>
20#include <linux/poll.h>
21#include <linux/slab.h>
22#include <linux/seq_file.h>
23#include <linux/file.h>
24#include <linux/bug.h>
25#include <linux/anon_inodes.h>
26#include <linux/syscalls.h>
27#include <linux/userfaultfd_k.h>
28#include <linux/mempolicy.h>
29#include <linux/ioctl.h>
30#include <linux/security.h>
31#include <linux/hugetlb.h>
32#include <linux/swapops.h>
33#include <linux/miscdevice.h>
34
35static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37#ifdef CONFIG_SYSCTL
38static struct ctl_table vm_userfaultfd_table[] = {
39	{
40		.procname	= "unprivileged_userfaultfd",
41		.data		= &sysctl_unprivileged_userfaultfd,
42		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
43		.mode		= 0644,
44		.proc_handler	= proc_dointvec_minmax,
45		.extra1		= SYSCTL_ZERO,
46		.extra2		= SYSCTL_ONE,
47	},
48};
49#endif
50
51static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
52
53struct userfaultfd_fork_ctx {
54	struct userfaultfd_ctx *orig;
55	struct userfaultfd_ctx *new;
56	struct list_head list;
57};
58
59struct userfaultfd_unmap_ctx {
60	struct userfaultfd_ctx *ctx;
61	unsigned long start;
62	unsigned long end;
63	struct list_head list;
64};
65
66struct userfaultfd_wait_queue {
67	struct uffd_msg msg;
68	wait_queue_entry_t wq;
69	struct userfaultfd_ctx *ctx;
70	bool waken;
71};
72
73struct userfaultfd_wake_range {
74	unsigned long start;
75	unsigned long len;
76};
77
78/* internal indication that UFFD_API ioctl was successfully executed */
79#define UFFD_FEATURE_INITIALIZED		(1u << 31)
80
81static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
82{
83	return ctx->features & UFFD_FEATURE_INITIALIZED;
84}
85
86static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
87{
88	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
89}
90
91/*
92 * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
93 * meaningful when userfaultfd_wp()==true on the vma and when it's
94 * anonymous.
95 */
96bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
97{
98	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
99
100	if (!ctx)
101		return false;
102
103	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
104}
105
106static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
107				     vm_flags_t flags)
108{
109	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
110
111	vm_flags_reset(vma, flags);
112	/*
113	 * For shared mappings, we want to enable writenotify while
114	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
115	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
116	 */
117	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
118		vma_set_page_prot(vma);
119}
120
121static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
122				     int wake_flags, void *key)
123{
124	struct userfaultfd_wake_range *range = key;
125	int ret;
126	struct userfaultfd_wait_queue *uwq;
127	unsigned long start, len;
128
129	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
130	ret = 0;
131	/* len == 0 means wake all */
132	start = range->start;
133	len = range->len;
134	if (len && (start > uwq->msg.arg.pagefault.address ||
135		    start + len <= uwq->msg.arg.pagefault.address))
136		goto out;
137	WRITE_ONCE(uwq->waken, true);
138	/*
139	 * The Program-Order guarantees provided by the scheduler
140	 * ensure uwq->waken is visible before the task is woken.
141	 */
142	ret = wake_up_state(wq->private, mode);
143	if (ret) {
144		/*
145		 * Wake only once, autoremove behavior.
146		 *
147		 * After the effect of list_del_init is visible to the other
148		 * CPUs, the waitqueue may disappear from under us, see the
149		 * !list_empty_careful() in handle_userfault().
150		 *
151		 * try_to_wake_up() has an implicit smp_mb(), and the
152		 * wq->private is read before calling the extern function
153		 * "wake_up_state" (which in turn calls try_to_wake_up).
154		 */
155		list_del_init(&wq->entry);
156	}
157out:
158	return ret;
159}
160
161/**
162 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
163 * context.
164 * @ctx: [in] Pointer to the userfaultfd context.
165 */
166static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
167{
168	refcount_inc(&ctx->refcount);
169}
170
171/**
172 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
173 * context.
174 * @ctx: [in] Pointer to userfaultfd context.
175 *
176 * The userfaultfd context reference must have been previously acquired either
177 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
178 */
179static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
180{
181	if (refcount_dec_and_test(&ctx->refcount)) {
182		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
183		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
184		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
185		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
186		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
187		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
188		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
189		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
190		mmdrop(ctx->mm);
191		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
192	}
193}
194
195static inline void msg_init(struct uffd_msg *msg)
196{
197	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
198	/*
199	 * Must use memset to zero out the padding or kernel data is
200	 * leaked to userland.
201	 */
202	memset(msg, 0, sizeof(struct uffd_msg));
203}
204
205static inline struct uffd_msg userfault_msg(unsigned long address,
206					    unsigned long real_address,
207					    unsigned int flags,
208					    unsigned long reason,
209					    unsigned int features)
210{
211	struct uffd_msg msg;
212
213	msg_init(&msg);
214	msg.event = UFFD_EVENT_PAGEFAULT;
215
216	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
217				    real_address : address;
218
219	/*
220	 * These flags indicate why the userfault occurred:
221	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
222	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
223	 * - Neither of these flags being set indicates a MISSING fault.
224	 *
225	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
226	 * fault. Otherwise, it was a read fault.
227	 */
228	if (flags & FAULT_FLAG_WRITE)
229		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
230	if (reason & VM_UFFD_WP)
231		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
232	if (reason & VM_UFFD_MINOR)
233		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
234	if (features & UFFD_FEATURE_THREAD_ID)
235		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
236	return msg;
237}
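
/*
 * Illustrative userspace sketch (not part of this kernel file): how a
 * manager would typically decode the message built above.  Names come
 * from the <linux/userfaultfd.h> UAPI; the resolve_*_fault() helpers are
 * hypothetical placeholders and error handling is omitted.
 *
 *	struct uffd_msg msg;
 *
 *	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
 *		return;
 *	if (msg.event != UFFD_EVENT_PAGEFAULT)
 *		return;
 *	if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)
 *		resolve_wp_fault(msg.arg.pagefault.address);
 *	else if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR)
 *		resolve_minor_fault(msg.arg.pagefault.address);
 *	else
 *		resolve_missing_fault(msg.arg.pagefault.address);
 *
 * The placeholders would normally issue UFFDIO_WRITEPROTECT (wp=false),
 * UFFDIO_CONTINUE and UFFDIO_COPY/UFFDIO_ZEROPAGE respectively.
 */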
238
239#ifdef CONFIG_HUGETLB_PAGE
240/*
241 * Same functionality as userfaultfd_must_wait below with modifications for
242 * hugepmd ranges.
243 */
244static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
245					      struct vm_fault *vmf,
246					      unsigned long reason)
247{
248	struct vm_area_struct *vma = vmf->vma;
249	pte_t *ptep, pte;
250	bool ret = true;
251
252	assert_fault_locked(vmf);
253
254	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
255	if (!ptep)
256		goto out;
257
258	ret = false;
259	pte = huge_ptep_get(ptep);
260
261	/*
262	 * Lockless access: we're in a wait_event so it's ok if it
263	 * changes under us.  PTE markers should be handled the same as none
264	 * ptes here.
265	 */
266	if (huge_pte_none_mostly(pte))
267		ret = true;
268	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
269		ret = true;
270out:
271	return ret;
272}
273#else
274static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
275					      struct vm_fault *vmf,
276					      unsigned long reason)
277{
278	return false;	/* should never get here */
279}
280#endif /* CONFIG_HUGETLB_PAGE */
281
282/*
283 * Verify the pagetables are still not ok after having registered into
284 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
285 * userfault that has already been resolved, if userfaultfd_read and
286 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
287 * threads.
288 */
289static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
290					 struct vm_fault *vmf,
291					 unsigned long reason)
292{
293	struct mm_struct *mm = ctx->mm;
294	unsigned long address = vmf->address;
295	pgd_t *pgd;
296	p4d_t *p4d;
297	pud_t *pud;
298	pmd_t *pmd, _pmd;
299	pte_t *pte;
300	pte_t ptent;
301	bool ret = true;
302
303	assert_fault_locked(vmf);
304
305	pgd = pgd_offset(mm, address);
306	if (!pgd_present(*pgd))
307		goto out;
308	p4d = p4d_offset(pgd, address);
309	if (!p4d_present(*p4d))
310		goto out;
311	pud = pud_offset(p4d, address);
312	if (!pud_present(*pud))
313		goto out;
314	pmd = pmd_offset(pud, address);
315again:
316	_pmd = pmdp_get_lockless(pmd);
317	if (pmd_none(_pmd))
318		goto out;
319
320	ret = false;
321	if (!pmd_present(_pmd) || pmd_devmap(_pmd))
322		goto out;
323
324	if (pmd_trans_huge(_pmd)) {
325		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
326			ret = true;
327		goto out;
328	}
329
330	pte = pte_offset_map(pmd, address);
331	if (!pte) {
332		ret = true;
333		goto again;
334	}
335	/*
336	 * Lockless access: we're in a wait_event so it's ok if it
337	 * changes under us.  PTE markers should be handled the same as none
338	 * ptes here.
339	 */
340	ptent = ptep_get(pte);
341	if (pte_none_mostly(ptent))
342		ret = true;
343	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
344		ret = true;
345	pte_unmap(pte);
346
347out:
348	return ret;
349}
350
351static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
352{
353	if (flags & FAULT_FLAG_INTERRUPTIBLE)
354		return TASK_INTERRUPTIBLE;
355
356	if (flags & FAULT_FLAG_KILLABLE)
357		return TASK_KILLABLE;
358
359	return TASK_UNINTERRUPTIBLE;
360}
361
362/*
363 * The locking rules involved in returning VM_FAULT_RETRY depending on
364 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
365 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
366 * recommendation in __lock_page_or_retry is not an understatement.
367 *
368 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
369 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
370 * not set.
371 *
372 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
373 * set, VM_FAULT_RETRY can still be returned if and only if there are
374 * fatal_signal_pending()s, and the mmap_lock must be released before
375 * returning it.
376 */
377vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
378{
379	struct vm_area_struct *vma = vmf->vma;
380	struct mm_struct *mm = vma->vm_mm;
381	struct userfaultfd_ctx *ctx;
382	struct userfaultfd_wait_queue uwq;
383	vm_fault_t ret = VM_FAULT_SIGBUS;
384	bool must_wait;
385	unsigned int blocking_state;
386
387	/*
388	 * We don't do userfault handling for the final child pid update.
389	 *
390	 * We also don't do userfault handling during
391	 * coredumping. hugetlbfs has the special
392	 * hugetlb_follow_page_mask() to skip missing pages in the
393	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
394	 * the no_page_table() helper in follow_page_mask(), but the
395	 * shmem_vm_ops->fault method is invoked even during
396	 * coredumping and it ends up here.
397	 */
398	if (current->flags & (PF_EXITING|PF_DUMPCORE))
399		goto out;
400
401	assert_fault_locked(vmf);
402
403	ctx = vma->vm_userfaultfd_ctx.ctx;
404	if (!ctx)
405		goto out;
406
407	BUG_ON(ctx->mm != mm);
408
409	/* Any unrecognized flag is a bug. */
410	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
411	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
412	VM_BUG_ON(!reason || (reason & (reason - 1)));
413
414	if (ctx->features & UFFD_FEATURE_SIGBUS)
415		goto out;
416	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
417		goto out;
418
419	/*
420	 * If it's already released don't get it. This avoids looping
421	 * in __get_user_pages if userfaultfd_release waits on the
422	 * caller of handle_userfault to release the mmap_lock.
423	 */
424	if (unlikely(READ_ONCE(ctx->released))) {
425		/*
426		 * Don't return VM_FAULT_SIGBUS in this case, so a non
427		 * cooperative manager can close the uffd after the
428		 * last UFFDIO_COPY, without risking triggering an
429		 * involuntary SIGBUS if the process was starting the
430		 * userfaultfd while the userfaultfd was still armed
431		 * (but after the last UFFDIO_COPY). If the uffd
432		 * wasn't already closed when the userfault reached
433		 * this point, that would normally be solved by
434		 * userfaultfd_must_wait returning 'false'.
435		 *
436		 * If we were to return VM_FAULT_SIGBUS here, the non
437		 * cooperative manager would be instead forced to
438		 * always call UFFDIO_UNREGISTER before it can safely
439		 * close the uffd.
440		 */
441		ret = VM_FAULT_NOPAGE;
442		goto out;
443	}
444
445	/*
446	 * Check that we can return VM_FAULT_RETRY.
447	 *
448	 * NOTE: it should become possible to return VM_FAULT_RETRY
449	 * even if FAULT_FLAG_TRIED is set without leading to gup()
450	 * -EBUSY failures, if the userfaultfd is to be extended for
451	 * VM_UFFD_WP tracking and we intend to arm the userfault
452	 * without first stopping userland access to the memory. For
453	 * VM_UFFD_MISSING userfaults this is enough for now.
454	 */
455	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
456		/*
457		 * Validate the invariant that nowait must allow retry
458		 * to be sure not to return SIGBUS erroneously on
459		 * nowait invocations.
460		 */
461		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
462#ifdef CONFIG_DEBUG_VM
463		if (printk_ratelimit()) {
464			printk(KERN_WARNING
465			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
466			       vmf->flags);
467			dump_stack();
468		}
469#endif
470		goto out;
471	}
472
473	/*
474	 * Handle nowait, not much to do other than tell it to retry
475	 * and wait.
476	 */
477	ret = VM_FAULT_RETRY;
478	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
479		goto out;
480
481	/* take the reference before dropping the mmap_lock */
482	userfaultfd_ctx_get(ctx);
483
484	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
485	uwq.wq.private = current;
486	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
487				reason, ctx->features);
488	uwq.ctx = ctx;
489	uwq.waken = false;
490
491	blocking_state = userfaultfd_get_blocking_state(vmf->flags);
492
493	/*
494	 * Take the vma lock now, in order to safely call
495	 * userfaultfd_huge_must_wait() later. Since acquiring the
496	 * (sleepable) vma lock can modify the current task state, that
497	 * must be before explicitly calling set_current_state().
498	 */
499	if (is_vm_hugetlb_page(vma))
500		hugetlb_vma_lock_read(vma);
501
502	spin_lock_irq(&ctx->fault_pending_wqh.lock);
503	/*
504	 * After the __add_wait_queue the uwq is visible to userland
505	 * through poll/read().
506	 */
507	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
508	/*
509	 * The smp_mb() after __set_current_state prevents the reads
510	 * following the spin_unlock to happen before the list_add in
511	 * __add_wait_queue.
512	 */
513	set_current_state(blocking_state);
514	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
515
516	if (!is_vm_hugetlb_page(vma))
517		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
518	else
519		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
520	if (is_vm_hugetlb_page(vma))
521		hugetlb_vma_unlock_read(vma);
522	release_fault_lock(vmf);
523
524	if (likely(must_wait && !READ_ONCE(ctx->released))) {
525		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
526		schedule();
527	}
528
529	__set_current_state(TASK_RUNNING);
530
531	/*
532	 * Here we race with the list_del; list_add in
533	 * userfaultfd_ctx_read(), however because we don't ever run
534	 * list_del_init() to refile across the two lists, the prev
535	 * and next pointers will never point to self. list_add also
536	 * would never let either of the two pointers point to
537	 * self. So list_empty_careful won't risk seeing both pointers
538	 * pointing to self at any time during the list refile. The
539	 * only case where list_del_init() is called is the full
540	 * removal in the wake function and there we don't re-list_add
541	 * and it's fine not to block on the spinlock. The uwq on this
542	 * kernel stack can be released after the list_del_init.
543	 */
544	if (!list_empty_careful(&uwq.wq.entry)) {
545		spin_lock_irq(&ctx->fault_pending_wqh.lock);
546		/*
547		 * No need of list_del_init(), the uwq on the stack
548		 * will be freed shortly anyway.
549		 */
550		list_del(&uwq.wq.entry);
551		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
552	}
553
554	/*
555	 * ctx may go away after this if the userfault pseudo fd is
556	 * already released.
557	 */
558	userfaultfd_ctx_put(ctx);
559
560out:
561	return ret;
562}
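
/*
 * Illustrative lifecycle sketch (not part of this kernel file): the
 * faulting thread sleeps in handle_userfault() above while a manager
 * thread resolves the fault.  Assumes a range already registered with
 * UFFDIO_REGISTER_MODE_MISSING; error handling is omitted.
 *
 *	faulting thread                     manager thread
 *	touches a missing page
 *	  -> handle_userfault()
 *	     queues uwq, schedule()         read(uffd, &msg, sizeof(msg))
 *	                                    ioctl(uffd, UFFDIO_COPY, &copy)
 *	woken via wake_userfault()            -> fills the page, wakes waiter
 *	retries the fault, now resolved
 */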
563
564static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
565					      struct userfaultfd_wait_queue *ewq)
566{
567	struct userfaultfd_ctx *release_new_ctx;
568
569	if (WARN_ON_ONCE(current->flags & PF_EXITING))
570		goto out;
571
572	ewq->ctx = ctx;
573	init_waitqueue_entry(&ewq->wq, current);
574	release_new_ctx = NULL;
575
576	spin_lock_irq(&ctx->event_wqh.lock);
577	/*
578	 * After the __add_wait_queue the uwq is visible to userland
579	 * through poll/read().
580	 */
581	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
582	for (;;) {
583		set_current_state(TASK_KILLABLE);
584		if (ewq->msg.event == 0)
585			break;
586		if (READ_ONCE(ctx->released) ||
587		    fatal_signal_pending(current)) {
588			/*
589			 * &ewq->wq may be queued in fork_event, but
590			 * __remove_wait_queue ignores the head
591			 * parameter. It would be a problem if it
592			 * didn't.
593			 */
594			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
595			if (ewq->msg.event == UFFD_EVENT_FORK) {
596				struct userfaultfd_ctx *new;
597
598				new = (struct userfaultfd_ctx *)
599					(unsigned long)
600					ewq->msg.arg.reserved.reserved1;
601				release_new_ctx = new;
602			}
603			break;
604		}
605
606		spin_unlock_irq(&ctx->event_wqh.lock);
607
608		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
609		schedule();
610
611		spin_lock_irq(&ctx->event_wqh.lock);
612	}
613	__set_current_state(TASK_RUNNING);
614	spin_unlock_irq(&ctx->event_wqh.lock);
615
616	if (release_new_ctx) {
617		struct vm_area_struct *vma;
618		struct mm_struct *mm = release_new_ctx->mm;
619		VMA_ITERATOR(vmi, mm, 0);
620
621		/* the various vma->vm_userfaultfd_ctx still points to it */
622		mmap_write_lock(mm);
623		for_each_vma(vmi, vma) {
624			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
625				vma_start_write(vma);
626				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
627				userfaultfd_set_vm_flags(vma,
628							 vma->vm_flags & ~__VM_UFFD_FLAGS);
629			}
630		}
631		mmap_write_unlock(mm);
632
633		userfaultfd_ctx_put(release_new_ctx);
634	}
635
636	/*
637	 * ctx may go away after this if the userfault pseudo fd is
638	 * already released.
639	 */
640out:
641	atomic_dec(&ctx->mmap_changing);
642	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
643	userfaultfd_ctx_put(ctx);
644}
645
646static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
647				       struct userfaultfd_wait_queue *ewq)
648{
649	ewq->msg.event = 0;
650	wake_up_locked(&ctx->event_wqh);
651	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
652}
653
654int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
655{
656	struct userfaultfd_ctx *ctx = NULL, *octx;
657	struct userfaultfd_fork_ctx *fctx;
658
659	octx = vma->vm_userfaultfd_ctx.ctx;
660	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
661		vma_start_write(vma);
662		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
663		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
664		return 0;
665	}
666
667	list_for_each_entry(fctx, fcs, list)
668		if (fctx->orig == octx) {
669			ctx = fctx->new;
670			break;
671		}
672
673	if (!ctx) {
674		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
675		if (!fctx)
676			return -ENOMEM;
677
678		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
679		if (!ctx) {
680			kfree(fctx);
681			return -ENOMEM;
682		}
683
684		refcount_set(&ctx->refcount, 1);
685		ctx->flags = octx->flags;
686		ctx->features = octx->features;
687		ctx->released = false;
688		init_rwsem(&ctx->map_changing_lock);
689		atomic_set(&ctx->mmap_changing, 0);
690		ctx->mm = vma->vm_mm;
691		mmgrab(ctx->mm);
692
693		userfaultfd_ctx_get(octx);
694		down_write(&octx->map_changing_lock);
695		atomic_inc(&octx->mmap_changing);
696		up_write(&octx->map_changing_lock);
697		fctx->orig = octx;
698		fctx->new = ctx;
699		list_add_tail(&fctx->list, fcs);
700	}
701
702	vma->vm_userfaultfd_ctx.ctx = ctx;
703	return 0;
704}
705
706static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
707{
708	struct userfaultfd_ctx *ctx = fctx->orig;
709	struct userfaultfd_wait_queue ewq;
710
711	msg_init(&ewq.msg);
712
713	ewq.msg.event = UFFD_EVENT_FORK;
714	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
715
716	userfaultfd_event_wait_completion(ctx, &ewq);
717}
718
719void dup_userfaultfd_complete(struct list_head *fcs)
720{
721	struct userfaultfd_fork_ctx *fctx, *n;
722
723	list_for_each_entry_safe(fctx, n, fcs, list) {
724		dup_fctx(fctx);
725		list_del(&fctx->list);
726		kfree(fctx);
727	}
728}
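
/*
 * Illustrative userspace sketch (not part of this kernel file): with
 * UFFD_FEATURE_EVENT_FORK negotiated, the fork of a monitored process is
 * reported as a message carrying a new userfaultfd for the child, set up
 * by resolve_userfault_fork() further below.  add_to_epoll_set() is a
 * hypothetical helper; error handling is omitted.
 *
 *	struct uffd_msg msg;
 *
 *	read(uffd, &msg, sizeof(msg));
 *	if (msg.event == UFFD_EVENT_FORK) {
 *		int child_uffd = (int)msg.arg.fork.ufd;
 *
 *		add_to_epoll_set(child_uffd);
 *	}
 */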
729
730void mremap_userfaultfd_prep(struct vm_area_struct *vma,
731			     struct vm_userfaultfd_ctx *vm_ctx)
732{
733	struct userfaultfd_ctx *ctx;
734
735	ctx = vma->vm_userfaultfd_ctx.ctx;
736
737	if (!ctx)
738		return;
739
740	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
741		vm_ctx->ctx = ctx;
742		userfaultfd_ctx_get(ctx);
743		down_write(&ctx->map_changing_lock);
744		atomic_inc(&ctx->mmap_changing);
745		up_write(&ctx->map_changing_lock);
746	} else {
747		/* Drop uffd context if remap feature not enabled */
748		vma_start_write(vma);
749		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
750		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
751	}
752}
753
754void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
755				 unsigned long from, unsigned long to,
756				 unsigned long len)
757{
758	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
759	struct userfaultfd_wait_queue ewq;
760
761	if (!ctx)
762		return;
763
764	if (to & ~PAGE_MASK) {
765		userfaultfd_ctx_put(ctx);
766		return;
767	}
768
769	msg_init(&ewq.msg);
770
771	ewq.msg.event = UFFD_EVENT_REMAP;
772	ewq.msg.arg.remap.from = from;
773	ewq.msg.arg.remap.to = to;
774	ewq.msg.arg.remap.len = len;
775
776	userfaultfd_event_wait_completion(ctx, &ewq);
777}
778
779bool userfaultfd_remove(struct vm_area_struct *vma,
780			unsigned long start, unsigned long end)
781{
782	struct mm_struct *mm = vma->vm_mm;
783	struct userfaultfd_ctx *ctx;
784	struct userfaultfd_wait_queue ewq;
785
786	ctx = vma->vm_userfaultfd_ctx.ctx;
787	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
788		return true;
789
790	userfaultfd_ctx_get(ctx);
791	down_write(&ctx->map_changing_lock);
792	atomic_inc(&ctx->mmap_changing);
793	up_write(&ctx->map_changing_lock);
794	mmap_read_unlock(mm);
795
796	msg_init(&ewq.msg);
797
798	ewq.msg.event = UFFD_EVENT_REMOVE;
799	ewq.msg.arg.remove.start = start;
800	ewq.msg.arg.remove.end = end;
801
802	userfaultfd_event_wait_completion(ctx, &ewq);
803
804	return false;
805}
806
807static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
808			  unsigned long start, unsigned long end)
809{
810	struct userfaultfd_unmap_ctx *unmap_ctx;
811
812	list_for_each_entry(unmap_ctx, unmaps, list)
813		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
814		    unmap_ctx->end == end)
815			return true;
816
817	return false;
818}
819
820int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
821			   unsigned long end, struct list_head *unmaps)
822{
823	struct userfaultfd_unmap_ctx *unmap_ctx;
824	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
825
826	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
827	    has_unmap_ctx(ctx, unmaps, start, end))
828		return 0;
829
830	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
831	if (!unmap_ctx)
832		return -ENOMEM;
833
834	userfaultfd_ctx_get(ctx);
835	down_write(&ctx->map_changing_lock);
836	atomic_inc(&ctx->mmap_changing);
837	up_write(&ctx->map_changing_lock);
838	unmap_ctx->ctx = ctx;
839	unmap_ctx->start = start;
840	unmap_ctx->end = end;
841	list_add_tail(&unmap_ctx->list, unmaps);
842
843	return 0;
844}
845
846void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
847{
848	struct userfaultfd_unmap_ctx *ctx, *n;
849	struct userfaultfd_wait_queue ewq;
850
851	list_for_each_entry_safe(ctx, n, uf, list) {
852		msg_init(&ewq.msg);
853
854		ewq.msg.event = UFFD_EVENT_UNMAP;
855		ewq.msg.arg.remove.start = ctx->start;
856		ewq.msg.arg.remove.end = ctx->end;
857
858		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
859
860		list_del(&ctx->list);
861		kfree(ctx);
862	}
863}
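
/*
 * Illustrative userspace sketch (not part of this kernel file): the
 * mremap/madvise/munmap notifications generated above only describe the
 * affected range, so the manager just updates its own bookkeeping.  The
 * track_moved_range() and forget_range() helpers are hypothetical; error
 * handling is omitted.
 *
 *	switch (msg.event) {
 *	case UFFD_EVENT_REMAP:
 *		track_moved_range(msg.arg.remap.from, msg.arg.remap.to,
 *				  msg.arg.remap.len);
 *		break;
 *	case UFFD_EVENT_REMOVE:
 *	case UFFD_EVENT_UNMAP:
 *		forget_range(msg.arg.remove.start, msg.arg.remove.end);
 *		break;
 *	}
 */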
864
865static int userfaultfd_release(struct inode *inode, struct file *file)
866{
867	struct userfaultfd_ctx *ctx = file->private_data;
868	struct mm_struct *mm = ctx->mm;
869	struct vm_area_struct *vma, *prev;
870	/* len == 0 means wake all */
871	struct userfaultfd_wake_range range = { .len = 0, };
872	unsigned long new_flags;
873	VMA_ITERATOR(vmi, mm, 0);
874
875	WRITE_ONCE(ctx->released, true);
876
877	if (!mmget_not_zero(mm))
878		goto wakeup;
879
880	/*
881	 * Flush page faults out of all CPUs. NOTE: all page faults
882	 * must be retried without returning VM_FAULT_SIGBUS if
883	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
884	 * changes while handle_userfault released the mmap_lock. So
885	 * it's critical that released is set to true (above), before
886	 * taking the mmap_lock for writing.
887	 */
888	mmap_write_lock(mm);
889	prev = NULL;
890	for_each_vma(vmi, vma) {
891		cond_resched();
892		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
893		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
894		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
895			prev = vma;
896			continue;
897		}
898		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
899		vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
900					    vma->vm_end, new_flags,
901					    NULL_VM_UFFD_CTX);
902
903		vma_start_write(vma);
904		userfaultfd_set_vm_flags(vma, new_flags);
905		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
906
907		prev = vma;
908	}
909	mmap_write_unlock(mm);
910	mmput(mm);
911wakeup:
912	/*
913	 * After no new page faults can wait on this fault_*wqh, flush
914	 * the last page faults that may have been already waiting on
915	 * the fault_*wqh.
916	 */
917	spin_lock_irq(&ctx->fault_pending_wqh.lock);
918	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
919	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
920	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
921
922	/* Flush pending events that may still wait on event_wqh */
923	wake_up_all(&ctx->event_wqh);
924
925	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
926	userfaultfd_ctx_put(ctx);
927	return 0;
928}
929
930/* fault_pending_wqh.lock must be held by the caller */
931static inline struct userfaultfd_wait_queue *find_userfault_in(
932		wait_queue_head_t *wqh)
933{
934	wait_queue_entry_t *wq;
935	struct userfaultfd_wait_queue *uwq;
936
937	lockdep_assert_held(&wqh->lock);
938
939	uwq = NULL;
940	if (!waitqueue_active(wqh))
941		goto out;
942	/* walk in reverse to provide FIFO behavior to read userfaults */
943	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
944	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
945out:
946	return uwq;
947}
948
949static inline struct userfaultfd_wait_queue *find_userfault(
950		struct userfaultfd_ctx *ctx)
951{
952	return find_userfault_in(&ctx->fault_pending_wqh);
953}
954
955static inline struct userfaultfd_wait_queue *find_userfault_evt(
956		struct userfaultfd_ctx *ctx)
957{
958	return find_userfault_in(&ctx->event_wqh);
959}
960
961static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
962{
963	struct userfaultfd_ctx *ctx = file->private_data;
964	__poll_t ret;
965
966	poll_wait(file, &ctx->fd_wqh, wait);
967
968	if (!userfaultfd_is_initialized(ctx))
969		return EPOLLERR;
970
971	/*
972	 * poll() never guarantees that read won't block.
973	 * userfaults can be woken before they're read().
974	 */
975	if (unlikely(!(file->f_flags & O_NONBLOCK)))
976		return EPOLLERR;
977	/*
978	 * lockless access to see if there are pending faults.
979	 * __pollwait's last action is the add_wait_queue, but
980	 * the spin_unlock would allow the waitqueue_active to
981	 * pass above the actual list_add inside
982	 * add_wait_queue critical section. So use a full
983	 * memory barrier to serialize the list_add write of
984	 * add_wait_queue() with the waitqueue_active read
985	 * below.
986	 */
987	ret = 0;
988	smp_mb();
989	if (waitqueue_active(&ctx->fault_pending_wqh))
990		ret = EPOLLIN;
991	else if (waitqueue_active(&ctx->event_wqh))
992		ret = EPOLLIN;
993
994	return ret;
995}
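
/*
 * Illustrative userspace sketch (not part of this kernel file): as the
 * check above enforces, poll() is only supported when the userfaultfd was
 * opened non-blocking.  handle_message() is a hypothetical helper; error
 * handling is omitted.
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *
 *	... UFFDIO_API handshake and UFFDIO_REGISTER go here ...
 *
 *	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		struct uffd_msg msg;
 *
 *		if (read(uffd, &msg, sizeof(msg)) == sizeof(msg))
 *			handle_message(&msg);
 *	}
 */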
996
997static const struct file_operations userfaultfd_fops;
998
999static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1000				  struct inode *inode,
1001				  struct uffd_msg *msg)
1002{
1003	int fd;
1004
1005	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
1006			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1007	if (fd < 0)
1008		return fd;
1009
1010	msg->arg.reserved.reserved1 = 0;
1011	msg->arg.fork.ufd = fd;
1012	return 0;
1013}
1014
1015static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1016				    struct uffd_msg *msg, struct inode *inode)
1017{
1018	ssize_t ret;
1019	DECLARE_WAITQUEUE(wait, current);
1020	struct userfaultfd_wait_queue *uwq;
1021	/*
1022	 * Handling fork event requires sleeping operations, so
1023	 * we drop the event_wqh lock, then do these ops, then
1024	 * lock it back and wake up the waiter. While the lock is
1025	 * dropped the ewq may go away so we keep track of it
1026	 * carefully.
1027	 */
1028	LIST_HEAD(fork_event);
1029	struct userfaultfd_ctx *fork_nctx = NULL;
1030
1031	/* always take the fd_wqh lock before the fault_pending_wqh lock */
1032	spin_lock_irq(&ctx->fd_wqh.lock);
1033	__add_wait_queue(&ctx->fd_wqh, &wait);
1034	for (;;) {
1035		set_current_state(TASK_INTERRUPTIBLE);
1036		spin_lock(&ctx->fault_pending_wqh.lock);
1037		uwq = find_userfault(ctx);
1038		if (uwq) {
1039			/*
1040			 * Use a seqcount to repeat the lockless check
1041			 * in wake_userfault() to avoid missing
1042			 * wakeups because during the refile both
1043			 * waitqueues could become empty if this is the
1044			 * only userfault.
1045			 */
1046			write_seqcount_begin(&ctx->refile_seq);
1047
1048			/*
1049			 * The fault_pending_wqh.lock prevents the uwq
1050			 * from disappearing from under us.
1051			 *
1052			 * Refile this userfault from
1053			 * fault_pending_wqh to fault_wqh, it's not
1054			 * pending anymore after we read it.
1055			 *
1056			 * Use list_del() by hand (as
1057			 * userfaultfd_wake_function also uses
1058			 * list_del_init() by hand) to be sure nobody
1059			 * changes __remove_wait_queue() to use
1060			 * list_del_init() in turn breaking the
1061			 * !list_empty_careful() check in
1062			 * handle_userfault(). The uwq->wq.head list
1063			 * must never be empty at any time during the
1064			 * refile, or the waitqueue could disappear
1065			 * from under us. The "wait_queue_head_t"
1066			 * parameter of __remove_wait_queue() is unused
1067			 * anyway.
1068			 */
1069			list_del(&uwq->wq.entry);
1070			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
1071
1072			write_seqcount_end(&ctx->refile_seq);
1073
1074			/* careful to always initialize msg if ret == 0 */
1075			*msg = uwq->msg;
1076			spin_unlock(&ctx->fault_pending_wqh.lock);
1077			ret = 0;
1078			break;
1079		}
1080		spin_unlock(&ctx->fault_pending_wqh.lock);
1081
1082		spin_lock(&ctx->event_wqh.lock);
1083		uwq = find_userfault_evt(ctx);
1084		if (uwq) {
1085			*msg = uwq->msg;
1086
1087			if (uwq->msg.event == UFFD_EVENT_FORK) {
1088				fork_nctx = (struct userfaultfd_ctx *)
1089					(unsigned long)
1090					uwq->msg.arg.reserved.reserved1;
1091				list_move(&uwq->wq.entry, &fork_event);
1092				/*
1093				 * fork_nctx can be freed as soon as
1094				 * we drop the lock, unless we take a
1095				 * reference on it.
1096				 */
1097				userfaultfd_ctx_get(fork_nctx);
1098				spin_unlock(&ctx->event_wqh.lock);
1099				ret = 0;
1100				break;
1101			}
1102
1103			userfaultfd_event_complete(ctx, uwq);
1104			spin_unlock(&ctx->event_wqh.lock);
1105			ret = 0;
1106			break;
1107		}
1108		spin_unlock(&ctx->event_wqh.lock);
1109
1110		if (signal_pending(current)) {
1111			ret = -ERESTARTSYS;
1112			break;
1113		}
1114		if (no_wait) {
1115			ret = -EAGAIN;
1116			break;
1117		}
1118		spin_unlock_irq(&ctx->fd_wqh.lock);
1119		schedule();
1120		spin_lock_irq(&ctx->fd_wqh.lock);
1121	}
1122	__remove_wait_queue(&ctx->fd_wqh, &wait);
1123	__set_current_state(TASK_RUNNING);
1124	spin_unlock_irq(&ctx->fd_wqh.lock);
1125
1126	if (!ret && msg->event == UFFD_EVENT_FORK) {
1127		ret = resolve_userfault_fork(fork_nctx, inode, msg);
1128		spin_lock_irq(&ctx->event_wqh.lock);
1129		if (!list_empty(&fork_event)) {
1130			/*
1131			 * The fork thread didn't abort, so we can
1132			 * drop the temporary refcount.
1133			 */
1134			userfaultfd_ctx_put(fork_nctx);
1135
1136			uwq = list_first_entry(&fork_event,
1137					       typeof(*uwq),
1138					       wq.entry);
1139			/*
1140			 * If fork_event list wasn't empty and in turn
1141			 * the event wasn't already released by fork
1142			 * (the event is allocated on fork kernel
1143			 * stack), put the event back to its place in
1144			 * the event_wq. fork_event head will be freed
1145			 * the event_wqh. fork_event head will be freed
1146			 * stay queued there no matter the current
1147			 * "ret" value.
1148			 */
1149			list_del(&uwq->wq.entry);
1150			__add_wait_queue(&ctx->event_wqh, &uwq->wq);
1151
1152			/*
1153			 * Leave the event in the waitqueue and report
1154			 * error to userland if we failed to resolve
1155			 * the userfault fork.
1156			 */
1157			if (likely(!ret))
1158				userfaultfd_event_complete(ctx, uwq);
1159		} else {
1160			/*
1161			 * Here the fork thread aborted and the
1162			 * refcount from the fork thread on fork_nctx
1163			 * has already been released. We still hold
1164			 * the reference we took before releasing the
1165			 * lock above. If resolve_userfault_fork
1166			 * failed we've to drop it because the
1167			 * fork_nctx has to be freed in such case. If
1168			 * it succeeded we'll hold it because the new
1169			 * uffd references it.
1170			 */
1171			if (ret)
1172				userfaultfd_ctx_put(fork_nctx);
1173		}
1174		spin_unlock_irq(&ctx->event_wqh.lock);
1175	}
1176
1177	return ret;
1178}
1179
1180static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1181				size_t count, loff_t *ppos)
1182{
1183	struct userfaultfd_ctx *ctx = file->private_data;
1184	ssize_t _ret, ret = 0;
1185	struct uffd_msg msg;
1186	int no_wait = file->f_flags & O_NONBLOCK;
1187	struct inode *inode = file_inode(file);
1188
1189	if (!userfaultfd_is_initialized(ctx))
1190		return -EINVAL;
1191
1192	for (;;) {
1193		if (count < sizeof(msg))
1194			return ret ? ret : -EINVAL;
1195		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1196		if (_ret < 0)
1197			return ret ? ret : _ret;
1198		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
1199			return ret ? ret : -EFAULT;
1200		ret += sizeof(msg);
1201		buf += sizeof(msg);
1202		count -= sizeof(msg);
1203		/*
1204		 * Allow reading more than one fault at a time but only
1205		 * block if waiting for the very first one.
1206		 */
1207		no_wait = O_NONBLOCK;
1208	}
1209}
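
/*
 * Illustrative userspace sketch (not part of this kernel file): as the
 * loop above allows, a single read() can drain several messages when the
 * buffer holds more than one struct uffd_msg.  handle_message() is a
 * hypothetical helper; error handling is omitted.
 *
 *	struct uffd_msg msgs[16];
 *	ssize_t n = read(uffd, msgs, sizeof(msgs));
 *	int i;
 *
 *	for (i = 0; n > 0 && i < n / (int)sizeof(msgs[0]); i++)
 *		handle_message(&msgs[i]);
 */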
1210
1211static void __wake_userfault(struct userfaultfd_ctx *ctx,
1212			     struct userfaultfd_wake_range *range)
1213{
1214	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1215	/* wake all in the range and autoremove */
1216	if (waitqueue_active(&ctx->fault_pending_wqh))
1217		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1218				     range);
1219	if (waitqueue_active(&ctx->fault_wqh))
1220		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1221	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1222}
1223
1224static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1225					   struct userfaultfd_wake_range *range)
1226{
1227	unsigned seq;
1228	bool need_wakeup;
1229
1230	/*
1231	 * To be sure waitqueue_active() is not reordered by the CPU
1232	 * before the pagetable update, use an explicit SMP memory
1233	 * barrier here. PT lock release or mmap_read_unlock(mm) still
1234	 * have release semantics that can allow the
1235	 * waitqueue_active() to be reordered before the pte update.
1236	 */
1237	smp_mb();
1238
1239	/*
1240	 * Use waitqueue_active because it's very frequent to
1241	 * change the address space atomically even if there are no
1242	 * userfaults yet. So we take the spinlock only when we're
1243	 * sure we've userfaults to wake.
1244	 */
1245	do {
1246		seq = read_seqcount_begin(&ctx->refile_seq);
1247		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1248			waitqueue_active(&ctx->fault_wqh);
1249		cond_resched();
1250	} while (read_seqcount_retry(&ctx->refile_seq, seq));
1251	if (need_wakeup)
1252		__wake_userfault(ctx, range);
1253}
1254
1255static __always_inline int validate_unaligned_range(
1256	struct mm_struct *mm, __u64 start, __u64 len)
1257{
1258	__u64 task_size = mm->task_size;
1259
1260	if (len & ~PAGE_MASK)
1261		return -EINVAL;
1262	if (!len)
1263		return -EINVAL;
1264	if (start < mmap_min_addr)
1265		return -EINVAL;
1266	if (start >= task_size)
1267		return -EINVAL;
1268	if (len > task_size - start)
1269		return -EINVAL;
1270	if (start + len <= start)
1271		return -EINVAL;
1272	return 0;
1273}
1274
1275static __always_inline int validate_range(struct mm_struct *mm,
1276					  __u64 start, __u64 len)
1277{
1278	if (start & ~PAGE_MASK)
1279		return -EINVAL;
1280
1281	return validate_unaligned_range(mm, start, len);
1282}
1283
1284static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1285				unsigned long arg)
1286{
1287	struct mm_struct *mm = ctx->mm;
1288	struct vm_area_struct *vma, *prev, *cur;
1289	int ret;
1290	struct uffdio_register uffdio_register;
1291	struct uffdio_register __user *user_uffdio_register;
1292	unsigned long vm_flags, new_flags;
1293	bool found;
1294	bool basic_ioctls;
1295	unsigned long start, end, vma_end;
1296	struct vma_iterator vmi;
1297	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1298
1299	user_uffdio_register = (struct uffdio_register __user *) arg;
1300
1301	ret = -EFAULT;
1302	if (copy_from_user(&uffdio_register, user_uffdio_register,
1303			   sizeof(uffdio_register)-sizeof(__u64)))
1304		goto out;
1305
1306	ret = -EINVAL;
1307	if (!uffdio_register.mode)
1308		goto out;
1309	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1310		goto out;
1311	vm_flags = 0;
1312	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1313		vm_flags |= VM_UFFD_MISSING;
1314	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1315#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1316		goto out;
1317#endif
1318		vm_flags |= VM_UFFD_WP;
1319	}
1320	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1321#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1322		goto out;
1323#endif
1324		vm_flags |= VM_UFFD_MINOR;
1325	}
1326
1327	ret = validate_range(mm, uffdio_register.range.start,
1328			     uffdio_register.range.len);
1329	if (ret)
1330		goto out;
1331
1332	start = uffdio_register.range.start;
1333	end = start + uffdio_register.range.len;
1334
1335	ret = -ENOMEM;
1336	if (!mmget_not_zero(mm))
1337		goto out;
1338
1339	ret = -EINVAL;
1340	mmap_write_lock(mm);
1341	vma_iter_init(&vmi, mm, start);
1342	vma = vma_find(&vmi, end);
1343	if (!vma)
1344		goto out_unlock;
1345
1346	/*
1347	 * If the first vma contains huge pages, make sure start address
1348	 * is aligned to huge page size.
1349	 */
1350	if (is_vm_hugetlb_page(vma)) {
1351		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1352
1353		if (start & (vma_hpagesize - 1))
1354			goto out_unlock;
1355	}
1356
1357	/*
1358	 * Search for incompatible vmas.
1359	 */
1360	found = false;
1361	basic_ioctls = false;
1362	cur = vma;
1363	do {
1364		cond_resched();
1365
1366		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1367		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
1368
1369		/* check incompatible vmas */
1370		ret = -EINVAL;
1371		if (!vma_can_userfault(cur, vm_flags, wp_async))
1372			goto out_unlock;
1373
1374		/*
1375		 * UFFDIO_COPY will fill file holes even without
1376		 * PROT_WRITE. This check enforces that if this is a
1377		 * MAP_SHARED, the process has write permission to the backing
1378		 * file. If VM_MAYWRITE is set it also enforces that on a
1379		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1380		 * F_WRITE_SEAL can be taken until the vma is destroyed.
1381		 */
1382		ret = -EPERM;
1383		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1384			goto out_unlock;
1385
1386		/*
1387		 * If this vma contains ending address, and huge pages
1388		 * check alignment.
1389		 */
1390		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1391		    end > cur->vm_start) {
1392			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1393
1394			ret = -EINVAL;
1395
1396			if (end & (vma_hpagesize - 1))
1397				goto out_unlock;
1398		}
1399		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1400			goto out_unlock;
1401
1402		/*
1403		 * Check that this vma isn't already owned by a
1404		 * different userfaultfd. We can't allow more than one
1405		 * userfaultfd to own a single vma simultaneously or we
1406		 * wouldn't know which one to deliver the userfaults to.
1407		 */
1408		ret = -EBUSY;
1409		if (cur->vm_userfaultfd_ctx.ctx &&
1410		    cur->vm_userfaultfd_ctx.ctx != ctx)
1411			goto out_unlock;
1412
1413		/*
1414		 * Note vmas containing huge pages
1415		 */
1416		if (is_vm_hugetlb_page(cur))
1417			basic_ioctls = true;
1418
1419		found = true;
1420	} for_each_vma_range(vmi, cur, end);
1421	BUG_ON(!found);
1422
1423	vma_iter_set(&vmi, start);
1424	prev = vma_prev(&vmi);
1425	if (vma->vm_start < start)
1426		prev = vma;
1427
1428	ret = 0;
1429	for_each_vma_range(vmi, vma, end) {
1430		cond_resched();
1431
1432		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1433		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1434		       vma->vm_userfaultfd_ctx.ctx != ctx);
1435		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1436
1437		/*
1438		 * Nothing to do: this vma is already registered into this
1439		 * userfaultfd and with the right tracking mode too.
1440		 */
1441		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1442		    (vma->vm_flags & vm_flags) == vm_flags)
1443			goto skip;
1444
1445		if (vma->vm_start > start)
1446			start = vma->vm_start;
1447		vma_end = min(end, vma->vm_end);
1448
1449		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1450		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1451					    new_flags,
1452					    (struct vm_userfaultfd_ctx){ctx});
1453		if (IS_ERR(vma)) {
1454			ret = PTR_ERR(vma);
1455			break;
1456		}
1457
1458		/*
1459		 * In the vma_merge() successful mprotect-like case 8:
1460		 * the next vma was merged into the current one and
1461		 * the current one has not been updated yet.
1462		 */
1463		vma_start_write(vma);
1464		userfaultfd_set_vm_flags(vma, new_flags);
1465		vma->vm_userfaultfd_ctx.ctx = ctx;
1466
1467		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1468			hugetlb_unshare_all_pmds(vma);
1469
1470	skip:
1471		prev = vma;
1472		start = vma->vm_end;
1473	}
1474
1475out_unlock:
1476	mmap_write_unlock(mm);
1477	mmput(mm);
1478	if (!ret) {
1479		__u64 ioctls_out;
1480
1481		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1482		    UFFD_API_RANGE_IOCTLS;
1483
1484		/*
1485		 * Declare the WP ioctl only if the WP mode is
1486		 * specified and all checks passed with the range
1487		 */
1488		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1489			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1490
1491		/* CONTINUE ioctl is only supported for MINOR ranges. */
1492		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1493			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1494
1495		/*
1496		 * Now that we scanned all vmas we can already tell
1497		 * userland which ioctls methods are guaranteed to
1498		 * succeed on this range.
1499		 */
1500		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1501			ret = -EFAULT;
1502	}
1503out:
1504	return ret;
1505}
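
/*
 * Illustrative userspace sketch (not part of this kernel file): the usual
 * sequence that ends up in userfaultfd_register() above.  The returned
 * uffdio_register.ioctls bitmap tells userland which range ioctls are
 * guaranteed to work, as computed at the end of that function.  give_up()
 * is a hypothetical helper; error handling is omitted.
 *
 *	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)area, .len = area_len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *
 *	ioctl(uffd, UFFDIO_API, &api);
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *	if (!(reg.ioctls & ((__u64)1 << _UFFDIO_COPY)))
 *		give_up();
 */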
1506
1507static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1508				  unsigned long arg)
1509{
1510	struct mm_struct *mm = ctx->mm;
1511	struct vm_area_struct *vma, *prev, *cur;
1512	int ret;
1513	struct uffdio_range uffdio_unregister;
1514	unsigned long new_flags;
1515	bool found;
1516	unsigned long start, end, vma_end;
1517	const void __user *buf = (void __user *)arg;
1518	struct vma_iterator vmi;
1519	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1520
1521	ret = -EFAULT;
1522	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1523		goto out;
1524
1525	ret = validate_range(mm, uffdio_unregister.start,
1526			     uffdio_unregister.len);
1527	if (ret)
1528		goto out;
1529
1530	start = uffdio_unregister.start;
1531	end = start + uffdio_unregister.len;
1532
1533	ret = -ENOMEM;
1534	if (!mmget_not_zero(mm))
1535		goto out;
1536
1537	mmap_write_lock(mm);
1538	ret = -EINVAL;
1539	vma_iter_init(&vmi, mm, start);
1540	vma = vma_find(&vmi, end);
1541	if (!vma)
1542		goto out_unlock;
1543
1544	/*
1545	 * If the first vma contains huge pages, make sure start address
1546	 * is aligned to huge page size.
1547	 */
1548	if (is_vm_hugetlb_page(vma)) {
1549		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1550
1551		if (start & (vma_hpagesize - 1))
1552			goto out_unlock;
1553	}
1554
1555	/*
1556	 * Search for incompatible vmas.
1557	 */
1558	found = false;
1559	cur = vma;
1560	do {
1561		cond_resched();
1562
1563		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1564		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
1565
1566		/*
1567		 * Check for incompatible vmas, not strictly required
1568		 * here as incompatible vmas cannot have a
1569		 * userfaultfd_ctx registered on them, but this
1570		 * provides for more strict behavior to notice
1571		 * unregistration errors.
1572		 */
1573		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
1574			goto out_unlock;
1575
1576		found = true;
1577	} for_each_vma_range(vmi, cur, end);
1578	BUG_ON(!found);
1579
1580	vma_iter_set(&vmi, start);
1581	prev = vma_prev(&vmi);
1582	if (vma->vm_start < start)
1583		prev = vma;
1584
1585	ret = 0;
1586	for_each_vma_range(vmi, vma, end) {
1587		cond_resched();
1588
1589		BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1590
1591		/*
1592		 * Nothing to do: this vma is not registered with any
1593		 * userfaultfd, so there is nothing to unregister here.
1594		 */
1595		if (!vma->vm_userfaultfd_ctx.ctx)
1596			goto skip;
1597
1598		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1599
1600		if (vma->vm_start > start)
1601			start = vma->vm_start;
1602		vma_end = min(end, vma->vm_end);
1603
1604		if (userfaultfd_missing(vma)) {
1605			/*
1606			 * Wake any concurrent pending userfault while
1607			 * we unregister, so they will not hang
1608			 * permanently and it avoids userland having to call
1609			 * UFFDIO_WAKE explicitly.
1610			 */
1611			struct userfaultfd_wake_range range;
1612			range.start = start;
1613			range.len = vma_end - start;
1614			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1615		}
1616
1617		/* Reset ptes for the whole vma range if wr-protected */
1618		if (userfaultfd_wp(vma))
1619			uffd_wp_range(vma, start, vma_end - start, false);
1620
1621		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1622		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1623					    new_flags, NULL_VM_UFFD_CTX);
1624		if (IS_ERR(vma)) {
1625			ret = PTR_ERR(vma);
1626			break;
1627		}
1628
1629		/*
1630		 * In the vma_merge() successful mprotect-like case 8:
1631		 * the next vma was merged into the current one and
1632		 * the current one has not been updated yet.
1633		 */
1634		vma_start_write(vma);
1635		userfaultfd_set_vm_flags(vma, new_flags);
1636		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1637
1638	skip:
1639		prev = vma;
1640		start = vma->vm_end;
1641	}
1642
1643out_unlock:
1644	mmap_write_unlock(mm);
1645	mmput(mm);
1646out:
1647	return ret;
1648}
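
/*
 * Illustrative userspace sketch (not part of this kernel file): the
 * matching teardown for the register example above; pending userfaults in
 * the range are woken by userfaultfd_unregister() itself.  Error handling
 * is omitted.
 *
 *	struct uffdio_range unreg = {
 *		.start = (unsigned long)area,
 *		.len   = area_len,
 *	};
 *
 *	ioctl(uffd, UFFDIO_UNREGISTER, &unreg);
 */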
1649
1650/*
1651 * userfaultfd_wake may be used in combination with the
1652 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1653 */
1654static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1655			    unsigned long arg)
1656{
1657	int ret;
1658	struct uffdio_range uffdio_wake;
1659	struct userfaultfd_wake_range range;
1660	const void __user *buf = (void __user *)arg;
1661
1662	ret = -EFAULT;
1663	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1664		goto out;
1665
1666	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1667	if (ret)
1668		goto out;
1669
1670	range.start = uffdio_wake.start;
1671	range.len = uffdio_wake.len;
1672
1673	/*
1674	 * len == 0 means wake all and we don't want to wake all here,
1675	 * so check it again to be sure.
1676	 */
1677	VM_BUG_ON(!range.len);
1678
1679	wake_userfault(ctx, &range);
1680	ret = 0;
1681
1682out:
1683	return ret;
1684}
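
/*
 * Illustrative userspace sketch (not part of this kernel file): resolving
 * several faults with UFFDIO_COPY_MODE_DONTWAKE and then waking the whole
 * range once with UFFDIO_WAKE, the batching use case mentioned above.
 * copy_one_page_dontwake() is a hypothetical helper; error handling is
 * omitted.
 *
 *	struct uffdio_range wake = {
 *		.start = (unsigned long)area,
 *		.len   = area_len,
 *	};
 *	int i;
 *
 *	for (i = 0; i < npending; i++)
 *		copy_one_page_dontwake(pending[i]);
 *	ioctl(uffd, UFFDIO_WAKE, &wake);
 */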
1685
1686static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1687			    unsigned long arg)
1688{
1689	__s64 ret;
1690	struct uffdio_copy uffdio_copy;
1691	struct uffdio_copy __user *user_uffdio_copy;
1692	struct userfaultfd_wake_range range;
1693	uffd_flags_t flags = 0;
1694
1695	user_uffdio_copy = (struct uffdio_copy __user *) arg;
1696
1697	ret = -EAGAIN;
1698	if (atomic_read(&ctx->mmap_changing))
1699		goto out;
1700
1701	ret = -EFAULT;
1702	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1703			   /* don't copy "copy" last field */
1704			   sizeof(uffdio_copy)-sizeof(__s64)))
1705		goto out;
1706
1707	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1708				       uffdio_copy.len);
1709	if (ret)
1710		goto out;
1711	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1712	if (ret)
1713		goto out;
1714
1715	ret = -EINVAL;
1716	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1717		goto out;
1718	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1719		flags |= MFILL_ATOMIC_WP;
1720	if (mmget_not_zero(ctx->mm)) {
1721		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
1722					uffdio_copy.len, flags);
1723		mmput(ctx->mm);
1724	} else {
1725		return -ESRCH;
1726	}
1727	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1728		return -EFAULT;
1729	if (ret < 0)
1730		goto out;
1731	BUG_ON(!ret);
1732	/* len == 0 would wake all */
1733	range.len = ret;
1734	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1735		range.start = uffdio_copy.dst;
1736		wake_userfault(ctx, &range);
1737	}
1738	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1739out:
1740	return ret;
1741}
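
/*
 * Illustrative userspace sketch (not part of this kernel file): a
 * UFFDIO_COPY call matching the handler above.  On a partial copy the
 * ioctl fails with EAGAIN while uffdio_copy.copy holds the bytes already
 * copied; EAGAIN with copy == 0 means mmap_changing was set and events
 * should be drained first.  retry_from() and drain_events_first() are
 * hypothetical helpers; other error handling is omitted.
 *
 *	struct uffdio_copy copy = {
 *		.dst  = fault_addr & ~(page_size - 1),
 *		.src  = (unsigned long)src_page,
 *		.len  = page_size,
 *		.mode = 0,
 *		.copy = 0,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_COPY, &copy)) {
 *		if (errno == EAGAIN && copy.copy > 0)
 *			retry_from(copy.dst + copy.copy);
 *		else if (errno == EAGAIN)
 *			drain_events_first();
 *	}
 */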
1742
1743static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1744				unsigned long arg)
1745{
1746	__s64 ret;
1747	struct uffdio_zeropage uffdio_zeropage;
1748	struct uffdio_zeropage __user *user_uffdio_zeropage;
1749	struct userfaultfd_wake_range range;
1750
1751	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1752
1753	ret = -EAGAIN;
1754	if (atomic_read(&ctx->mmap_changing))
1755		goto out;
1756
1757	ret = -EFAULT;
1758	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1759			   /* don't copy "zeropage" last field */
1760			   sizeof(uffdio_zeropage)-sizeof(__s64)))
1761		goto out;
1762
1763	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1764			     uffdio_zeropage.range.len);
1765	if (ret)
1766		goto out;
1767	ret = -EINVAL;
1768	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1769		goto out;
1770
1771	if (mmget_not_zero(ctx->mm)) {
1772		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
1773					   uffdio_zeropage.range.len);
1774		mmput(ctx->mm);
1775	} else {
1776		return -ESRCH;
1777	}
1778	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1779		return -EFAULT;
1780	if (ret < 0)
1781		goto out;
1782	/* len == 0 would wake all */
1783	BUG_ON(!ret);
1784	range.len = ret;
1785	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1786		range.start = uffdio_zeropage.range.start;
1787		wake_userfault(ctx, &range);
1788	}
1789	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1790out:
1791	return ret;
1792}
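
/*
 * Illustrative userspace sketch (not part of this kernel file): resolving
 * a missing fault with a zero-filled page instead of copying data.  Error
 * handling is omitted.
 *
 *	struct uffdio_zeropage zp = {
 *		.range = { .start = fault_addr & ~(page_size - 1),
 *			   .len   = page_size },
 *		.mode  = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 */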
1793
1794static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1795				    unsigned long arg)
1796{
1797	int ret;
1798	struct uffdio_writeprotect uffdio_wp;
1799	struct uffdio_writeprotect __user *user_uffdio_wp;
1800	struct userfaultfd_wake_range range;
1801	bool mode_wp, mode_dontwake;
1802
1803	if (atomic_read(&ctx->mmap_changing))
1804		return -EAGAIN;
1805
1806	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1807
1808	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1809			   sizeof(struct uffdio_writeprotect)))
1810		return -EFAULT;
1811
1812	ret = validate_range(ctx->mm, uffdio_wp.range.start,
1813			     uffdio_wp.range.len);
1814	if (ret)
1815		return ret;
1816
1817	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1818			       UFFDIO_WRITEPROTECT_MODE_WP))
1819		return -EINVAL;
1820
1821	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1822	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1823
1824	if (mode_wp && mode_dontwake)
1825		return -EINVAL;
1826
1827	if (mmget_not_zero(ctx->mm)) {
1828		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
1829					  uffdio_wp.range.len, mode_wp);
1830		mmput(ctx->mm);
1831	} else {
1832		return -ESRCH;
1833	}
1834
1835	if (ret)
1836		return ret;
1837
1838	if (!mode_wp && !mode_dontwake) {
1839		range.start = uffdio_wp.range.start;
1840		range.len = uffdio_wp.range.len;
1841		wake_userfault(ctx, &range);
1842	}
1843	return ret;
1844}
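
/*
 * Illustrative userspace sketch (not part of this kernel file): write
 * protecting a range and later removing the protection, which also wakes
 * any writer blocked on the wp fault unless _DONTWAKE is passed.  Error
 * handling is omitted.
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = (unsigned long)area, .len = area_len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	// arm write protection
 *	...
 *	wp.mode = 0;
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	// resolve, wakes waiters
 */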
1845
1846static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1847{
1848	__s64 ret;
1849	struct uffdio_continue uffdio_continue;
1850	struct uffdio_continue __user *user_uffdio_continue;
1851	struct userfaultfd_wake_range range;
1852	uffd_flags_t flags = 0;
1853
1854	user_uffdio_continue = (struct uffdio_continue __user *)arg;
1855
1856	ret = -EAGAIN;
1857	if (atomic_read(&ctx->mmap_changing))
1858		goto out;
1859
1860	ret = -EFAULT;
1861	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1862			   /* don't copy the output fields */
1863			   sizeof(uffdio_continue) - (sizeof(__s64))))
1864		goto out;
1865
1866	ret = validate_range(ctx->mm, uffdio_continue.range.start,
1867			     uffdio_continue.range.len);
1868	if (ret)
1869		goto out;
1870
1871	ret = -EINVAL;
1872	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1873				     UFFDIO_CONTINUE_MODE_WP))
1874		goto out;
1875	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1876		flags |= MFILL_ATOMIC_WP;
1877
1878	if (mmget_not_zero(ctx->mm)) {
1879		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
1880					    uffdio_continue.range.len, flags);
1881		mmput(ctx->mm);
1882	} else {
1883		return -ESRCH;
1884	}
1885
1886	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1887		return -EFAULT;
1888	if (ret < 0)
1889		goto out;
1890
1891	/* len == 0 would wake all */
1892	BUG_ON(!ret);
1893	range.len = ret;
1894	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1895		range.start = uffdio_continue.range.start;
1896		wake_userfault(ctx, &range);
1897	}
1898	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1899
1900out:
1901	return ret;
1902}
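
/*
 * Illustrative userspace sketch (not part of this kernel file): resolving
 * a minor fault (page cache already populated, PTE missing) as handled
 * above.  Error handling is omitted.
 *
 *	struct uffdio_continue cont = {
 *		.range = { .start = fault_addr & ~(page_size - 1),
 *			   .len   = page_size },
 *		.mode  = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_CONTINUE, &cont);
 */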
1903
static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_poison uffdio_poison;
	struct uffdio_poison __user *user_uffdio_poison;
	struct userfaultfd_wake_range range;

	user_uffdio_poison = (struct uffdio_poison __user *)arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
			   /* don't copy the output fields */
			   sizeof(uffdio_poison) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_poison.range.start,
			     uffdio_poison.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
					  uffdio_poison.range.len, 0);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
		range.start = uffdio_poison.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
	return ret;
}

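/* True when UFFD_FEATURE_WP_ASYNC is enabled on the vma's uffd context. */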
bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

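/*
 * UFFDIO_MOVE: move (remap) pages from src to dst within the same mm,
 * avoiding a copy.  Cross-mm moves are rejected with -EINVAL; the
 * number of bytes moved is reported back in uffdio_move.move.
 *
 * Illustrative userspace sketch (field names as in the uffdio_move
 * UAPI, error handling omitted):
 *
 *	struct uffdio_move mv = {
 *		.dst = dst_addr, .src = src_addr, .len = len,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_MOVE, &mv);
 */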
static int userfaultfd_move(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_move uffdio_move;
	struct uffdio_move __user *user_uffdio_move;
	struct userfaultfd_wake_range range;
	struct mm_struct *mm = ctx->mm;

	user_uffdio_move = (struct uffdio_move __user *) arg;

	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	if (copy_from_user(&uffdio_move, user_uffdio_move,
			   /* don't copy "move" last field */
			   sizeof(uffdio_move)-sizeof(__s64)))
		return -EFAULT;

	/* Do not allow cross-mm moves. */
	if (mm != current->mm)
		return -EINVAL;

	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
	if (ret)
		return ret;

	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
	if (ret)
		return ret;

	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
				  UFFDIO_MOVE_MODE_DONTWAKE))
		return -EINVAL;

	if (mmget_not_zero(mm)) {
		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
				 uffdio_move.len, uffdio_move.mode);
		mmput(mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_move->move)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	VM_WARN_ON(!ret);
	range.len = ret;
	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
		range.start = uffdio_move.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
	return ret;
}

/*
 * Userland asks for a certain API version and we return which feature
 * bits and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
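/*
 * Sketch of the handshake from the userspace side (not part of this
 * file): the caller fills uffdio_api.api with UFFD_API plus the feature
 * bits it wants, and reads back what the kernel supports:
 *
 *	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
 *	ioctl(uffd, UFFDIO_API, &api);
 *	// api.features / api.ioctls now describe what is available
 */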
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;

	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
	if (features & UFFD_FEATURE_WP_ASYNC)
		features |= UFFD_FEATURE_WP_UNPOPULATED;

	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}

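/*
 * ioctl dispatcher.  Apart from UFFDIO_API itself, every command is
 * rejected until the UFFDIO_API handshake has marked the context as
 * initialized (UFFD_FEATURE_INITIALIZED).
 */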
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch (cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_MOVE:
		ret = userfaultfd_move(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	case UFFDIO_POISON:
		ret = userfaultfd_poison(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
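/*
 * /proc/<pid>/fdinfo/<fd> output for a userfaultfd looks roughly like
 * the following (values are illustrative; the format string is below):
 *
 *	pending:	0
 *	total:		0
 *	API:		aa:<features>:<ioctl mask>
 */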
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added later, they will all be shown
	 * separated by a space, like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

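/*
 * Allocate a new userfaultfd context and return a file descriptor for
 * it.  This is the common backend for both the userfaultfd() syscall
 * and the /dev/userfaultfd USERFAULTFD_IOC_NEW ioctl below.
 */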
static int new_userfaultfd(int flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	init_rwsem(&ctx->map_changing_lock);
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	/* Create a new inode so that the LSM can block the creation.  */
	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static inline bool userfaultfd_syscall_allowed(int flags)
{
	/* Userspace-only page faults are always allowed */
	if (flags & UFFD_USER_MODE_ONLY)
		return true;

	/*
	 * The user is requesting a userfaultfd which can handle kernel faults.
	 * Privileged users are always allowed to do this.
	 */
	if (capable(CAP_SYS_PTRACE))
		return true;

	/* Otherwise, access to kernel fault handling is sysctl controlled. */
	return sysctl_unprivileged_userfaultfd;
}

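/*
 * Illustrative userspace invocation (a sketch; the syscall is typically
 * issued via syscall(2) rather than a libc wrapper):
 *
 *	int uffd = syscall(__NR_userfaultfd,
 *			   O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 */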
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	if (!userfaultfd_syscall_allowed(flags))
		return -EPERM;

	return new_userfaultfd(flags);
}

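/*
 * /dev/userfaultfd path: access is controlled by permissions on the
 * device node rather than by the sysctl above.  A sketch of the
 * userspace side (open flags are illustrative):
 *
 *	int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *	int uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
 */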
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
	if (cmd != USERFAULTFD_IOC_NEW)
		return -EINVAL;

	return new_userfaultfd(flags);
}

static const struct file_operations userfaultfd_dev_fops = {
	.unlocked_ioctl = userfaultfd_dev_ioctl,
	.compat_ioctl = userfaultfd_dev_ioctl,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice userfaultfd_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "userfaultfd",
	.fops = &userfaultfd_dev_fops
};

static int __init userfaultfd_init(void)
{
	int ret;

	ret = misc_register(&userfaultfd_misc);
	if (ret)
		return ret;

	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_userfaultfd_table);
#endif
	return 0;
}
__initcall(userfaultfd_init);